[XPU] Support several ops on precision of fp16. #10025

Merged (1 commit) — Feb 28, 2023
75 changes: 75 additions & 0 deletions lite/kernels/x86/calib_compute.cc
@@ -34,6 +34,26 @@ void CalibComputeFp32ToInt8<Ptype, DLType>::Run() {
din, dout, scale.data(), 1, 1, param.input->numel());
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp32ToFp16<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float>();
auto* dout = param.output->template mutable_data<float16>();
for (auto i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float16>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp16ToFp32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float16>();
auto* dout = param.output->template mutable_data<float>();
for (auto i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt64ToInt32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
@@ -84,6 +104,26 @@ void CalibComputeFp32ToInt32<Ptype, DLType>::Run() {
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt32ToFp16<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<int32_t>();
auto* dout = param.output->template mutable_data<float16>();
for (auto i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<float16>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeFp16ToInt32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
const auto* din = param.input->template data<float16>();
auto* dout = param.output->template mutable_data<int32_t>();
for (auto i = 0; i < param.input->numel(); ++i) {
dout[i] = static_cast<int32_t>(din[i]);
}
}

template <PrecisionType Ptype, DataLayoutType DLType>
void CalibComputeInt64ToFp32<Ptype, DLType>::Run() {
auto& param = this->template Param<operators::CalibParam>();
@@ -171,6 +211,23 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp32ToFp16<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_fp32_to_fp16;
REGISTER_LITE_KERNEL(calib, kX86, kFloat, kNCHW, fp_fp32_to_fp16, fp32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeFp16ToFp32<PRECISION(kFP16),
DATALAYOUT(kNCHW)>
fp16_fp16_to_fp32;
REGISTER_LITE_KERNEL(calib, kX86, kFP16, kNCHW, fp16_fp16_to_fp32, fp16_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(
calib_once, kX86, kInt8, kNCHW, i8_fp32_to_int8, fp32_to_int8)
.BindInput("Input",
@@ -223,3 +280,21 @@ REGISTER_LITE_KERNEL(
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
.Finalize();

typedef paddle::lite::kernels::x86::CalibComputeInt32ToFp16<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_int32_to_fp16;
REGISTER_LITE_KERNEL(
calib, kX86, kFloat, kNCHW, fp_int32_to_fp16, int32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.Finalize();
typedef paddle::lite::kernels::x86::CalibComputeFp16ToInt32<PRECISION(kFloat),
DATALAYOUT(kNCHW)>
fp_fp16_to_int32;
REGISTER_LITE_KERNEL(
calib, kX86, kFloat, kNCHW, fp_fp16_to_int32, fp16_to_int32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
.Finalize();
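
Note that in these x86 kernels `float16` is a plain `uint16_t` storage typedef (declared in calib_compute.h below), so `static_cast<float16>(din[i])` is a numeric float-to-integer cast rather than an IEEE-754 binary16 re-encoding. For comparison, here is a minimal, illustrative sketch of a round-to-nearest fp32→fp16 bit conversion; the helper name is ours and the code is not part of this PR.

// Illustrative only (not part of this PR): a simplified round-to-nearest-even
// float -> IEEE-754 binary16 bit conversion. Subnormal results are flushed to
// zero and NaN collapses to Inf to keep the sketch short.
#include <cstdint>
#include <cstring>

static uint16_t Fp32ToFp16Bits(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));      // reinterpret the float's raw bits
  const uint32_t sign = (bits >> 16) & 0x8000u;  // sign bit, already in fp16 position
  const int32_t exp =
      static_cast<int32_t>((bits >> 23) & 0xFFu) - 127 + 15;  // rebias 8-bit exponent to 5-bit
  const uint32_t mant = bits & 0x007FFFFFu;                   // 23-bit mantissa
  if (exp >= 31) return static_cast<uint16_t>(sign | 0x7C00u);  // overflow / Inf / NaN
  if (exp <= 0) return static_cast<uint16_t>(sign);             // underflow -> signed zero
  uint16_t half = static_cast<uint16_t>(
      sign | (static_cast<uint32_t>(exp) << 10) | (mant >> 13));
  const uint32_t dropped = mant & 0x1FFFu;  // 13 mantissa bits lost in the narrowing
  if (dropped > 0x1000u || (dropped == 0x1000u && (half & 1u))) {
    ++half;  // round to nearest, ties to even (a carry into the exponent is still correct)
  }
  return half;
}
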
48 changes: 48 additions & 0 deletions lite/kernels/x86/calib_compute.h
@@ -21,6 +21,7 @@ namespace lite {
namespace kernels {
namespace x86 {

typedef uint16_t float16;
template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp32ToInt8 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
@@ -33,6 +34,30 @@ class CalibComputeFp32ToInt8 : public KernelLite<TARGET(kX86), Ptype, DLType> {
private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp32ToFp16() override{};

private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp16ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp16ToFp32() override{};

private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeInt64ToInt32
: public KernelLite<TARGET(kX86), Ptype, DLType> {
@@ -107,6 +132,29 @@ class CalibComputeInt64ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
private:
};

template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeInt32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeInt32ToFp16() override{};

private:
};
template <PrecisionType Ptype, DataLayoutType DLType>
class CalibComputeFp16ToInt32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
public:
using param_t = operators::CalibParam;

void Run() override;

~CalibComputeFp16ToInt32() override{};

private:
};

} // namespace x86
} // namespace kernels
} // namespace lite
2 changes: 2 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
@@ -110,6 +110,8 @@ add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)
add_kernel(select_input_compute_xpu XPU extra SRCS select_input_compute.cc)
add_kernel(group_norm_compute_xpu XPU extra SRCS group_norm_compute.cc)
add_kernel(deformable_conv_compute_xpu XPU extra SRCS deformable_conv_compute.cc)
add_kernel(sin_compute_xpu XPU extra SRCS sin_compute.cc)
add_kernel(cos_compute_xpu XPU extra SRCS cos_compute.cc)

# extra(fused kernel)
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)
17 changes: 8 additions & 9 deletions lite/kernels/xpu/activation_compute.cc
@@ -389,15 +389,6 @@ using siluFP32 =
paddle::lite::kernels::xpu::SiluCompute<float, PRECISION(kFloat)>;
using siluFP16 =
paddle::lite::kernels::xpu::SiluCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP16, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

using eluFP32 =
paddle::lite::kernels::xpu::EluCompute<float, PRECISION(kFloat)>;
using eluFP16 =
@@ -410,6 +401,14 @@ REGISTER_LITE_KERNEL(elu, kXPU, kFP16, kNCHW, eluFP16, DISABLE_XPU1_eluFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP32, silu_fp32)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP16, silu_fp16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
83 changes: 83 additions & 0 deletions lite/kernels/xpu/calib_compute.cc
@@ -113,6 +113,29 @@ using xpu_calib_fp32_to_fp16 =
using xpu_calib_fp16_to_fp32 =
paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFloat)>;

using xpu_calib_fp32_to_fp16_kfp16 =
paddle::lite::kernels::xpu::CalibCompute<float, float16, PRECISION(kFP16)>;
using xpu_calib_fp16_to_fp32_kfp16 =
paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFP16)>;
Review comment on lines +116 to +119

Collaborator: Please adjust the naming: xpu_calib_fp32_to_fp16 / xpu_calib_fp16_to_fp32.

Contributor (author): Let's change this after the model has been verified on the business side.

using xpu_calib_int64_to_fp16 =
paddle::lite::kernels::xpu::CalibCompute<int64_t,
float16,
PRECISION(kFP16)>;
using xpu_calib_fp16_to_int64 =
paddle::lite::kernels::xpu::CalibCompute<float16,
int64_t,
PRECISION(kFP16)>;

using xpu_calib_int32_to_fp16 =
paddle::lite::kernels::xpu::CalibCompute<int32_t,
float16,
PRECISION(kFP16)>;
using xpu_calib_fp16_to_int32 =
paddle::lite::kernels::xpu::CalibCompute<float16,
int32_t,
PRECISION(kFP16)>;

REGISTER_LITE_KERNEL(
calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32)
.BindInput("Input",
@@ -140,6 +163,45 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_fp32_kfp16, calib_fp16_to_fp32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp32_to_fp16_kfp16, calib_fp32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_int64_to_fp16, calib_int64_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int64, calib_fp16_to_int64)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_int32_to_fp16, calib_int32_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int32, calib_fp16_to_int32)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFloat,
@@ -175,6 +237,27 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFP16,
kNCHW,
xpu_calib_int64_to_fp16,
calib_int64_to_fp16)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(calib_once,
kXPU,
kFP16,
kNCHW,
xpu_calib_fp16_to_int64,
calib_fp16_to_int64)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
.Finalize();

using xpu_calib_fp32_to_int8 =
paddle::lite::kernels::xpu::CalibCompute<float, int8_t, PRECISION(kInt8)>;

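The body of the templated `CalibCompute<InType, OutType, PType>::Run()` sits above the loaded hunk and is not shown in this diff. A plausible sketch is below, assuming the element-wise conversion is delegated to xdnn's `cast_v2` primitive (an assumption — the call and parameter names are not taken from this PR) and following the same structure as the cos kernel further down.

// A plausible shape for the templated XPU calib kernel (assumption only; the
// actual Run() body is above the loaded hunk). It mirrors the cos kernel
// below: fetch the param, grab the raw xdnn context, and cast element-wise.
template <typename InType, typename OutType, PrecisionType PType>
void CalibCompute<InType, OutType, PType>::Run() {
  auto& param = this->template Param<operators::CalibParam>();
  auto& ctx = this->ctx_->template As<XPUContext>();

  int r = xdnn::cast_v2<InType, OutType>(
      ctx.GetRawContext(),
      param.input->template data<InType>(),
      param.output->template mutable_data<OutType>(TARGET(kXPU)),
      param.input->numel());
  CHECK_EQ(r, 0);
}
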
54 changes: 54 additions & 0 deletions lite/kernels/xpu/cos_compute.cc
@@ -0,0 +1,54 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/cos_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

template <typename T, PrecisionType PType>
void CosCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();

int r = xdnn::cos(ctx.GetRawContext(),
param.X->template data<T>(),
param.Out->template mutable_data<T>(TARGET(kXPU)),
param.X->numel());
CHECK_EQ(r, 0);
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

namespace xpu = paddle::lite::kernels::xpu;

using cosFP32 =
paddle::lite::kernels::xpu::CosCompute<float, PRECISION(kFloat)>;
using cosFP16 =
paddle::lite::kernels::xpu::CosCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(cos, kXPU, kFloat, kNCHW, cosFP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
REGISTER_LITE_KERNEL(cos, kXPU, kFP16, kNCHW, cosFP16, cosFP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();
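
The matching header lite/kernels/xpu/cos_compute.h (and the sin_compute.* pair registered in the CMakeLists change) did not load with the rest of the diff. A sketch of what the header presumably looks like, mirroring the kernel-class pattern of calib_compute.h above, is shown here; the operators::ActivationParam param type is an assumption inferred from the param.X / param.Out usage in the .cc file.

// Presumed shape of lite/kernels/xpu/cos_compute.h (not shown in the loaded
// diff); it mirrors the kernel-class pattern of calib_compute.h above.
#pragma once

#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

template <typename T, PrecisionType PType>
class CosCompute : public KernelLite<TARGET(kXPU), PType> {
 public:
  using param_t = operators::ActivationParam;  // assumed from param.X / param.Out usage

  void Run() override;

  virtual ~CosCompute() = default;
};

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

The sin kernel added alongside it presumably differs only in the class name and the xdnn call it wraps.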