cherry-pick from develop to support models (#8199)
* [XPU] change match_matrix_tensor op from old version to refactor version (#7012)

* [XPU] change sequence concat op from old version to refactor version (#6847)

* [XPU] change sequence_reverse api from old version to refactor version (#6798)

* [XPU] fix some bugs for transformer (#7014)

* [XPU] Mul quant (#6850)

* [XPU] bugfix on fc max (#7152)

* expand_v2 supports dynamic shape (#7116)

* [XPU] change search_noaligned_mat_mul op to fc_batched_vsl op (#7081)

* [xpu] support qkv-fused weight reuse in scs_tran_match (#7293)

* [XPU] Scs trans match (#7307)

* [XPU] add lod_array_length; argmax support int32 (#7314)

* [XPU] change fc_int16 op to fc_fusion (#7029)

* [XPU] super big ernie support (#7184)

* [XPU] free default workspace of xpu_ctx before setting up a new gm workspace (#7422)

* [XPU] use get_max_ptr function (#7482)

* [xpu] Fc int31 (#7514)

* [xpu] fix continuous encoder fuse and fc max size

* [xpu] refactor fc int31 for KL2

* use get_max_ptr function (#7529)

* add activation xpu gelu (#7527)

* [xpu] more check with multi_encoder pass (#7593)

* [XPU]use get_max_ptr_size in search attention op (#7528)

* [XPU]use get_max_ptr_size in bigru op (#7498)

* [XPU] change sequence_topk_avg_pooling op from old version to refactor version (#7411)

* update xpu api sequence_unpad (#7640)

* update xpu api l2_norm (#7724)

* [XPU] __xpu__resnet_fuse_pass should not match ResNeXt50 (#7824)

* [xpu] support encoder mul shape without equal length and more check (#7753)

* [XPU] use new search_varconv (#7865)

* [XPU] use new sequence_topk_avg_pooling (#7834)

* [XPU] fix xpu memory leak bug for arm arch (#8010)

* [XPU] use new op in mmdnn (#7998)

* [XPU] fix xpu l2_norm bug (#7983)

* [XPU] grnn_cell op in mmdnn (#8139)

* [XPU] use new concat in mmdnn (#8184)

* [XPU] build 2.10 depending on new xpu_sdk_url: xdnn 2.3.0 and xre 4.0.7.1
newway authored Jan 17, 2022
1 parent 685c9fc commit bb2edd6
Showing 52 changed files with 1,511 additions and 1,156 deletions.
2 changes: 1 addition & 1 deletion cmake/device/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
-set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20211022")
+set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/klx-sdk/search/20210107")
endif ()

if (NOT XPU_SDK_ENV)
10 changes: 10 additions & 0 deletions lite/api/paddle_api.cc
@@ -547,6 +547,16 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
#endif
}

+void CxxConfig::set_xpu_gm_workspace_method(size_t gm_size) {
+#ifdef LITE_WITH_XPU
+  lite::TargetWrapperXPU::local_gm_size = gm_size;
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_gm_workspace_method' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}

void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::SetDev(dev_no);
2 changes: 2 additions & 0 deletions lite/api/paddle_api.h
@@ -408,6 +408,8 @@ class LITE_API CxxConfig : public ConfigBase {
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0x4000000);
void set_xpu_l3_cache_method(size_t l3_size, bool locked = false);

+  void set_xpu_gm_workspace_method(size_t gm_size);

void set_xpu_conv_autotune(bool autotune = true,
const std::string& autotune_file = "");

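The two hunks above expose a per-thread XPU global-memory (GM) workspace size as a public config knob. A minimal usage sketch, assuming the standard Paddle-Lite CxxConfig flow; the model paths are hypothetical, and the 64 MB value simply mirrors the default introduced in target_wrapper.cc below:

```cpp
#include "paddle_api.h"

int main() {
  // Hypothetical model paths; any CxxConfig-loadable model works.
  paddle::lite_api::CxxConfig config;
  config.set_model_file("model.pdmodel");
  config.set_param_file("model.pdiparams");

  // Reserve a 64 MB GM workspace for this thread's XPU context.
  // This should run before the predictor is created, since the
  // workspace is claimed when the xdnn context is first set up.
  config.set_xpu_gm_workspace_method(64 * 1024 * 1024);

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  return 0;
}
```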
5 changes: 4 additions & 1 deletion lite/backends/xpu/target_wrapper.cc
@@ -168,7 +168,8 @@ void TargetWrapperXPU::FreeL3Cache() {
}

// xpu context
-LITE_THREAD_LOCAL xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr};
+LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> TargetWrapperXPU::tls_raw_ctx_{
+    nullptr};
// multi encoder config
LITE_THREAD_LOCAL std::string
TargetWrapperXPU::multi_encoder_precision; // NOLINT
@@ -180,6 +181,8 @@ LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
std::numeric_limits<size_t>::max()};
+LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_gm_size{
+    0x4000000};  // 64 * 1024 * 1024
LITE_THREAD_LOCAL void* TargetWrapperXPU::local_l3_ptr_{nullptr};
void* TargetWrapperXPU::shared_l3_ptr_{nullptr};
size_t TargetWrapperXPU::shared_l3_size{0};
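Note the ownership change in this file: `tls_raw_ctx_` moves from a raw `xdnn::Context*` to a `std::shared_ptr` constructed with `xdnn::destroy_context` as its deleter, so the context is torn down automatically when the thread-local pointer is reset or destroyed. A self-contained sketch of the same idiom, with stand-in create/destroy functions rather than the real xdnn API:

```cpp
#include <iostream>
#include <memory>

// Stand-ins for a C-style create/destroy pair like xdnn's.
struct Context { int id; };
Context* create_context() { return new Context{0}; }
void destroy_context(Context* ctx) {
  std::cout << "context destroyed\n";
  delete ctx;
}

int main() {
  // The second constructor argument is the custom deleter; it runs
  // exactly once, when the last shared_ptr owner goes away.
  std::shared_ptr<Context> ctx(create_context(), destroy_context);
  std::cout << "context in use, id = " << ctx->id << "\n";
  ctx.reset();  // destroy_context(raw pointer) fires here
  return 0;
}
```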
37 changes: 29 additions & 8 deletions lite/backends/xpu/target_wrapper.h
@@ -38,10 +38,10 @@ namespace lite {

// MAX(lod.size()) = 32
const int XPU_MAX_LOD_SIZE = 32;
+// MAX(lod.size()) = 64 in XPU refactor
+const int XPU_MAX_LOD_SIZE_64 = 64;
// MAX(lod[i + 1] - lod[i]) = 512
const int XPU_MAX_LOD_SEQ_LEN = 512;
-// QUANT SCALE NUM == XPU CDNN NUM
-const int XPU_QUANT_SCALE_NUM = 6;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

@@ -76,9 +76,9 @@ class TargetWrapper<TARGET(kXPU)> {
static XPUScratchPadGuard MallocScratchPad(size_t size);

static xdnn::Context* GetRawContext() {
-    if (tls_raw_ctx_ == nullptr) {
-      tls_raw_ctx_ = xdnn::create_context();
-      CHECK(tls_raw_ctx_);
+    if (tls_raw_ctx_.get() == nullptr) {
+      tls_raw_ctx_.reset(xdnn::create_context(), xdnn::destroy_context);
+      CHECK(tls_raw_ctx_.get());
if (l3_planner_ == nullptr) {
l3_planner_ = new XPUL3Planner;
}
@@ -100,8 +100,28 @@ class TargetWrapper<TARGET(kXPU)> {
local_l3_size = max_l3_size;
}
CHECK_LE(shared_l3_size, max_l3_size);
+      if (local_gm_size > 0) {
+        VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
+        void* local_gm_ptr = nullptr;
+        int ret =
+            xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
+        if (ret != 0 || local_gm_ptr == nullptr) {
+          VLOG(3) << "No Enough GM Workspace For Current Predictor.";
+        } else {
+          void* old_ptr = tls_raw_ctx_->_gm_mgr.get_ptr();
+          if (old_ptr != nullptr) {
+            TargetWrapperXPU::Free(old_ptr);
+          }
+          ret = tls_raw_ctx_->_gm_mgr.set(local_gm_ptr, local_gm_size);
+          if (ret != 0) {
+            LOG(WARNING) << "XPU GM Mgr Init Fail, Please Check Configuration.";
+            TargetWrapperXPU::Free(local_gm_ptr);
+            local_gm_ptr = nullptr;
+          }
+        }
+      }
}
-    return tls_raw_ctx_;
+    return tls_raw_ctx_.get();
}
static void MallocL3Cache(
const std::vector<std::vector<int64_t>>& query_shape);
@@ -131,7 +151,8 @@ class TargetWrapper<TARGET(kXPU)> {
// l3 cache config
static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size
static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size
-  static size_t shared_l3_size;  // model level l3 size
+  static LITE_THREAD_LOCAL size_t local_gm_size;
+  static size_t shared_l3_size;  // model level l3 size
static LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*>
l3_block_dict; // l3 cache block used between op layers

@@ -140,7 +161,7 @@ class TargetWrapper<TARGET(kXPU)> {
void* l3_ptr,
size_t l3_size,
const std::vector<std::vector<int64_t>>& query_shape);
-  static LITE_THREAD_LOCAL xdnn::Context* tls_raw_ctx_;
+  static LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> tls_raw_ctx_;
static LITE_THREAD_LOCAL void* local_l3_ptr_;
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
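The GM-workspace block added to `GetRawContext()` follows a try-or-fall-back policy: attempt to allocate a dedicated buffer, keep the context's existing workspace if the device is out of memory, and only after a successful allocation release the previously installed buffer. A condensed sketch of that control flow, using plain `malloc`/`free` as stand-ins for `xpu_malloc` and `TargetWrapperXPU::Free`:

```cpp
#include <cstdlib>
#include <iostream>

// Stand-in for the context's GM manager state.
static void* g_workspace = nullptr;

void install_workspace(size_t bytes) {
  void* fresh = std::malloc(bytes);  // xpu_malloc in the real code
  if (fresh == nullptr) {
    // Allocation failed: keep whatever workspace is already installed.
    std::cout << "no dedicated workspace, keeping the default\n";
    return;
  }
  // Allocation succeeded: release the old buffer before swapping in the
  // new one, mirroring the Free(old_ptr) / _gm_mgr.set() sequence above.
  if (g_workspace != nullptr) std::free(g_workspace);
  g_workspace = fresh;
  std::cout << "installed " << bytes << "-byte workspace\n";
}

int main() {
  install_workspace(0x4000000);  // 64 MiB, the default local_gm_size
  return 0;
}
```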
9 changes: 9 additions & 0 deletions lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc
@@ -91,6 +91,15 @@ class XPUFcFuser : public FuseBase {
} else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" ||
lite::TargetWrapperXPU::multi_encoder_precision == "int8") {
precision = "int8";
+      if (op_desc.HasAttr("enable_int8") &&
+          op_desc.GetAttr<bool>("enable_int8")) {
+        CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale";
+        CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale";
+        VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:"
+                << 127 * op_desc.GetAttr<std::vector<float>>("X0_scale")[0]
+                << ", WeightMax: "
+                << 127 * op_desc.GetAttr<std::vector<float>>("Y0_scale")[0];
+      }
VLOG(3) << "Use int8 in XPUFcOp";
}
#endif
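The scale attributes logged above follow the usual Paddle quantization convention: a per-tensor int8 scale such that the largest representable magnitude is `127 * scale`. A worked example with made-up scale values:

```cpp
#include <iostream>
#include <vector>

int main() {
  // Hypothetical quant scales as they would appear in X0_scale / Y0_scale.
  std::vector<float> x0_scale{0.05f};  // activation scale
  std::vector<float> y0_scale{0.02f};  // weight scale

  // Largest magnitude an int8 value can represent under each scale.
  float input_max = 127 * x0_scale[0];   // 6.35
  float weight_max = 127 * y0_scale[0];  // 2.54

  std::cout << "InputMax: " << input_max
            << ", WeightMax: " << weight_max << "\n";
  return 0;
}
```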
@@ -59,10 +59,17 @@ class XPULinkConvMaxFuser : public FuseBase {
public:
explicit XPULinkConvMaxFuser(bool with_branch) { with_branch_ = with_branch; }
void BuildPattern() override {
+    auto non_quant_teller = [](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      return (!op_desc.HasAttr("enable_int8") ||
+              !op_desc.GetAttr<bool>("enable_int8"));
+    };

auto* input =
VarNode("input")->assert_is_op_input("__xpu__conv2d", "Input");
auto* xpu_fusion_op =
OpNode("xpu_fusion_op", "__xpu__conv2d")
+            ->assert_node_satisfied(non_quant_teller)
->assert_op_attr<bool>("has_branch", with_branch_);

PMNode* branch = nullptr;
@@ -100,8 +107,14 @@ class XPULinkConvMaxFuser : public FuseBase {
class XPULinkFcMaxFuser : public FuseBase {
public:
void BuildPattern() override {
+    auto non_quant_teller = [](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      return (!op_desc.HasAttr("enable_int8") ||
+              !op_desc.GetAttr<bool>("enable_int8"));
+    };
auto* input = VarNode("input")->assert_is_op_input("__xpu__fc", "Input");
-    auto* xpu_fusion_op = OpNode("xpu_fusion_op", "__xpu__fc");
+    auto* xpu_fusion_op = OpNode("xpu_fusion_op", "__xpu__fc")
+                              ->assert_node_satisfied(non_quant_teller);

*input >> *xpu_fusion_op;
}
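Both fusers now gate the pattern with the same `non_quant_teller`: a node matches only if it lacks `enable_int8` or carries it as false, so quantized `__xpu__conv2d`/`__xpu__fc` ops are excluded from the link-max fusion. A standalone sketch of that predicate's logic, with a plain attribute map standing in for the pass framework's `OpInfo`:

```cpp
#include <iostream>
#include <map>
#include <string>

// A node is "non-quant" unless it explicitly sets enable_int8 = true;
// a missing attribute counts as non-quant, exactly like the teller.
bool is_non_quant(const std::map<std::string, bool>& attrs) {
  auto it = attrs.find("enable_int8");
  return it == attrs.end() || !it->second;
}

int main() {
  std::cout << is_non_quant({}) << "\n";                        // 1: no attr
  std::cout << is_non_quant({{"enable_int8", false}}) << "\n";  // 1: matches
  std::cout << is_non_quant({{"enable_int8", true}}) << "\n";   // 0: skipped
  return 0;
}
```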