cherry-pick from develop to support models (#8199)
* [XPU] change match_matrix_tensor op from old version to refactor version (#7012)

* [XPU] change sequence concat op from old version to refactor version (#6847)

* [XPU] change sequence_reverse api from old version to refactor version (#6798)

* [XPU] fix some bugs for transformer (#7014)

* [XPU] Mul quant (#6850)

* [XPU] bugfix on fc max (#7152)

* expand_v2 supports dynamic shape (#7116)

* [XPU] change search_noaligned_mat_mul op to fc_batched_vsl op (#7081)

* [xpu] support qkv-fused weight reuse in scs_tran_match (#7293)

* [XPU] Scs trans match (#7307)

* [XPU] add lod_array_length; argmax support int32 (#7314)

* [XPU] change fc_int16 op to fc_fusion (#7029)

* [XPU] super big ernie support (#7184)

* [XPU] free default workspace of xpu_ctx before setting up a new gm workspace (#7422)

* [XPU] use get_max_ptr function (#7482)

* [xpu] Fc int31 (#7514)

* [xpu] fix continuous encoder fuse and fc max size

* [xpu] refactor fc int31 for KL2

* use get_max_ptr function (#7529)

* add activation xpu gelu (#7527)

* [xpu] more check with multi_encoder pass (#7593)

* [XPU]use get_max_ptr_size in search attention op (#7528)

* [XPU]use get_max_ptr_size in bigru op (#7498)

* [XPU] change sequence_topk_avg_pooling op from old version to refactor version (#7411)

* update xpu api sequence_unpad (#7640)

* update xpu api l2_norm (#7724)

* [XPU] __xpu__resnet_fuse_pass should not match ResNeXt50 (#7824)

* [xpu] support encoder mul shape without equal length and more check (#7753)

* [XPU] use new search_varconv (#7865)

* [XPU] use new sequence_topk_avg_pooling (#7834)

* [XPU] fix xpu memory leak bug for arm arch (#8010)

* [XPU] use new op in mmdnn (#7998)

* [XPU] fix xpu l2_norm bug (#7983)

* [XPU] grnn_cell op in mmdnn (#8139)

* [XPU] use new concat in mmdnn (#8184)

* [XPU] build 2.10 depending on new xpu_sdk_url: xdnn 2.3.0 and xre 4.0.7.1
newway authored Jan 17, 2022
1 parent 685c9fc commit bb2edd6
Showing 52 changed files with 1,511 additions and 1,156 deletions.
2 changes: 1 addition & 1 deletion cmake/device/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
-set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20211022")
+set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/klx-sdk/search/20210107")
endif ()

if (NOT XPU_SDK_ENV)
10 changes: 10 additions & 0 deletions lite/api/paddle_api.cc
@@ -547,6 +547,16 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
#endif
}

+void CxxConfig::set_xpu_gm_workspace_method(size_t gm_size) {
+#ifdef LITE_WITH_XPU
+  lite::TargetWrapperXPU::local_gm_size = gm_size;
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_gm_workspace_method' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}

void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::SetDev(dev_no);
2 changes: 2 additions & 0 deletions lite/api/paddle_api.h
@@ -408,6 +408,8 @@ class LITE_API CxxConfig : public ConfigBase {
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0x4000000);
void set_xpu_l3_cache_method(size_t l3_size, bool locked = false);

+  void set_xpu_gm_workspace_method(size_t gm_size);

void set_xpu_conv_autotune(bool autotune = true,
const std::string& autotune_file = "");

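The two hunks above expose a per-thread XPU global-memory (GM) workspace size as a public config knob. A minimal usage sketch, assuming the standard Paddle-Lite CxxConfig flow; the model paths are hypothetical, and the 64 MB value simply mirrors the default introduced in target_wrapper.cc below:

```cpp
#include "paddle_api.h"

int main() {
  // Hypothetical model paths; any CxxConfig-loadable model works.
  paddle::lite_api::CxxConfig config;
  config.set_model_file("model.pdmodel");
  config.set_param_file("model.pdiparams");

  // Reserve a 64 MB GM workspace for this thread's XPU context.
  // This should run before the predictor is created, since the
  // workspace is claimed when the xdnn context is first set up.
  config.set_xpu_gm_workspace_method(64 * 1024 * 1024);

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  return 0;
}
```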
5 changes: 4 additions & 1 deletion lite/backends/xpu/target_wrapper.cc
@@ -168,7 +168,8 @@ void TargetWrapperXPU::FreeL3Cache() {
}

// xpu context
-LITE_THREAD_LOCAL xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr};
+LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> TargetWrapperXPU::tls_raw_ctx_{
+    nullptr};
// multi encoder config
LITE_THREAD_LOCAL std::string
TargetWrapperXPU::multi_encoder_precision; // NOLINT
@@ -180,6 +181,8 @@ LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
std::numeric_limits<size_t>::max()};
+LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_gm_size{
+    0x4000000};  // 64 * 1024 * 1024
LITE_THREAD_LOCAL void* TargetWrapperXPU::local_l3_ptr_{nullptr};
void* TargetWrapperXPU::shared_l3_ptr_{nullptr};
size_t TargetWrapperXPU::shared_l3_size{0};
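Note the ownership change in this file: `tls_raw_ctx_` moves from a raw `xdnn::Context*` to a `std::shared_ptr` constructed with `xdnn::destroy_context` as its deleter, so the context is torn down automatically when the thread-local pointer is reset or destroyed. A self-contained sketch of the same idiom, with stand-in create/destroy functions rather than the real xdnn API:

```cpp
#include <iostream>
#include <memory>

// Stand-ins for a C-style create/destroy pair like xdnn's.
struct Context { int id; };
Context* create_context() { return new Context{0}; }
void destroy_context(Context* ctx) {
  std::cout << "context destroyed\n";
  delete ctx;
}

int main() {
  // The second constructor argument is the custom deleter; it runs
  // exactly once, when the last shared_ptr owner goes away.
  std::shared_ptr<Context> ctx(create_context(), destroy_context);
  std::cout << "context in use, id = " << ctx->id << "\n";
  ctx.reset();  // destroy_context(raw pointer) fires here
  return 0;
}
```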
37 changes: 29 additions & 8 deletions lite/backends/xpu/target_wrapper.h
@@ -38,10 +38,10 @@ namespace lite {

// MAX(lod.size()) = 32
const int XPU_MAX_LOD_SIZE = 32;
+// MAX(lod.size()) = 64 in XPU refactor
+const int XPU_MAX_LOD_SIZE_64 = 64;
// MAX(lod[i + 1] - lod[i]) = 512
const int XPU_MAX_LOD_SEQ_LEN = 512;
-// QUANT SCALE NUM == XPU CDNN NUM
-const int XPU_QUANT_SCALE_NUM = 6;

using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;

@@ -76,9 +76,9 @@ class TargetWrapper<TARGET(kXPU)> {
static XPUScratchPadGuard MallocScratchPad(size_t size);

static xdnn::Context* GetRawContext() {
-    if (tls_raw_ctx_ == nullptr) {
-      tls_raw_ctx_ = xdnn::create_context();
-      CHECK(tls_raw_ctx_);
+    if (tls_raw_ctx_.get() == nullptr) {
+      tls_raw_ctx_.reset(xdnn::create_context(), xdnn::destroy_context);
+      CHECK(tls_raw_ctx_.get());
if (l3_planner_ == nullptr) {
l3_planner_ = new XPUL3Planner;
}
@@ -100,8 +100,28 @@ class TargetWrapper<TARGET(kXPU)> {
local_l3_size = max_l3_size;
}
CHECK_LE(shared_l3_size, max_l3_size);
+      if (local_gm_size > 0) {
+        VLOG(3) << "Try To Malloc Local GM Workspace Size is " << local_gm_size;
+        void* local_gm_ptr = nullptr;
+        int ret =
+            xpu_malloc(reinterpret_cast<void**>(&local_gm_ptr), local_gm_size);
+        if (ret != 0 || local_gm_ptr == nullptr) {
+          VLOG(3) << "No Enough GM Workspace For Current Predictor.";
+        } else {
+          void* old_ptr = tls_raw_ctx_->_gm_mgr.get_ptr();
+          if (old_ptr != nullptr) {
+            TargetWrapperXPU::Free(old_ptr);
+          }
+          ret = tls_raw_ctx_->_gm_mgr.set(local_gm_ptr, local_gm_size);
+          if (ret != 0) {
+            LOG(WARNING) << "XPU GM Mgr Init Fail, Please Check Configuration.";
+            TargetWrapperXPU::Free(local_gm_ptr);
+            local_gm_ptr = nullptr;
+          }
+        }
+      }
}
-    return tls_raw_ctx_;
+    return tls_raw_ctx_.get();
}
static void MallocL3Cache(
const std::vector<std::vector<int64_t>>& query_shape);
@@ -131,7 +151,8 @@ class TargetWrapper<TARGET(kXPU)> {
// l3 cache config
static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size
static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size
-  static size_t shared_l3_size;  // model level l3 size
+  static LITE_THREAD_LOCAL size_t local_gm_size;
+  static size_t shared_l3_size;  // model level l3 size
static LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*>
l3_block_dict; // l3 cache block used between op layers

@@ -140,7 +161,7 @@ class TargetWrapper<TARGET(kXPU)> {
void* l3_ptr,
size_t l3_size,
const std::vector<std::vector<int64_t>>& query_shape);
-  static LITE_THREAD_LOCAL xdnn::Context* tls_raw_ctx_;
+  static LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> tls_raw_ctx_;
static LITE_THREAD_LOCAL void* local_l3_ptr_;
static void* shared_l3_ptr_;
static std::mutex mutex_l3_;
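The GM-workspace block added to `GetRawContext()` follows a try-or-fall-back policy: attempt to allocate a dedicated buffer, keep the context's existing workspace if the device is out of memory, and only after a successful allocation release the previously installed buffer. A condensed sketch of that control flow, using plain `malloc`/`free` as stand-ins for `xpu_malloc` and `TargetWrapperXPU::Free`:

```cpp
#include <cstdlib>
#include <iostream>

// Stand-in for the context's GM manager state.
static void* g_workspace = nullptr;

void install_workspace(size_t bytes) {
  void* fresh = std::malloc(bytes);  // xpu_malloc in the real code
  if (fresh == nullptr) {
    // Allocation failed: keep whatever workspace is already installed.
    std::cout << "no dedicated workspace, keeping the default\n";
    return;
  }
  // Allocation succeeded: release the old buffer before swapping in the
  // new one, mirroring the Free(old_ptr) / _gm_mgr.set() sequence above.
  if (g_workspace != nullptr) std::free(g_workspace);
  g_workspace = fresh;
  std::cout << "installed " << bytes << "-byte workspace\n";
}

int main() {
  install_workspace(0x4000000);  // 64 MiB, the default local_gm_size
  return 0;
}
```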
9 changes: 9 additions & 0 deletions lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc
@@ -91,6 +91,15 @@ class XPUFcFuser : public FuseBase {
} else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" ||
lite::TargetWrapperXPU::multi_encoder_precision == "int8") {
precision = "int8";
+      if (op_desc.HasAttr("enable_int8") &&
+          op_desc.GetAttr<bool>("enable_int8")) {
+        CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale";
+        CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale";
+        VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:"
+                << 127 * op_desc.GetAttr<std::vector<float>>("X0_scale")[0]
+                << ", WeightMax: "
+                << 127 * op_desc.GetAttr<std::vector<float>>("Y0_scale")[0];
+      }
VLOG(3) << "Use int8 in XPUFcOp";
}
#endif
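The scale attributes logged above follow the usual Paddle quantization convention: a per-tensor int8 scale such that the largest representable magnitude is `127 * scale`. A worked example with made-up scale values:

```cpp
#include <iostream>
#include <vector>

int main() {
  // Hypothetical quant scales as they would appear in X0_scale / Y0_scale.
  std::vector<float> x0_scale{0.05f};  // activation scale
  std::vector<float> y0_scale{0.02f};  // weight scale

  // Largest magnitude an int8 value can represent under each scale.
  float input_max = 127 * x0_scale[0];   // 6.35
  float weight_max = 127 * y0_scale[0];  // 2.54

  std::cout << "InputMax: " << input_max
            << ", WeightMax: " << weight_max << "\n";
  return 0;
}
```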
@@ -59,10 +59,17 @@ class XPULinkConvMaxFuser : public FuseBase {
public:
explicit XPULinkConvMaxFuser(bool with_branch) { with_branch_ = with_branch; }
void BuildPattern() override {
+    auto non_quant_teller = [](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      return (!op_desc.HasAttr("enable_int8") ||
+              !op_desc.GetAttr<bool>("enable_int8"));
+    };

auto* input =
VarNode("input")->assert_is_op_input("__xpu__conv2d", "Input");
auto* xpu_fusion_op =
OpNode("xpu_fusion_op", "__xpu__conv2d")
+            ->assert_node_satisfied(non_quant_teller)
->assert_op_attr<bool>("has_branch", with_branch_);

PMNode* branch = nullptr;
@@ -100,8 +107,14 @@ class XPULinkConvMaxFuser : public FuseBase {
class XPULinkFcMaxFuser : public FuseBase {
public:
void BuildPattern() override {
+    auto non_quant_teller = [](const Node* node) -> bool {
+      auto op_desc = *const_cast<Node*>(node)->stmt()->op_info();
+      return (!op_desc.HasAttr("enable_int8") ||
+              !op_desc.GetAttr<bool>("enable_int8"));
+    };
auto* input = VarNode("input")->assert_is_op_input("__xpu__fc", "Input");
-    auto* xpu_fusion_op = OpNode("xpu_fusion_op", "__xpu__fc");
+    auto* xpu_fusion_op = OpNode("xpu_fusion_op", "__xpu__fc")
+                              ->assert_node_satisfied(non_quant_teller);

*input >> *xpu_fusion_op;
}
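Both fusers now gate the pattern with the same `non_quant_teller`: a node matches only if it lacks `enable_int8` or carries it as false, so quantized `__xpu__conv2d`/`__xpu__fc` ops are excluded from the link-max fusion. A standalone sketch of that predicate's logic, with a plain attribute map standing in for the pass framework's `OpInfo`:

```cpp
#include <iostream>
#include <map>
#include <string>

// A node is "non-quant" unless it explicitly sets enable_int8 = true;
// a missing attribute counts as non-quant, exactly like the teller.
bool is_non_quant(const std::map<std::string, bool>& attrs) {
  auto it = attrs.find("enable_int8");
  return it == attrs.end() || !it->second;
}

int main() {
  std::cout << is_non_quant({}) << "\n";                        // 1: no attr
  std::cout << is_non_quant({{"enable_int8", false}}) << "\n";  // 1: matches
  std::cout << is_non_quant({{"enable_int8", true}}) << "\n";   // 0: skipped
  return 0;
}
```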