diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 87b943abd0106..e9cb7d325f711 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -6,28 +6,34 @@ if(WITH_NV_JETSON)
   add_definitions(-DWITH_NV_JETSON)
   set(paddle_known_gpu_archs "53 62 72")
   set(paddle_known_gpu_archs10 "53 62 72")
+  set(paddle_known_gpu_archs11 "53 62 72 87")
+  set(paddle_known_gpu_archs12 "53 62 72 87 90")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Packge")
   add_definitions(-DNEW_RELEASE_ALL)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Packge")
   add_definitions(-DNEW_RELEASE_PYPI)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
   set(paddle_known_gpu_archs10 "")
-  set(paddle_known_gpu_archs11 "60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "61 70 75 80")
+  set(paddle_known_gpu_archs12 "61 70 75 80 90")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Packge")
   add_definitions(-DNEW_RELEASE_JIT)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 60 70 75")
-  set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 60 70 75")
+  set(paddle_known_gpu_archs11 "50 60 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
 else()
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
-  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs "70 80")
+  set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "70 80")
 endif()
 
 ######################################################################################
@@ -98,12 +104,12 @@ endfunction()
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
   set(archs_names
-      "Kepler"
       "Maxwell"
       "Pascal"
       "Volta"
       "Turing"
       "Ampere"
+      "Hopper"
       "All"
       "Manual")
   set(archs_name_default "Auto")
@@ -142,9 +148,7 @@ function(select_nvcc_arch_flags out_variable)
     unset(CUDA_ARCH_PTX CACHE)
   endif()
 
-  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
-    set(cuda_arch_bin "30 35")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+  if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     if(WITH_NV_JETSON)
       set(cuda_arch_bin "53")
     else()
@@ -165,11 +169,17 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
-      set(cuda_arch_bin "80")
-    elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
-      set(cuda_arch_bin "80 86")
+    if(WITH_NV_JETSON)
+      set(cuda_arch_bin "87")
+    else()
+      if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+        set(cuda_arch_bin "80")
+      else()
+        set(cuda_arch_bin "80 86")
+      endif()
     endif()
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
+    set(cuda_arch_bin "90")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -186,6 +196,13 @@ function(select_nvcc_arch_flags out_variable)
     set(cuda_arch_bin ${CUDA_ARCH_BIN})
   endif()
 
+  # cuda11.4
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.6)
+    set(cuda_arch_bin "70 80")
+  else()
+    set(cuda_arch_bin "70 80 90")
+  endif()
+
   if(NEW_RELEASE_JIT)
     set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
     set(cuda_arch_bin "")
@@ -249,6 +266,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
+  set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 90")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
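Note: the arch lists selected above (cuda_arch_bin / cuda_arch_ptx) are later expanded by select_nvcc_arch_flags into nvcc -gencode flags. As a rough, standalone illustration of that expansion (this is a sketch, not the code from cuda.cmake; the variable names are reused only for readability):

# Sketch: turn a space-separated arch list such as "70 80 90" into SASS
# -gencode flags, the way an nvcc arch helper typically would.
# Runnable standalone with: cmake -P sketch.cmake
set(cuda_arch_bin "70 80 90")
string(REGEX REPLACE "[ \t]+" ";" _arch_list "${cuda_arch_bin}")
set(_nvcc_flags "")
foreach(_arch IN LISTS _arch_list)
  # One SASS entry per listed arch; upstream also emits a PTX entry for the
  # newest arch, omitted here for brevity.
  list(APPEND _nvcc_flags "-gencode arch=compute_${_arch},code=sm_${_arch}")
endforeach()
message(STATUS "gencode flags: ${_nvcc_flags}")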
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index cd7b254892ed1..69a1058d0db0f 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -25,8 +25,18 @@ set(GLOO_LIBRARY_DIR
     "${GLOO_INSTALL_DIR}/lib"
     CACHE PATH "gloo library directory." FORCE)
 # As we add extra features for gloo, we use the non-official repo
-set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
-set(GLOO_TAG v0.0.2)
+if(WITH_GPU)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
+    set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
+    set(GLOO_TAG v0.0.2)
+  else()
+    set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
+    set(GLOO_TAG v0.0.3)
+  endif()
+else()
+  set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
+  set(GLOO_TAG v0.0.2)
+endif()
 set(GLOO_LIBRARIES
     "${GLOO_INSTALL_DIR}/lib/libgloo.a"
     CACHE FILEPATH "gloo library." FORCE)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index c7a4e1d99bff1..4eeac3515d160 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -23,7 +23,15 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+if(WITH_GPU)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
+    set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+  else()
+    set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
+  endif()
+else()
+  set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+endif()
 
 set(WARPCTC_INCLUDE_DIR
     "${WARPCTC_INSTALL_DIR}/include"
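Both gloo.cmake and warpctc.cmake above apply the same pattern: keep the legacy repository/tag for CUDA older than 12.0 (and for CPU-only builds), and switch to a CUDA 12 compatible tag otherwise. A hedged sketch of that selection logic factored into a helper (the function name and its usage are illustrative only, not part of this patch):

# Hypothetical helper mirroring the CUDA-version gate used above.
function(select_tag_by_cuda out_var legacy_tag cuda12_tag)
  if(WITH_GPU)
    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
      set(${out_var} ${legacy_tag} PARENT_SCOPE)
    else()
      set(${out_var} ${cuda12_tag} PARENT_SCOPE)
    endif()
  else()
    set(${out_var} ${legacy_tag} PARENT_SCOPE)
  endif()
endfunction()
# Usage sketch: select_tag_by_cuda(GLOO_TAG v0.0.2 v0.0.3)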
"${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") @@ -173,9 +177,9 @@ if(WITH_XPU_BKCL) set(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}") set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") include_directories(${XPU_BKCL_INC_DIR}) - target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_BKCL_LIB} -Wl,--pop-state) + target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_BKCL_LIB} ${XPU_ML_LIB} -Wl,--pop-state) else() - target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} -Wl,--pop-state) + target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_ML_LIB} -Wl,--pop-state) endif() add_dependencies(xpulib ${XPU_PROJECT}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c7bc72a9c959b..fde58e2d56183 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -317,8 +317,7 @@ if(WITH_ONNXRUNTIME) endif() if(WITH_GPU) - if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION} - GREATER_EQUAL 11.6) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() diff --git a/cmake/version.cmake b/cmake/version.cmake index 83bd3f1b1bc4a..88e767b968bd9 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -71,3 +71,27 @@ math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000 add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION}) add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER}) message(STATUS "Paddle version is ${PADDLE_VERSION}") + +#add git version +set(COMMIT_HASH "") +set(BRANCH_NAME "") +find_package(Git QUIET) +if(GIT_FOUND) +execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%H + OUTPUT_VARIABLE COMMIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +execute_process( + COMMAND ${GIT_EXECUTABLE} symbolic-ref --short -q HEAD + OUTPUT_VARIABLE BRANCH_NAME + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +endif() +message(STATUS "Git version is ${BRANCH_NAME}:${COMMIT_HASH}") +add_definitions(-DPADDLE_BRANCH_NAME="${BRANCH_NAME}") +add_definitions(-DPADDLE_COMMIT_HASH="${COMMIT_HASH}") diff --git a/paddle/fluid/framework/boxps_trainer.cc b/paddle/fluid/framework/boxps_trainer.cc index 76a3e7c43057d..3558eebbfe0c7 100644 --- a/paddle/fluid/framework/boxps_trainer.cc +++ b/paddle/fluid/framework/boxps_trainer.cc @@ -15,10 +15,10 @@ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/box_wrapper.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer_desc.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/io/fs.h" DECLARE_bool(enable_binding_train_cpu); namespace paddle { @@ -94,7 +94,17 @@ void BoxPSTrainer::InitOtherEnv(const ProgramDesc& main_program) { } VLOG(3) << "init other env done."; } - +// dump thread pool +inline std::shared_ptr& GetDumpThreadPool( + int thread_num) { + static std::shared_ptr dump_thread_pool = + nullptr; + if (dump_thread_pool != nullptr) { + return dump_thread_pool; + } + dump_thread_pool.reset(new 
paddle::framework::ThreadPool(thread_num)); + return dump_thread_pool; +} std::string BoxPSTrainer::GetDumpPath(int tid) { return string::format_string("%s/part-%05d", dump_fields_path_.c_str(), tid); } @@ -104,8 +114,8 @@ void BoxPSTrainer::DumpWork(int tid) { int fileid = 0; size_t file_size = 0; while (!is_finish) { - std::string path = string::format_string("%s/part-%05d-%05d", - dump_fields_path_.c_str(), tid, fileid++); + std::string path = string::format_string( + "%s/part-%05d-%05d", dump_fields_path_.c_str(), tid, fileid++); int err_no = 0; std::shared_ptr fp = fs_open_write(path, &err_no, dump_converter_); // split dump file size @@ -134,36 +144,53 @@ void BoxPSTrainer::InitDumpEnv() { workers_[i]->SetChannelWriter(queue_.get()); } // TODO(hutuxian): should make it as a config + dump_futures_.clear(); + auto pool = GetDumpThreadPool(dump_thread_num_); for (int i = 0; i < dump_thread_num_; i++) { - dump_thread_.push_back( - std::thread(std::bind(&TrainerBase::DumpWork, this, i))); + dump_futures_.emplace_back(pool->Run([this, i]() { this->DumpWork(i); })); } VLOG(0) << "init dump write file thread num=" << dump_thread_num_; } - -void BoxPSTrainer::CopyParameters(const Scope& root_scope, int device_id) { - Scope* thread_scope = GetWorkerScope(device_id); - for (const std::string& name : *param_need_sync_) { - const LoDTensor& root_tensor = root_scope.FindVar(name)->Get(); - - // TODO(hutxian): check a new var of the same name is created in - LoDTensor* gpu_tensor = thread_scope->Var(name)->GetMutable(); - platform::Place place = platform::CUDAPlace(device_id); - TensorCopy(*static_cast(&root_tensor), place, - static_cast(gpu_tensor)); +// final dump env +void BoxPSTrainer::FinalizeDumpEnv() { + queue_->Close(); + for (auto& th : dump_futures_) { + th.get(); } + dump_futures_.clear(); + queue_.reset(); + VLOG(0) << "finalize dump write file thread"; } -void BoxPSTrainer::DumpParameters(void) { - Scope* thread_scope = GetWorkerScope(0); - for (const auto& var : persistable_vars_) { - auto* root_tensor = root_scope_->Var(var)->GetMutable(); - // TODO(hutuxian): Add a final all-reduce? 
- const auto& thread_tensor = thread_scope->FindVar(var)->Get(); - TensorCopy(thread_tensor, root_tensor->place(), root_tensor); +inline std::vector>& +GetThreadPool(int thread_num) { + static std::vector> + thread_pools; + if (!thread_pools.empty()) { + return thread_pools; } + thread_pools.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + thread_pools[i].reset(new paddle::framework::ThreadPool(1)); + } + if (!FLAGS_enable_binding_train_cpu) { + return thread_pools; + } + std::vector& train_cores = boxps::get_train_cores(); + if (train_cores.size() < static_cast(thread_num)) { + return thread_pools; + } + std::vector ncores; + for (int i = 0; i < thread_num; ++i) { + ncores.push_back(train_cores[i]); + if (train_cores.size() / 2 == static_cast(thread_num)) { + ncores.push_back(train_cores[i + thread_num]); + } + thread_pools[i]->SetCPUAffinity(ncores, false); + ncores.clear(); + } + return thread_pools; } - void BoxPSTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { PADDLE_ENFORCE(root_scope_, "Null root_scope pointer"); @@ -183,51 +210,75 @@ void BoxPSTrainer::InitTrainerEnv(const ProgramDesc& main_program, std::set async_param_name; if (async_mode_) { - async_param_name = dense_table_->Init(*root_scope_, *param_need_sync_.get(), + async_param_name = dense_table_->Init(*root_scope_, + *param_need_sync_.get(), persistable_vars_, async_grad_name_); } + auto pool = GetThreadPool(thread_num_); + wait_futures_.clear(); + CHECK(static_cast(pool.size()) == thread_num_); for (int i = 0; i < thread_num_; ++i) { - auto this_worker = - std::dynamic_pointer_cast(workers_[i]); - this_worker->SetRootScope(root_scope_); - if (async_mode_) { - this_worker->SetDenseTable(dense_table_.get()); - this_worker->SetAsyncParamName(async_param_name); - } - this_worker->CreateDeviceResource(main_program); - // CopyParameters(*root_scope_, i); - } -} -inline std::vector>& -GetThreadPool(int thread_num) { - static std::vector> - thread_pools; - if (!thread_pools.empty()) { - return thread_pools; + wait_futures_.emplace_back( + pool[i]->Run([this, i, &async_param_name, &main_program]() { + auto this_worker = + std::dynamic_pointer_cast( + workers_[i]); + this_worker->SetRootScope(root_scope_); + if (async_mode_) { + this_worker->SetDenseTable(dense_table_.get()); + this_worker->SetAsyncParamName(async_param_name); + } + this_worker->CreateDeviceResource(main_program); + })); } - thread_pools.resize(thread_num); - for (int i = 0; i < thread_num; ++i) { - thread_pools[i].reset(new paddle::framework::ThreadPool(1)); - } - if (!FLAGS_enable_binding_train_cpu) { - return thread_pools; + RemoveOtherDeviceVars(main_program, root_scope_); + for (auto& th : wait_futures_) { + th.get(); } - std::vector& train_cores = boxps::get_train_cores(); - if (train_cores.size() < static_cast(thread_num)) { - return thread_pools; + VLOG(0) << "InitTrainerEnv done!"; +} + +void BoxPSTrainer::RemoveOtherDeviceVars(const ProgramDesc& main_program, + Scope* root_scope) { + std::vector remove_vars; + std::unordered_set unpersist_var_names; + auto& block = main_program.Block(0); + auto all_desc = block.AllOps(); + auto box_wrapper = BoxWrapper::GetInstance(); + int rank_id = box_wrapper->GetMpiRank(); + int gum_num = box_wrapper->GetGpuNum(); + // 1. 
Get other device's Param + for (auto& op_desc : all_desc) { + // broadcast op + if (op_desc->Type() != "c_broadcast") { + continue; + } + int root_id = op_desc->GetAttrIfExists("root"); + if ((root_id / gum_num) == rank_id) { + continue; + } + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + unpersist_var_names.insert(name); + } + } } - std::vector ncores; - for (int i = 0; i < thread_num; ++i) { - ncores.push_back(train_cores[i]); - if (train_cores.size() / 2 == static_cast(thread_num)) { - ncores.push_back(train_cores[i + thread_num]); + VLOG(0) << "root scope remove_params size = " << unpersist_var_names.size(); + // 2. Get moment param + for (auto& unpersist_var_name : unpersist_var_names) { + for (auto& var : block.AllVars()) { + std::string name = var->Name(); + if (var->Persistable() && name.find(unpersist_var_name) == 0) { + remove_vars.push_back(name); + } } - thread_pools[i]->SetCPUAffinity(ncores, false); - ncores.clear(); } - return thread_pools; + if (remove_vars.empty()) return; + VLOG(0) << "root scope remove_vars's size = " << remove_vars.size(); + root_scope->EraseVars(remove_vars); } + void BoxPSTrainer::Run() { VLOG(3) << "Going to run"; auto pool = GetThreadPool(thread_num_); @@ -242,11 +293,16 @@ void BoxPSTrainer::Run() { pool[i]->Run([this, i]() { workers_[i]->TrainFilesWithProfiler(); })); } } + for (auto& th : wait_futures_) { + th.get(); + } } void BoxPSTrainer::Finalize() { - for (auto& th : wait_futures_) { - th.get(); + for (int i = 0; i < thread_num_; ++i) { + auto this_worker = + std::dynamic_pointer_cast(workers_[i]); + this_worker->Finalize(); } if (async_mode_) { // must be after train thread, otherwise the ps_buffer_ will be closed first @@ -255,14 +311,12 @@ void BoxPSTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } - DumpParameters(); root_scope_->DropKids(); } Scope* BoxPSTrainer::GetWorkerScope(int thread_id) { return workers_[thread_id]->GetThreadScope(); } - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 4ad91e2e90881..4ca0ad514bdb2 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -41,13 +41,31 @@ limitations under the License. 
*/ #include #include #endif +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/program_utils.h" +DECLARE_bool(enable_dump_main_program); DECLARE_bool(enable_sync_dense_moment); DECLARE_bool(check_nan_inf); -PADDLE_DEFINE_EXPORTED_bool(padbox_enable_gc, false, "enable paddlebox gc"); +DECLARE_bool(lineid_have_extend_info); +DECLARE_bool(dump_filed_same_as_aibox); +PADDLE_DEFINE_EXPORTED_bool(padbox_enable_gc, true, "enable paddlebox gc"); +PADDLE_DEFINE_EXPORTED_bool(padbox_enable_print_op_debug, + false, + "enable print op debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool(enable_print_dump_field_debug, + false, + "enable print dump field debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool(enable_print_dump_info_debug, + false, + "enable print dump info debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool( + padbox_enable_sharding_stage, + true, + "enable sharding stage step1 only param and grad split, default false"); + namespace paddle { namespace framework { - BoxPSAsynDenseTable::BoxPSAsynDenseTable(const int device_num) : device_num_(device_num) { int buffer_size = device_num * 4; // magic number @@ -358,7 +376,28 @@ void BoxPSAsynDenseTable::InitThreadGroup() { } thread_pool.reset(new paddle::framework::ThreadPool(thread_num_)); } - +//======================== BoxPSWorker ====================== +// init +void BoxPSWorker::MemoryShareTensor::init(const std::string& name, + const platform::Place& place, + const int64_t& total_len, + Scope* root_scope) { + char szname[512] = {0}; + snprintf(szname, + sizeof(szname), + "paddlebox_boxps_worker_share_%s_%d", + name.c_str(), + place.GetDeviceId()); + data_tensor_ = root_scope->Var(szname)->GetMutable(); + data_tensor_->mutable_data({total_len, 1}, place); +} +// share +phi::DenseTensor& BoxPSWorker::MemoryShareTensor::share( + phi::DenseTensor* gpu_tensor, const size_t& len) { + gpu_tensor->ShareDataWith(data_tensor_->Slice(offset_, offset_ + len)); + offset_ += len; + return *gpu_tensor; +} static const int DenseKStepNode = 1; static const int DenseKStepALL = 2; static const int DenseDataNormal = 3; @@ -371,11 +410,32 @@ void BoxPSWorker::Initialize(const TrainerDesc& desc) { } VLOG(1) << "boxps_worker init device num: " << device_num_; } - +void BoxPSWorker::Finalize() { + if (sharding_mode_ || device_id_ == 0) { + for (auto& name : need_copy_vars_) { + Variable* root_var = root_scope_->FindVar(name); + if (root_var == nullptr) { + continue; + } + auto root_tensor = root_var->GetMutable(); + Variable* var = thread_scope_->FindVar(name); + auto tensor = var->Get(); + TensorCopy(tensor, root_tensor->place(), root_tensor); + } + dev_ctx_->Wait(); + } +} void BoxPSWorker::SetDenseTable(BoxPSAsynDenseTable* dense) { dense_table_ = dense; } - +inline bool IsDataNormParam(const std::string& name) { + if (name.find(".batch_size") != std::string::npos || + name.find(".batch_sum") != std::string::npos || + name.find(".batch_square_sum") != std::string::npos) { + return true; + } + return false; +} int BoxPSWorker::CheckNeedParam(VarDesc* var) { if (!var->Persistable()) { return 0; @@ -413,8 +473,9 @@ int BoxPSWorker::CheckNeedParam(VarDesc* var) { return 0; } -int64_t BoxPSWorker::AllocParamTensor(int64_t* pad_len) { - auto& block = program_->Block(0); +int64_t BoxPSWorker::AllocParamTensor(const ProgramDesc& program, + int64_t* pad_len) { + auto& block = program.Block(0); // init var and copy persistable int64_t total_param_len = 0; int64_t total_moment_len = 0; @@ -449,12 +510,12 @@ int64_t 
BoxPSWorker::AllocParamTensor(int64_t* pad_len) { << ", sync length:" << all_sync_param_len << ", sync mode:" << sync_mode_ << ", node size:" << node_size_ << ", device num:" << device_num_ << ", one ring:" << one_ring_; - param_sync_.mutable_data({all_sync_param_len, 1}, place_); + param_sync_.init("total_param_sync", place_, all_sync_param_len, root_scope_); return total_param_len; } -int64_t BoxPSWorker::AllocParamTensorAsync() { - auto& block = program_->Block(0); +int64_t BoxPSWorker::AllocParamTensorAsync(const ProgramDesc& program) { + auto& block = program.Block(0); // init var and copy persistable int64_t total_param_len = 0; for (auto& var : block.AllVars()) { @@ -475,17 +536,263 @@ int64_t BoxPSWorker::AllocParamTensorAsync() { CHECK(total_param_len > 0) << "error param total zero"; CHECK(dense_table_->GetParamTotalLen() == total_param_len); - param_async_.mutable_data({total_param_len, 1}, place_); - grad_async_.mutable_data({total_param_len, 1}, place_); + param_async_.init("total_param_async", place_, total_param_len, root_scope_); + grad_async_.init("total_grad_async", place_, total_param_len, root_scope_); return total_param_len; } -void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { - program_.reset(new ProgramDesc(main_prog)); - auto& block = program_->Block(0); - for (auto& op_desc : block.AllOps()) { +int BoxPSWorker::IsParameter(const std::string& name, bool full_match) { + if (full_match) { + auto it = params2rootid_.find(name); + if (it == params2rootid_.end()) { + return -1; + } + if (it->second == nccl_rank_id_) { + return 1; + } + return 0; + } else { + // moment, acc + for (auto it = params2rootid_.begin(); it != params2rootid_.end(); ++it) { + if (strncmp(name.c_str(), it->first.c_str(), it->first.length()) != 0) { + continue; + } + if (it->second == nccl_rank_id_) { + return 1; + } + return 0; + } + return -1; + } +} + +static bool FindVarInMap(const VariableNameMap& op_var_map, + const std::multiset& var_set) { + for (auto& o : op_var_map) { + for (auto& name : o.second) { + if (var_set.find(name) != var_set.end()) { + return true; + } + } + } + return false; +} + +static bool IsAvgOp(OpDesc* op_desc) { + if (op_desc->Type() != "elementwise_add" && + op_desc->Type() != "elementwise_mul") { + return false; + } + for (auto& o : op_desc->Outputs()) { + for (auto& name : o.second) { + if (name.find("avg_weight") != std::string::npos || + name.find("@avg") != std::string::npos) { + return true; + } + } + } + return false; +} +void BoxPSWorker::BuildShardingDepends(const ProgramDesc& program) { + nccl_rank_id_ = place_.GetDeviceId(); +#if defined(PADDLE_WITH_CUDA) + auto box_wrapper = BoxWrapper::GetInstance(); + nccl_rank_id_ = box_wrapper->GetNCCLRankId(nccl_rank_id_); +#endif + + auto& block = program.Block(0); + auto all_desc = block.AllOps(); + + for (auto& op_desc : all_desc) { + // broadcast op + if (op_desc->Type() != "c_broadcast") { + continue; + } + int root_id = op_desc->GetAttrIfExists("root"); + int ring_id = op_desc->GetAttrIfExists("ring_id"); + if (ring_id >= 0 && ring_id != ring_id_) { + ring_id_ = ring_id; + } + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + auto var = block.FindVar(name); + if (!var->Persistable() || !var->IsParameter()) { + continue; + } + if (params2rootid_.find(name) != params2rootid_.end()) { + auto it = params2rootid_.find(name); + if (it->second != root_id) { + std::cout << "error: param name conflict" << std::endl; + } + continue; + } + params2rootid_.insert(std::make_pair(name, 
root_id)); + } + } + } + if (params2rootid_.empty()) { + return; + } + sharding_mode_ = true; + size_t copy_param_cnt = 0; + // check find + for (auto& var : block.AllVars()) { + if (!var->Persistable()) { + continue; + } + std::string name = var->Name(); + int ret = IsParameter(name, var->IsParameter()); + if (ret < 0 || ret == 1) { + if (ret == 1) { + persist_param_vars_.insert(name); + } + continue; + } + if (var->IsParameter()) { + // persist parameter,eg: data_norm, learning_rate + if (IsDataNormParam(name) || + name.find("learning_rate") != std::string::npos) { + ++copy_param_cnt; + } else { + unpersist_vars_.insert(name); + } + } else { + // adam ubmq1_h2_param.b_0_moment1_0, avg_weight @avg @w_backup + remove_vars_.insert(name); + } + } + + std::multiset all_remove_inputs; + for (auto& op_desc : all_desc) { + if (FindVarInMap(op_desc->Inputs(), remove_vars_)) { + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + all_remove_inputs.insert(name); + } + } + remove_ops_.insert(op_desc); + } else if (IsAvgOp(op_desc) && + (FindVarInMap(op_desc->Outputs(), remove_vars_) || + FindVarInMap(op_desc->Inputs(), unpersist_vars_))) { + remove_ops_.insert(op_desc); + } + } + + size_t total_scale_cnt = 0; + size_t remove_scale_cnt = 0; + // remove scale op + for (auto& op_desc : all_desc) { + if (op_desc->Type() != "scale") { + continue; + } + ++total_scale_cnt; + // check scale output + for (auto& name : op_desc->Output("Out")) { + if (all_remove_inputs.find(name) == all_remove_inputs.end()) { + continue; + } + ++remove_scale_cnt; + remove_ops_.insert(op_desc); + break; + } + } + // stage1 + if (FLAGS_padbox_enable_sharding_stage) { + std::multiset broadcast_vars; + for (auto& op_desc : all_desc) { + if (op_desc->Type() != "c_broadcast") { + continue; + } + bool find = false; + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + auto it = broadcast_vars.find(name); + if (it != broadcast_vars.end()) { + find = true; + continue; + } + broadcast_vars.insert(name); + } + } + if (find) { + remove_ops_.insert(op_desc); + } + } + } + + // reset dump param + if (need_dump_param_ && dump_param_ != nullptr) { + for (auto& name : *dump_param_) { + auto var = block.FindVar(name); + if (var == nullptr) { + continue; + } + std::string new_name = name; + size_t pos = new_name.find("@"); + if (pos > 0) { + new_name = name.substr(0, pos); + } + if (persist_param_vars_.find(new_name) == persist_param_vars_.end()) { + continue; + } + shard_dump_params_.push_back(name); + } + dump_param_ = &shard_dump_params_; + } + // reset dump fields + if (need_dump_field_ && dump_fields_ != nullptr) { + for (auto& name : *dump_fields_) { + auto var = block.FindVar(name); + if (var == nullptr) { + continue; + } + if (remove_vars_.find(name) != remove_vars_.end()) { + continue; + } + shard_dump_fields_.push_back(name); + } + dump_fields_ = &shard_dump_fields_; + } + VLOG(3) << "device id=" << int(place_.GetDeviceId()) + << ", nccl rank=" << nccl_rank_id_ + << ", total param count=" << params2rootid_.size() + << ", remove op count=" << remove_ops_.size() + << ", total scale op=" << total_scale_cnt << ", remove " + << remove_scale_cnt << ", remove var count=" << remove_vars_.size() + << ", unpersist var count=" << unpersist_vars_.size() + << ", dump param count=" << shard_dump_params_.size() + << ", dump fields count=" << shard_dump_fields_.size(); +} +inline bool IsCommunicationOp(const std::string& op_name) { + if (op_name == "c_broadcast" || op_name == "c_reduce_sum" || + op_name == 
"c_allreduce_sum") { + return true; + } + return false; +} +inline bool IsSyncStreamOp(const std::string& op_name) { + if (op_name == "c_sync_comm_stream" || op_name == "c_sync_calc_stream") { + return true; + } + return false; +} +void BoxPSWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + + size_t op_index = 0; + auto ops_descs = block.AllOps(); + for (auto& op_desc : ops_descs) { + // skip remove ops + if (remove_ops_.find(op_desc) != remove_ops_.end()) { + continue; + } + std::string op_name = op_desc->Type(); + // single stream not need sync + if (IsSyncStreamOp(op_name)) { + continue; + } // skip feed fetch op - if (op_desc->Type() == "feed" || op_desc->Type() == "fetch") { + if (op_name == "feed" || op_name == "fetch") { for (auto& o : op_desc->Inputs()) { skip_vars_.insert(skip_vars_.end(), o.second.begin(), o.second.end()); } @@ -494,6 +801,29 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { } } ops_.push_back(OpRegistry::CreateOp(*op_desc)); + // change to device stream + if (IsCommunicationOp(op_name)) { + ops_[op_index]->SetAttr("use_calc_stream", true); + } + ++op_index; + } + // add stream sync point + bool find = false; + for (size_t op_id = 0; op_id < ops_.size(); ++op_id) { + auto& op = ops_[op_id]; + std::string op_name = op->Type(); + if (!IsCommunicationOp(op_name)) { + if (find) { + find = false; + sync_points_.insert(op.get()); + } + continue; + } + if (find) { + continue; + } + find = true; + sync_points_.insert(op.get()); } // skip dump fields if (need_dump_field_ && dump_fields_ != nullptr) { @@ -512,19 +842,23 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { skip_vars_.insert( skip_vars_.end(), monitor_vars.begin(), monitor_vars.end()); } - - int64_t pad_len = 0; - if (sync_mode_ > 0) { - AllocParamTensor(&pad_len); - } else if (dense_table_) { - AllocParamTensorAsync(); + if (FLAGS_padbox_enable_gc) { + // add op gc vars + unused_vars_ = GetUnusedVars(block, ops_, skip_vars_, &unpersist_vars_); } + VLOG(3) << "device[" << device_id_ << "] total op count=" << block.OpSize() + << ", create op count=" << ops_.size() + << ", skip vars count=" << skip_vars_.size() + << ", unused vars count=" << unused_vars_.size(); +} + +void BoxPSWorker::CreateThreadScopeForAsync(const ProgramDesc& program) { + AllocParamTensorAsync(program); thread_scope_ = &(root_scope_->NewScope()); - int64_t offset = 0; - int64_t grad_offset = 0; // make param and param@GRAD in same order + auto& block = program.Block(0); std::vector sorted_var = block.AllVars(); std::sort(sorted_var.begin(), sorted_var.end(), @@ -541,18 +875,19 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { return var1->Name() < var2->Name(); } }); + // init var and copy persistable int grad_var_num = 0; int var_num = 0; int persistable_num = 0; - int share_var_num = 0; - int64_t share_persistable_len = 0; int64_t total_persistable_len = 0; + int param_total = 0; + for (auto& var : sorted_var) { std::string name = var->Name(); + ++param_total; if (!var->Persistable()) { - if (dense_table_ && - async_param_name_.find(name) != async_param_name_.end()) { + if (async_param_name_.find(name) != async_param_name_.end()) { // parm@GRAD can not find in root_scope_ use parm length replace VLOG(3) << "device[" << device_id_ << "] grad var name " << name; const LoDTensor& root_tensor = @@ -562,10 +897,7 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { thread_scope_->Var(name)->GetMutable(); auto 
dim = root_tensor.dims(); size_t len = root_tensor.numel(); - gpu_tensor - ->ShareDataWith(grad_async_.Slice(grad_offset, grad_offset + len)) - .Resize(dim); - grad_offset += len; + grad_async_.share(gpu_tensor, len).Resize(dim); grad_var_num += 1; skip_vars_.push_back(name); } else { @@ -573,75 +905,291 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { InitializeVariable(ptr, var->GetType()); } } else { - const LoDTensor& root_tensor = - root_scope_->FindVar(name)->Get(); - size_t len = root_tensor.numel(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); + ++persistable_num; + total_persistable_len += len; + + LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); + if (async_param_name_.find(name) != async_param_name_.end()) { + VLOG(3) << "device[" << device_id_ << "] Persistable var name " << name; + auto dim = root_tensor->dims(); + param_async_.share(gpu_tensor, len).Resize(dim); + var_num += 1; + skip_vars_.push_back(name); + } + // only support copy + TensorCopy(*static_cast(root_tensor), + place_, + static_cast(gpu_tensor)); + } + } + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", param_async_ offset:" << param_async_.offset_ + << ", grad_offset: " << grad_async_.offset_ + << ", var_num: " << var_num << ", grad_var_num: " << grad_var_num; + CHECK(param_async_.offset_ <= param_async_.numel()); + CHECK(grad_async_.offset_ <= grad_async_.numel()); +} +void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) { + int64_t pad_len = 0; + if (sync_mode_ > 0) { + AllocParamTensor(program, &pad_len); + } + thread_scope_ = &(root_scope_->NewScope()); + + auto& block = program.Block(0); + std::vector all_vars = block.AllVars(); + + // init var and copy persistable + int persistable_num = 0; + int share_var_num = 0; + int64_t share_persistable_len = 0; + int64_t total_persistable_len = 0; + int param_total = 0; + int copy_persist_num = 0; + + for (auto& var : all_vars) { + std::string name = var->Name(); + ++param_total; + if (var->Persistable()) { + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); ++persistable_num; total_persistable_len += len; - // add gc skip vars - skip_vars_.push_back(name); LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); if (sync_mode_ > 0) { if (CheckNeedParam(var)) { - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; + auto dim = root_tensor->dims(); + param_sync_.share(gpu_tensor, len).Resize(dim); + skip_vars_.push_back(name); } - } else if (dense_table_) { - if (async_param_name_.find(name) != async_param_name_.end()) { - VLOG(3) << "device[" << device_id_ << "] Persistable var name " - << name; - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(param_async_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; - var_num += 1; + } + // data norm copy and learning rate + if (!gpu_tensor->initialized() && place_ == root_tensor->place()) { + auto 
dim = root_tensor->dims(); + gpu_tensor->ShareDataWith(*root_tensor).Resize(dim); + ++share_var_num; + share_persistable_len += len; + } else { + TensorCopy(*static_cast(root_tensor), + place_, + static_cast(gpu_tensor)); + ++copy_persist_num; + // add copy back to root scope + if (device_id_ == 0) { + need_copy_vars_.push_back(name); + skip_vars_.push_back(name); } } - if (!gpu_tensor->initialized() && place_ == root_tensor.place()) { - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(root_tensor).Resize(dim); + } else { + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); + } + } + if (sync_mode_ > 0) { + CHECK(param_sync_.offset_ <= (param_sync_.numel() - pad_len)); + } + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=[total:" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", share:" << share_var_num << "(" + << share_persistable_len / 262144.0 << "MB)" + << ", copy:" << copy_persist_num << "]"; +} +void BoxPSWorker::CreateThreadScopeForSharding(const ProgramDesc& program) { + int64_t pad_len = 0; + if (sync_mode_ > 0) { + AllocParamTensor(program, &pad_len); + } + thread_scope_ = &(root_scope_->NewScope()); + + auto& block = program.Block(0); + std::vector all_vars = block.AllVars(); + + // init var and copy persistable + int persistable_num = 0; + int share_var_num = 0; + int64_t share_persistable_len = 0; + int64_t total_persistable_len = 0; + int persist_reset = 0; + int param_total = 0; + int unpersist_num = 0; + int copy_persist_num = 0; + int64_t real_persist_len = 0; + int real_persist_num = 0; + int delete_vars_num = 0; + + for (auto& var : all_vars) { + std::string name = var->Name(); + all_vars_.push_back(name); + if (remove_vars_.find(name) != remove_vars_.end()) { + ++delete_vars_num; + continue; + } + thread_vars_.push_back(name); + ++param_total; + if (var->Persistable()) { + if (unpersist_vars_.find(name) != unpersist_vars_.end()) { + // unpersist vars(include other thread var and other device var) + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); + // set dims + auto dims = phi::make_ddim(var->GetShape()); + auto var_dtype = + paddle::framework::TransToPhiDataType(var->GetDataType()); + ptr->GetMutable()->Resize(dims).set_type(var_dtype); + ++unpersist_num; + ++persistable_num; + total_persistable_len += ptr->GetMutable()->numel(); + continue; + } + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); + ++persistable_num; + total_persistable_len += len; + real_persist_len += len; + ++real_persist_num; + // convert one device to other device c_broadcast param + if (persist_param_vars_.find(name) != persist_param_vars_.end()) { + // same device + if (place_ == root_tensor->place()) { + ++share_var_num; + share_persistable_len += len; + continue; + } + + auto src_place = root_tensor->place(); + auto holder = root_tensor->MoveMemoryHolder(); + auto dst_ptr = root_tensor->mutable_data( + place_, root_tensor->dtype(), holder->size()); + + #if defined(PADDLE_WITH_CUDA) + auto stream = static_cast(dev_ctx_)->stream(); + memory::Copy( + place_, dst_ptr, src_place, holder->ptr(), holder->size(), stream); + CHECK(platform::is_gpu_place(root_tensor->place())); + #elif defined(PADDLE_WITH_XPU) + // XPUStream stream = 
static_cast(dev_ctx_) + // ->x_context() + // ->xpu_stream; + memory::Copy( + place_, dst_ptr, src_place, holder->ptr(), holder->size()); + CHECK(platform::is_xpu_place(root_tensor->place())); + #endif + + ++persist_reset; + continue; + } + + LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); + if (sync_mode_ > 0) { + if (CheckNeedParam(var)) { + auto dim = root_tensor->dims(); + param_sync_.share(gpu_tensor, len).Resize(dim); + skip_vars_.push_back(name); + } + } + // data norm copy and learning rate + if (!gpu_tensor->initialized() && place_ == root_tensor->place()) { + auto dim = root_tensor->dims(); + gpu_tensor->ShareDataWith(*root_tensor).Resize(dim); ++share_var_num; share_persistable_len += len; } else { - TensorCopy(*static_cast(&root_tensor), + TensorCopy(*static_cast(root_tensor), place_, static_cast(gpu_tensor)); + ++copy_persist_num; + // device 0 need sync datanorm and learning rate to root scope + if (device_id_ == 0) { + need_copy_vars_.push_back(name); + skip_vars_.push_back(name); + } } + } else { + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); } } if (sync_mode_ > 0) { - CHECK(offset <= (param_sync_.numel() - pad_len)); - } else if (dense_table_) { - VLOG(3) << "device[" << device_id_ - << "]CreateDeviceResource param_async_ offset:" << offset - << " grad_offset: " << grad_offset << " var_num: " << var_num - << " grad_var_num: " << grad_var_num; - CHECK(offset <= param_async_.numel()); - CHECK(grad_offset <= grad_async_.numel()); - } - if (share_var_num > 0) { - VLOG(0) << "device[" << device_id_ << "] persistable total num [" - << persistable_num << "," << total_persistable_len << "," - << total_persistable_len / 262144.0 - << "MB], share persistable num [" << share_var_num << "," - << share_persistable_len << "," << share_persistable_len / 262144.0 - << "MB]"; + CHECK(param_sync_.offset_ <= (param_sync_.numel() - pad_len)); } - if (FLAGS_padbox_enable_gc) { - // add op gc vars - unused_vars_ = GetUnusedVars2(block, ops_, skip_vars_); - // for (auto &var : unused_vars_) { - // VLOG(0) << "op name=" << var.first->Type() << ", gc names: " << - // paddle::string::join_strings(var.second, ","); - // } - if (device_id_ == 0) { - VLOG(0) << "total op count=" << ops_.size() - << ", skip vars count=" << skip_vars_.size() - << ", unused vars op count=" << unused_vars_.size(); + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=[total:" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", real:" << real_persist_num << "(" << real_persist_len / 262144.0 + << "MB)" + << ", share:" << share_var_num << "(" + << share_persistable_len / 262144.0 << "MB)" + << ", reset:" << persist_reset << ", unpersist:" << unpersist_num + << ", copy:" << copy_persist_num << "], delete=" << delete_vars_num; +} +void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { + BuildShardingDepends(main_prog); + if (dense_table_) { + // async + CreateThreadScopeForAsync(main_prog); + } else if (sharding_mode_) { + // sharding mode + CreateThreadScopeForSharding(main_prog); + } else { + // normal + CreateThreadScopeForNorm(main_prog); + } + CreateThreadOperators(main_prog); + + // debug str + if (FLAGS_enable_dump_main_program) { + std::ostringstream str_os; + for (auto& op : ops_) { + str_os << op->DebugStringEx(thread_scope_); + // add gc + auto it = unused_vars_.find(op.get()); + if (it != unused_vars_.end()) { + str_os << ", gc names: ["; + for (auto& name : it->second) { + 
str_os << name << ","; + } + str_os << "]"; + } + str_os << "\n"; } + auto box_ptr = BoxWrapper::GetInstance(); + char filename[512] = {0}; + snprintf(filename, + sizeof(filename), + "./device_%d_ops_%d.txt", + thread_id_, + box_ptr->Phase()); + WriteToFile(filename, str_os.str()); } } void BoxPSWorker::SyncParam(void) { @@ -654,7 +1202,7 @@ void BoxPSWorker::SyncParam(void) { box_ptr->DenseNcclTimer(device_id_, false, 0x03); #if defined(PADDLE_WITH_CUDA) - auto comm = platform::NCCLCommContext::Instance().Get(0, device_id_); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id_, device_id_); auto stream = static_cast(dev_ctx_)->stream(); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU) @@ -692,7 +1240,7 @@ void BoxPSWorker::SyncParam(void) { sendbuff, sendbuff, numel, ncclFloat32, ncclSum, comm->comm(), stream)); } const float scale = 1.0 / (device_num_ * node_size_); - TensorScaleValue(place_, param_sync_, ¶m_sync_, scale); + TensorScaleValue(place_, param_sync_.tensor(), ¶m_sync_.tensor(), scale); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU) @@ -707,7 +1255,7 @@ void BoxPSWorker::SyncParam(void) { BKCL_SUCCESS, platform::errors::PreconditionNotMet("BKCL all reduce failed")); const float scale = 1.0 / (device_num_ * node_size_); - TensorScaleValue(place_, param_sync_, ¶m_sync_, scale); + TensorScaleValue(place_, param_sync_.tensor(), ¶m_sync_.tensor(), scale); PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(stream)); #endif @@ -732,7 +1280,6 @@ inline void AddAucMonitor(const Scope* scope, const platform::Place& place) { metric_msg->add_data(scope, place); } } - void BoxPSWorker::TrainFiles() { VLOG(3) << "begin gpubox_worker TrainFiles"; platform::Timer timer; @@ -756,20 +1303,25 @@ void BoxPSWorker::TrainFiles() { VLOG(2) << "[" << device_id_ << "]begin running ops, batch size:" << batch_size << ", batch id=" << step; - if (dense_table_) { - dense_table_->PullDense(place_, ¶m_async_); + dense_table_->PullDense(place_, ¶m_async_.tensor()); } - for (auto& op : ops_) { + if (FLAGS_padbox_enable_print_op_debug) { + VLOG(0) << "thread id=" << thread_id_ << ", " + << op->DebugStringEx(thread_scope_); + } + // add stream sync + if (sync_points_.find(op.get()) != sync_points_.end()) { + dev_ctx_->Wait(); + } op->Run(*thread_scope_, place_); if (gc) { DeleteUnusedTensors(*thread_scope_, op.get(), unused_vars_, gc.get()); } } - if (dense_table_) { - dense_table_->PushDense(place_, &grad_async_); + dense_table_->PushDense(place_, &grad_async_.tensor()); } else if (sync_mode_ > 0) { if (step > param_sync_step_) { step = 0; @@ -797,6 +1349,12 @@ void BoxPSWorker::TrainFiles() { thread_scope_->DropKids(); } ++step; + // std::stringstream ss; + // ss << "Malloc Cnt: "; + // for (int i = 0; i < 8; ++i) { + // ss << "dev: " << i << " malloc times: "<< platform::get_malloc_cnt(i) << " "; + // } + // VLOG(0) << ss.str(); } // sync param step if (sync_mode_ > 0) { @@ -847,7 +1405,6 @@ void BoxPSWorker::TrainFilesWithProfiler() { outer_timer.Start(); while (true) { main_timer.Resume(); - reader_timer.Resume(); #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("PackBatchTask", dev_ctx_->Wait()); @@ -880,9 +1437,11 @@ void BoxPSWorker::TrainFilesWithProfiler() { dev_ctx_->Wait(); timeline.Pause(); op_total_time[op_id++] += timeline.ElapsedUS(); + #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) 
|| defined(PADDLE_WITH_XPU)) RUNTIME_TRACE_SCOPE_END((op->Type()+" run").c_str(),); #endif + if (gc) { DeleteUnusedTensors(*thread_scope_, op.get(), unused_vars_, gc.get()); } @@ -909,12 +1468,12 @@ void BoxPSWorker::TrainFilesWithProfiler() { if (need_dump_field_) { dump_timer.Resume(); - DumpFieldBoxPS(*thread_scope_, dump_mode_, dump_interval_); + DumpField(*thread_scope_, dump_mode_, dump_interval_); dump_timer.Pause(); } - if (need_dump_param_ && device_id_ == 0) { + if (need_dump_param_ && (sharding_mode_ || device_id_ == 0)) { dump_timer.Resume(); - DumpParamBoxPS(*thread_scope_, step_cnt); + DumpParam(*thread_scope_, step_cnt); dump_timer.Pause(); } if (gc) { @@ -954,6 +1513,314 @@ void BoxPSWorker::TrainFilesWithProfiler() { auto box_ptr = BoxWrapper::GetInstance(); box_ptr->PrintSyncTimer(device_id_, outer_timer.ElapsedSec()); } + +//================================== paddlebox dump +//============================================ +template +inline void format_string_append( + std::string* str, + const char* fmt, + ARGS&&... args) { // use VA_ARGS may be better ? + int len = snprintf(NULL, 0, fmt, args...); + PADDLE_ENFORCE(len >= 0, "format args length error"); + size_t oldlen = str->length(); + str->resize(oldlen + len + 1); + PADDLE_ENFORCE(snprintf(&(*str)[oldlen], (size_t)len + 1, fmt, args...) == + len); + str->resize(oldlen + len); +} + +static const size_t max_fmt_buff_size = 40; + +inline void GetLodBound(const LoD& lod, + const int64_t& dim, + const int& index, + std::pair* bound) { + if (lod.size() != 0) { + bound->first = lod[0][index] * dim; + bound->second = lod[0][index + 1] * dim; + } else { + bound->first = index * dim; + bound->second = (index + 1) * dim; + } +} + +template +inline void PrintLodTensorFmtType(const Tensor* tensor, + const int64_t& start, + const int64_t& end, + const char* fmt, + std::string* str) { + int64_t num = end - start; + if (num <= 0) { + return; + } + + size_t oldlen = str->length(); + // resize string + str->resize(oldlen + num * max_fmt_buff_size); + + const C* ptr = reinterpret_cast(tensor->data()); + for (int64_t i = start; i < end; ++i) { + int ret = snprintf(&(*str)[oldlen], max_fmt_buff_size, fmt, ptr[i]); + PADDLE_ENFORCE(ret > 0, "args or buff size error"); + oldlen = oldlen + ret; + } + // resize real string + str->resize(oldlen); +} +inline void PrintLodTensor(const Tensor* tensor, + const int64_t& start, + const int64_t& end, + std::string* out) { + auto dtype = framework::TransToProtoVarType(tensor->dtype()); + if (dtype == proto::VarType::FP32) { + PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); + } else if (dtype == proto::VarType::INT64) { + PrintLodTensorFmtType(tensor, start, end, ":%lu", out); + } else if (dtype == proto::VarType::FP64) { + PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); + } else if (dtype == proto::VarType::INT32) { + PrintLodTensorFmtType(tensor, start, end, ":%d", out); + } else if (dtype == proto::VarType::INT16) { + PrintLodTensorFmtType(tensor, start, end, ":%d", out); + } else { + out->append("unsupported type"); + } +} +inline void GetTensorBound(const LoDTensor& tensor, + int index, + std::pair* bound) { + auto& dims = tensor.dims(); + if (tensor.lod().size() != 0) { + auto& lod = tensor.lod()[0]; + bound->first = lod[index] * dims[1]; + bound->second = lod[index + 1] * dims[1]; + } else { + bound->first = index * dims[1]; + bound->second = (index + 1) * dims[1]; + } +} +void BoxPSWorker::DumpParam(const Scope& scope, const int batch_id) { + size_t field_num = 
dump_param_->size(); + + auto chan = writer_.channel(); + // thread process fields +#ifdef PADDLE_WITH_BOX_PS + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + parallel_run_dynamic( +#endif + field_num, + [this, &scope, batch_id, chan](const size_t& id) { + auto& name = (*dump_param_)[id]; + Variable* var = scope.FindVar(name); + if (var == nullptr || !var->IsInitialized()) { + return; + } + const LoDTensor& tensor = var->Get(); + if (!tensor.IsInitialized()) { + VLOG(0) << "Note: param[" << name + << "] is not initialized, so it was skipped."; + return; + } + framework::LoDTensor cpu_tensor; + if (!platform::is_cpu_place(tensor.place())) { + TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensor); + } else { + cpu_tensor.ShareDataWith(tensor); + } + + std::string s; + format_string_append(&s, "(%d,%s)", batch_id, name.c_str()); + int64_t len = cpu_tensor.numel(); + PrintLodTensor(&cpu_tensor, 0, len, &s); + // write to channel + chan->WriteMove(1, &s); + }); +} + +void BoxPSWorker::DumpField(const Scope& scope, + int dump_mode, + int dump_interval) { + // dump_mode: 0: no random, 1: random with insid hash, 2: random with random + // number + size_t batch_size = device_reader_->GetCurBatchSize(); + size_t field_num = dump_fields_->size(); + std::vector dims(field_num, 0); + std::vector cpu_tensors(field_num); + std::vector lods(field_num, nullptr); + +// #ifdef PADDLE_WITH_XPU_KP +std::set used_slot_set; +#if (defined PADDLE_WITH_XPU_KP) && (defined PADDLE_WITH_BOX_PS) + auto real_reader = dynamic_cast(device_reader_); + PADDLE_ENFORCE_NOT_NULL( + real_reader, platform::errors::NotFound("In XPU only support SlotPaddleBoxDataFeed")); + std::vector used_slot_names; + real_reader->GetUsedSlotIndex(nullptr, &used_slot_names); + for (auto & slot : used_slot_names) { + used_slot_set.insert(slot); + } +#endif + + // copy fields +#ifdef PADDLE_WITH_BOX_PS + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + parallel_run_dynamic( +#endif + field_num, + [this, &dims, &cpu_tensors, &lods, &scope, &used_slot_set, batch_size](const size_t& i) { + auto& field = (*dump_fields_)[i]; + Variable* var = scope.FindVar(field); + if (var == nullptr || !var->IsInitialized()) { + VLOG(3) << "Note: field[" << field + << "] cannot be find in scope, so it was skipped."; + return; + } + const LoDTensor& tensor = var->Get(); + if (!tensor.IsInitialized()) { + VLOG(3) << "Note: field[" << field + << "] is not initialized, so it was skipped."; + return; + } + if (!CheckValidOutput(&tensor, batch_size)) { + // VLOG(0) << "Note: field[" << field << "] cannot pass check, so + // it was " + // "skipped. 
Maybe the + // dimension is " "wrong "; + return; + } + dims[i] = tensor.dims()[1]; + lods[i] = (&tensor.lod()); + if (!platform::is_cpu_place(tensor.place())) { + TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensors[i]); + } else { + cpu_tensors[i].ShareDataWith(tensor); + } +#ifdef PADDLE_WITH_XPU_KP + auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); + if (used_slot_set.find(field) != used_slot_set.end() \ + && fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) { + auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype()); + if (t_dtype == proto::VarType::INT64) { + size_t numel = cpu_tensors[i].numel(); + int64_t * slot_data = cpu_tensors[i].data(); + for (size_t j = 0; j < numel; ++j) { + uint64_t fid = static_cast(slot_data[j]); + PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size()); + uint64_t sign = (*fid2sign_map_ptr)[fid]; + PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0), + platform::errors::PreconditionNotMet( + "sign can only be 0 when fid is 0, fid:%llu, sign:%llu", + (unsigned long long)(fid), (unsigned long long)sign)); + slot_data[j] = static_cast(sign); + } + } + } +#endif + }); + + // dump data + std::default_random_engine engine(0); + std::uniform_int_distribution dist(0U, INT_MAX); + // need dump check + auto need_dump_func = [this, &dist, &engine, dump_mode, dump_interval]( + const std::string& lineid) { + size_t r = 0; + if (dump_mode == 1) { + r = XXH64(lineid.data(), lineid.length(), 0); + } else if (dump_mode == 2) { + r = dist(engine); + } + if (r % dump_interval != 0) { + return false; + } + return true; + }; + + std::atomic line_cnt{0}; + std::atomic num_cnt{0}; + + auto chan = writer_.channel(); +#ifdef PADDLE_WITH_BOX_PS + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + // dump data + parallel_run_dynamic( +#endif + batch_size, + [this, + chan, + &dims, + &cpu_tensors, + &lods, + &need_dump_func, + field_num, + &line_cnt, + &num_cnt](const size_t& i) { + const std::string& lineid = device_reader_->GetLineId(i); + if (!need_dump_func(lineid)) { + return; + } + + ++line_cnt; + + thread_local std::pair bound; + std::string s; + size_t pos = 0; + if (FLAGS_lineid_have_extend_info) { + pos = lineid.find(" "); + if (pos != std::string::npos) { + s.append(&lineid[0], pos); + } else { + s.append(lineid); + } + } else { + s.append(lineid); + } + + size_t num = 0; + for (size_t k = 0; k < field_num; ++k) { + auto& lod = lods[k]; + if (lod == nullptr) { + continue; + } + auto& field = (*dump_fields_)[k]; + s.append("\t", 1); + GetLodBound(*lod, dims[k], i, &bound); + + num += (bound.second - bound.first); + if (FLAGS_dump_filed_same_as_aibox) { + size_t ext_pos = field.find("."); + if (ext_pos != std::string::npos) { + s.append(&field[0], ext_pos); + } else { + s.append(field); + } + } else { + format_string_append( + &s, "%s:%ld", field.c_str(), bound.second - bound.first); + } + PrintLodTensor(&cpu_tensors[k], bound.first, bound.second, &s); + } + num_cnt += num; + + // append extends tag info + if (pos > 0) { + s.append("\t", 1); + s.append(&lineid[pos + 1], lineid.length() - pos - 1); + } + // write to channel + chan->WriteMove(1, &s); + }); +} } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 9d26d285ebacc..d2bbd31bbf1e0 100755 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -3122,6 +3122,7 @@ void SlotPaddleBoxDataFeed::GetUsedSlotIndex( // get feasigns that 
FeedPass doesn't need const std::unordered_set& slot_name_omited_in_feedpass_ = boxps_ptr->GetOmitedSlot(); + if (used_slot_index != nullptr) { used_slot_index->clear(); } @@ -3694,9 +3695,11 @@ void SlotPaddleBoxDataFeed::BuildSlotBatchGPU(const int ins_num) { slot_total_num * sizeof(size_t), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); + #elif defined(PADDLE_WITH_XPU_KP) platform::MemcpySyncD2H(offsets.data(), d_slot_offsets, slot_total_num * sizeof(size_t), this->place_); #endif + copy_timer_.Pause(); data_timer_.Resume(); @@ -3762,6 +3765,7 @@ void SlotPaddleBoxDataFeed::BuildSlotBatchGPU(const int ins_num) { trans_timer_.Resume(); void** dest_gpu_p = reinterpret_cast(pack_->slot_buf_ptr()); + #if defined(PADDLE_WITH_CUDA) CUDA_CHECK(cudaMemcpyAsync(dest_gpu_p, h_tensor_ptrs.data(), use_slot_size_ * sizeof(void*), diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index d173148718ae8..28b40e00ad126 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -17,11 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/convert_utils.h" -#ifdef PADDLE_WITH_BOX_PS -#include "paddle/fluid/framework/fleet/box_wrapper.h" -#endif -DECLARE_bool(lineid_have_extend_info); -DECLARE_bool(dump_filed_same_as_aibox); namespace phi { class DenseTensor; @@ -247,20 +242,27 @@ bool CheckValidOutput(const LoDTensor* tensor, size_t batch_size) { void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) { std::ostringstream os; + int device_id = int(place_.GetDeviceId()); for (auto& param : *dump_param_) { os.str(""); Variable* var = scope.FindVar(param); - if (var == nullptr) { + if (var == nullptr || !var->IsInitialized()) { + continue; + } + if (!var->IsType()) { continue; } LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr || !tensor->IsInitialized()) { + continue; + } framework::LoDTensor cpu_tensor; if (platform::is_gpu_place(tensor->place())) { TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); tensor = &cpu_tensor; } int64_t len = tensor->numel(); - os << "(" << batch_id << "," << param << ")" + os << "(" << device_id << "," << batch_id << "," << param << ")" << PrintLodTensor(tensor, 0, len); writer_ << os.str(); } @@ -429,6 +431,11 @@ void DeviceWorker::DumpField(const Scope& scope, << "] cannot be find in scope, so it was skipped."; continue; } + if (!var->IsType()) { + VLOG(3) << "Note: field[" << field + << "] is not dense tensor, so it was skipped."; + continue; + } LoDTensor* tensor = var->GetMutable(); if (!tensor->IsInitialized()) { VLOG(0) << "Note: field[" << field @@ -467,286 +474,6 @@ void DeviceWorker::DumpField(const Scope& scope, } writer_.Flush(); } -template -void format_string_append(std::string* str, - const char* fmt, - ARGS&&... args) { // use VA_ARGS may be better ? - int len = snprintf(NULL, 0, fmt, args...); - PADDLE_ENFORCE(len >= 0, "format args length error"); - size_t oldlen = str->length(); - str->resize(oldlen + len + 1); - PADDLE_ENFORCE(snprintf(&(*str)[oldlen], (size_t)len + 1, fmt, args...) 
== - len); - str->resize(oldlen + len); -} -inline void GetLodBound(const LoD& lod, - const int64_t& dim, - const int& index, - std::pair* bound) { - if (lod.size() != 0) { - bound->first = lod[0][index] * dim; - bound->second = lod[0][index + 1] * dim; - } else { - bound->first = index * dim; - bound->second = (index + 1) * dim; - } -} -template -void PrintLodTensorFmtType(const Tensor* tensor, - const int64_t& start, - const int64_t& end, - const char* fmt, - std::string* out_val) { - if (start >= end) { - return; - } - const T* ptr = tensor->data(); - for (int64_t i = start; i < end; i++) { - format_string_append(out_val, fmt, static_cast(ptr[i])); - } -} -void PrintLodTensor(const Tensor* tensor, - const int64_t& start, - const int64_t& end, - std::string* out) { - auto dtype = framework::TransToProtoVarType(tensor->dtype()); - if (dtype == proto::VarType::FP32) { - PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); - } else if (dtype == proto::VarType::INT64) { - PrintLodTensorFmtType(tensor, start, end, ":%lu", out); - } else if (dtype == proto::VarType::FP64) { - PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); - } else if (dtype == proto::VarType::INT32) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else if (dtype == proto::VarType::INT16) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else if (dtype == proto::VarType::BOOL) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else { - out->append("unsupported type"); - } -} -void DeviceWorker::DumpParamBoxPS(const Scope& scope, const int batch_id) { - size_t field_num = dump_param_->size(); - - auto chan = writer_.channel(); - // thread process fields -#ifdef PADDLE_WITH_BOX_PS - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - parallel_run_dynamic( -#endif - field_num, - [this, &scope, batch_id, chan](const size_t& id) { - auto& name = (*dump_param_)[id]; - Variable* var = scope.FindVar(name); - if (var == nullptr || !var->IsInitialized()) { - return; - } - const LoDTensor& tensor = var->Get(); - if (!tensor.IsInitialized()) { - VLOG(0) << "Note: param[" << name - << "] is not initialized, so it was skipped."; - return; - } - framework::LoDTensor cpu_tensor; - if (!platform::is_cpu_place(tensor.place())) { - TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensor); - } else { - cpu_tensor.ShareDataWith(tensor); - } - - std::string s; - format_string_append(&s, "(%d,%s)", batch_id, name.c_str()); - int64_t len = cpu_tensor.numel(); - PrintLodTensor(&cpu_tensor, 0, len, &s); - // write to channel - chan->WriteMove(1, &s); - }); -} -void DeviceWorker::DumpFieldBoxPS( - const Scope& scope, - int dump_mode, - int dump_interval) { // dump_mode: 0: no random, - // 1: random with insid hash, - // 2: random with random - // number - size_t batch_size = device_reader_->GetCurBatchSize(); - size_t field_num = dump_fields_->size(); - std::vector dims(field_num, 0); - std::vector cpu_tensors(field_num); - std::vector lods(field_num, nullptr); - -// #ifdef PADDLE_WITH_XPU_KP -std::set used_slot_set; -#if (defined PADDLE_WITH_XPU_KP) && (defined PADDLE_WITH_BOX_PS) - auto real_reader = dynamic_cast(device_reader_); - PADDLE_ENFORCE_NOT_NULL( - real_reader, platform::errors::NotFound("In XPU only support SlotPaddleBoxDataFeed")); - std::vector used_slot_names; - real_reader->GetUsedSlotIndex(nullptr, &used_slot_names); - for (auto & slot : used_slot_names) { - used_slot_set.insert(slot); - } -#endif - - // copy fields 
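// A minimal, self-contained sketch (not part of the patch) of the
// dump-sampling rule described by the dump_mode comment above: mode 0 keeps
// every line, mode 1 hashes the line id, mode 2 draws a random number, and a
// line is dumped only when the value is divisible by dump_interval.
// std::hash stands in for the XXH64 call in the real code, and ShouldDumpLine
// is a hypothetical helper name.
#include <climits>
#include <cstddef>
#include <functional>
#include <random>
#include <string>

bool ShouldDumpLine(const std::string& lineid, int dump_mode,
                    int dump_interval, std::default_random_engine* engine) {
  std::size_t r = 0;  // dump_mode == 0 leaves r at 0, so every line passes
  if (dump_mode == 1) {
    r = std::hash<std::string>{}(lineid);  // deterministic per line id
  } else if (dump_mode == 2) {
    std::uniform_int_distribution<std::size_t> dist(0U, INT_MAX);
    r = dist(*engine);  // independent of the line id
  }
  return r % dump_interval == 0;  // dump_interval is assumed to be > 0
}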
-#ifdef PADDLE_WITH_BOX_PS - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - parallel_run_dynamic( -#endif - field_num, - [this, &dims, &cpu_tensors, &lods, &scope, &used_slot_set, batch_size](const size_t& i) { - auto& field = (*dump_fields_)[i]; - Variable* var = scope.FindVar(field); - if (var == nullptr || !var->IsInitialized()) { - VLOG(3) << "Note: field[" << field - << "] cannot be find in scope, so it was skipped."; - return; - } - const LoDTensor& tensor = var->Get(); - if (!tensor.IsInitialized()) { - VLOG(3) << "Note: field[" << field - << "] is not initialized, so it was skipped."; - return; - } - if (!CheckValidOutput(&tensor, batch_size)) { - // VLOG(0) << "Note: field[" << field << "] cannot pass check, so - // it was " - // "skipped. Maybe the - // dimension is " "wrong "; - return; - } - dims[i] = tensor.dims()[1]; - lods[i] = (&tensor.lod()); - if (!platform::is_cpu_place(tensor.place())) { - TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensors[i]); - } else { - cpu_tensors[i].ShareDataWith(tensor); - } -#ifdef PADDLE_WITH_XPU_KP - auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); - if (used_slot_set.find(field) != used_slot_set.end() \ - && fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) { - auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype()); - if (t_dtype == proto::VarType::INT64) { - size_t numel = cpu_tensors[i].numel(); - int64_t * slot_data = cpu_tensors[i].data(); - for (size_t j = 0; j < numel; ++j) { - uint64_t fid = static_cast(slot_data[j]); - PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size()); - uint64_t sign = (*fid2sign_map_ptr)[fid]; - PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0), - platform::errors::PreconditionNotMet( - "sign can only be 0 when fid is 0, fid:%llu, sign:%llu", - (unsigned long long)(fid), (unsigned long long)sign)); - slot_data[j] = static_cast(sign); - } - } - } -#endif - }); - - // dump data - std::default_random_engine engine(0); - std::uniform_int_distribution dist(0U, INT_MAX); - // need dump check - auto need_dump_func = [this, &dist, &engine, dump_mode, dump_interval]( - const std::string& lineid) { - size_t r = 0; - if (dump_mode == 1) { - r = XXH64(lineid.data(), lineid.length(), 0); - } else if (dump_mode == 2) { - r = dist(engine); - } - if (r % dump_interval != 0) { - return false; - } - return true; - }; - - std::atomic line_cnt{0}; - std::atomic num_cnt{0}; - - auto chan = writer_.channel(); -#ifdef PADDLE_WITH_BOX_PS - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - // dump data - parallel_run_dynamic( -#endif - batch_size, - [this, - chan, - &dims, - &cpu_tensors, - &lods, - &need_dump_func, - field_num, - &line_cnt, - &num_cnt](const size_t& i) { - const std::string& lineid = device_reader_->GetLineId(i); - if (!need_dump_func(lineid)) { - return; - } - - ++line_cnt; - - thread_local std::pair bound; - std::string s; - size_t pos = 0; - if (FLAGS_lineid_have_extend_info) { - pos = lineid.find(" "); - if (pos != std::string::npos) { - s.append(&lineid[0], pos); - } else { - s.append(lineid); - } - } else { - s.append(lineid); - } - - size_t num = 0; - for (size_t k = 0; k < field_num; ++k) { - auto& lod = lods[k]; - if (lod == nullptr) { - continue; - } - auto& field = (*dump_fields_)[k]; - s.append("\t", 1); - GetLodBound(*lod, dims[k], i, &bound); - - num += (bound.second - bound.first); - if (FLAGS_dump_filed_same_as_aibox) { - size_t ext_pos = field.find("."); - if 
(ext_pos != std::string::npos) { - s.append(&field[0], ext_pos); - } else { - s.append(field); - } - } else { - format_string_append( - &s, "%s:%ld", field.c_str(), bound.second - bound.first); - } - PrintLodTensor(&cpu_tensors[k], bound.first, bound.second, &s); - } - num_cnt += num; - - // append extends tag info - if (pos > 0) { - s.append("\t", 1); - s.append(&lineid[pos + 1], lineid.length() - pos - 1); - } - // write to channel - chan->WriteMove(1, &s); - }); -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index b46e3daeaa92f..9483ee84e9293 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -225,11 +225,7 @@ class DeviceWorker { virtual void DumpParam(const Scope& scope, const int batch_id); virtual void DumpField(const Scope& scope, int dump_mode, - int dump_interval = 10000); - virtual void DumpParamBoxPS(const Scope& scope, const int batch_id); - virtual void DumpFieldBoxPS(const Scope& scope, - int dump_mode, - int dump_interval = 10000); + int dump_interval = 10000); Scope* root_scope_ = nullptr; Scope* thread_scope_; @@ -851,15 +847,33 @@ class BoxPSAsynDenseTable { }; class BoxPSWorker : public DeviceWorker { + struct MemoryShareTensor { + int64_t offset_ = 0; + phi::DenseTensor* data_tensor_ = nullptr; + // init + void init(const std::string& name, + const platform::Place& place, + const int64_t& total_len, + Scope* root_scope); + // share + phi::DenseTensor& share(phi::DenseTensor* gpu_tensor, const size_t& len); + template + T* data() { + return data_tensor_->data(); + } + phi::DenseTensor& tensor() { return *data_tensor_; } + // numel + int64_t numel(void) { return data_tensor_->numel(); } + }; + public: BoxPSWorker() {} ~BoxPSWorker() override {} void Initialize(const TrainerDesc& desc) override; - + void Finalize(); void BindingDataFeedMemory() override {} void CreateDeviceResource(const ProgramDesc& main_prog) override; - void TrainFiles() override; void TrainFilesWithProfiler() override; @@ -881,23 +895,34 @@ class BoxPSWorker : public DeviceWorker { protected: int PackBatchTask(void); int CheckNeedParam(VarDesc* var); - int64_t AllocParamTensor(int64_t* pad_len); - int64_t AllocParamTensorAsync(); + int64_t AllocParamTensor(const ProgramDesc& program, int64_t* pad_len); + int64_t AllocParamTensorAsync(const ProgramDesc& program); void SyncParam(void); + void BuildShardingDepends(const ProgramDesc& program); + void CreateThreadScopeForAsync(const ProgramDesc& program); + void CreateThreadScopeForSharding(const ProgramDesc& program); + void CreateThreadScopeForNorm(const ProgramDesc& program); + void CreateThreadOperators(const ProgramDesc& program); + int IsParameter(const std::string& name, bool full_match); + + protected: + virtual void DumpParam(const Scope& scope, const int batch_id); + virtual void DumpField(const Scope& scope, + int dump_mode, + int dump_interval = 10000); protected: int device_id_; int thread_id_; - std::shared_ptr program_; std::vector> ops_; platform::DeviceContext* dev_ctx_ = nullptr; // dense async table BoxPSAsynDenseTable* dense_table_ = nullptr; - Tensor param_async_; - Tensor grad_async_; - Tensor param_sync_; + MemoryShareTensor param_async_; + MemoryShareTensor grad_async_; + MemoryShareTensor param_sync_; std::set async_param_name_; int param_sync_step_ = 0; int sync_mode_ = 0; @@ -908,6 +933,22 @@ class BoxPSWorker : public DeviceWorker { std::vector skip_vars_; std::unordered_map> 
unused_vars_; + + int nccl_rank_id_ = 0; + int ring_id_ = 0; + std::unordered_map params2rootid_; + std::multiset remove_vars_; + std::vector all_vars_; + std::vector thread_vars_; + std::multiset unpersist_vars_; + std::multiset persist_param_vars_; + std::multiset remove_ops_; + std::vector need_copy_vars_; + std::vector shard_dump_params_; + std::vector shard_dump_fields_; + bool sharding_mode_ = false; + // op extend + std::unordered_set sync_points_; }; #endif diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index b790a66b14d9c..e2a6fdf5c9545 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -61,16 +61,25 @@ bool OpInOutInfo::IsInArgBufferNeeded(const std::string &in_arg_name) const { return no_need_buffer_ins_.empty() || other_args_set_.count(in_arg_name) != 0; } -static bool VarCanBeDeleted(const std::string &name, - const BlockDesc &block, - const std::unordered_set &skip_vars) { +static bool VarCanBeDeleted( + const std::string &name, + const BlockDesc &block, + const std::unordered_set &skip_vars, + const std::multiset *unpersist_vars = nullptr) { if (skip_vars.count(name) != 0) { return false; } auto *var_desc = block.FindVar(name); if (var_desc == nullptr || var_desc->Persistable()) { - return false; + if (unpersist_vars != nullptr) { + // unpersist vars + if (unpersist_vars->find(name) == unpersist_vars->end()) { + return false; + } + } else { + return false; + } } auto type = var_desc->Proto()->type().type(); @@ -79,15 +88,19 @@ static bool VarCanBeDeleted(const std::string &name, type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY; } - std::unordered_map> GetUnusedVars(const BlockDesc &block, const std::vector> &ops, - const std::vector &skip_var_list) { + const std::vector &skip_var_list, + const std::multiset *unpersist_vars) { std::unordered_set skip_vars(skip_var_list.begin(), skip_var_list.end()); std::unordered_map var_op_idx_map; + std::unordered_map old_to_new; + std::unordered_map new_to_old; + + bool is_sharding_mode = (unpersist_vars != nullptr && !unpersist_vars->empty()); for (size_t i = 0; i < ops.size(); ++i) { auto *op = ops[i].get(); @@ -95,9 +108,27 @@ GetUnusedVars(const BlockDesc &block, OpInOutInfo info; for (auto &name_pair : op->Inputs()) { for (auto &name : name_pair.second) { - if (!VarCanBeDeleted(name, block, skip_vars)) { + if (!VarCanBeDeleted(name, block, skip_vars, unpersist_vars)) { continue; } + bool is_unpersist_var = false; + if (is_sharding_mode) { + if (unpersist_vars->find(name) != unpersist_vars->end()) { + is_unpersist_var = true; + // c_broadcast + if (op->Type() == "c_broadcast") { + auto it = old_to_new.find(name); + if (it == old_to_new.end()) { + old_to_new[name] = name; + new_to_old[name] = name; + } else { + std::string new_name = it->second + "_"; + old_to_new[name] = new_name; + new_to_old[new_name] = name; + } + } + } + } // var can be gc-ed if (!info.IsBuilt()) { @@ -106,7 +137,11 @@ GetUnusedVars(const BlockDesc &block, if (info.IsInArgBufferNeeded(name)) { // Update the last living op of variable to current op - var_op_idx_map[name] = i; + if (is_unpersist_var && old_to_new.count(name) > 0) { + var_op_idx_map[old_to_new[name]] = i; + } else { + var_op_idx_map[name] = i; + } } else { VLOG(10) << "Skip reference count computing of variable " << name_pair.first << "(" << name << ") in Operator " @@ -114,12 +149,15 @@ GetUnusedVars(const BlockDesc &block, } } } - for (auto 
&name_pair : op->Outputs()) { for (auto &name : name_pair.second) { - if (VarCanBeDeleted(name, block, skip_vars)) { + if (VarCanBeDeleted(name, block, skip_vars, unpersist_vars)) { // Update the last living op of variable to current op - var_op_idx_map[name] = i; + if (is_sharding_mode && old_to_new.count(name) > 0) { + var_op_idx_map[old_to_new[name]] = i; + } else { + var_op_idx_map[name] = i; + } } } } @@ -129,7 +167,11 @@ GetUnusedVars(const BlockDesc &block, for (auto &name_op_idx_pair : var_op_idx_map) { auto &name = name_op_idx_pair.first; size_t op_idx = name_op_idx_pair.second; - result[ops[op_idx].get()].emplace_back(name); + if (is_sharding_mode && new_to_old.count(name) > 0) { + result[ops[op_idx].get()].emplace_back(new_to_old[name]); + } else { + result[ops[op_idx].get()].emplace_back(name); + } } return result; } diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index f56505dced85b..b772af9110726 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -46,12 +46,11 @@ struct OpInOutInfo { std::unordered_set other_args_set_; bool is_built_{false}; }; - std::unordered_map> GetUnusedVars(const BlockDesc &block, const std::vector> &ops, - const std::vector &skip_vars); - + const std::vector &skip_vars, + const std::multiset *unpersist_vars = nullptr); // Collect unused tensors void DeleteUnusedTensors(const Scope &scope, const std::vector &delete_vars, diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 7386030ae217f..5d8f7876f28a8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -1115,8 +1115,8 @@ const std::vector BoxWrapper::GetNanInfMetricMsg( std::vector metric_return_values_(4, 0.0); auto* naninf_cal_ = iter->second->GetCalculator(); naninf_cal_->computeNanInfMsg(); - metric_return_values_[0] = naninf_cal_->nan_rate(); - metric_return_values_[1] = naninf_cal_->inf_rate(); + metric_return_values_[0] = naninf_cal_->nan_cnt(); + metric_return_values_[1] = naninf_cal_->inf_cnt(); metric_return_values_[2] = naninf_cal_->nan_inf_rate(); metric_return_values_[3] = naninf_cal_->size(); naninf_cal_->reset_nan_inf(); diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 40ebdecc78ca4..cbbdc0c4d4233 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -47,6 +47,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" + #include "paddle/fluid/framework/fleet/metrics.h" #include "paddle/fluid/framework/fleet/box_wrapper_kernel.h" @@ -60,6 +61,7 @@ limitations under the License. 
*/ #include #include #endif + #define BUF_SIZE 1024 * 1024 DECLARE_bool(padbox_auc_runner_mode); @@ -68,7 +70,9 @@ DECLARE_int32(padbox_dataset_shuffle_thread_num); namespace paddle { namespace framework { + extern int make_day_id(const int &y, const int &m, const int &d); + #ifdef PADDLE_WITH_BOX_PS #define MAX_GPU_NUM 16 @@ -352,6 +356,11 @@ class MetricMsg { platform::errors::NotFound("Error: var %s is not found in scope.", varname.c_str())); auto& gpu_tensor = var->Get(); + PADDLE_ENFORCE_EQ( + gpu_tensor.IsInitialized(), + true, + platform::errors::InvalidArgument( + "Error: monitor var `%s` uninitialized Tensor.", varname.c_str())); *data = gpu_tensor.data(); *len = gpu_tensor.numel(); } @@ -365,6 +374,11 @@ class MetricMsg { platform::errors::NotFound("Error: var %s is not found in scope.", varname.c_str())); auto& gpu_tensor = var->Get(); + PADDLE_ENFORCE_EQ( + gpu_tensor.IsInitialized(), + true, + platform::errors::InvalidArgument( + "Error: monitor var `%s` uninitialized Tensor.", varname.c_str())); auto* gpu_data = gpu_tensor.data(); auto len = gpu_tensor.numel(); data->resize(len); @@ -506,6 +520,12 @@ class BoxWrapper { std::cout<<"start profile in BoxWrapper"<& feasgin_to_box); @@ -569,6 +589,7 @@ class BoxWrapper { const int batch_size, const int skip_offset, bool expand_only); + void PushSparseGradCaseGPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, @@ -578,6 +599,7 @@ class BoxWrapper { const int batch_size, const int skip_offset, bool expand_only); + void PushSparseGradCaseXPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, @@ -935,7 +957,8 @@ class BoxWrapper { for (auto& name : var_names) { auto it = std::find(skip_gc_vars_.begin(), skip_gc_vars_.end(), name); if (it != skip_gc_vars_.end()) { - return; + // return; + continue; } skip_gc_vars_.push_back(name); } @@ -1099,11 +1122,13 @@ class BoxWrapper { std::set slot_eval_set_; std::atomic dataset_id_{0}; std::atomic round_id_{0}; + #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) scalopus::TransportLoopbackFactory::Ptr factory; std::shared_ptr manager; scalopus::CatapultRecorder::Ptr catapult_recorder; #endif + // skip gc vars std::vector skip_gc_vars_; }; diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 0bd62606c950f..738d175632df8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -272,8 +272,10 @@ void BoxWrapper::PullSparseCaseCPU(const paddle::platform::Place& place, slot_lens[i + 1] = total_length; } dev.total_key_length = total_length; + uint64_t* total_keys = dev.keys_tensor.mutable_data( static_cast(total_length * 2) * sizeof(int64_t), place); + int* key2slot = dev.keys2slot.mutable_data( static_cast(total_length * 5) * sizeof(int), place); int* total_dims = @@ -1298,8 +1300,11 @@ void CheckPushValue( void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, - const std::vector& slot_lengths, const int hidden_size, - const int expand_embed_dim, const int batch_size, const int skip_offset, + const std::vector& slot_lengths, + const int hidden_size, + const int expand_embed_dim, + const int batch_size, + const int skip_offset, bool expand_only) { #ifdef PADDLE_WITH_XPU_KP int device_id = place.GetDeviceId(); @@ -1516,6 +1521,7 @@ void BoxWrapper::PushSparseGradCase( const 
int batch_size, const int skip_offset, bool expand_only) { + if (platform::is_cpu_place(place)) { PushSparseGradCaseCPU(place, keys, diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc index ae7111b4d28b5..036435b1eaa46 100644 --- a/paddle/fluid/framework/fleet/metrics.cc +++ b/paddle/fluid/framework/fleet/metrics.cc @@ -647,4 +647,4 @@ void BasicAucCalculator::computeNanInfMsg() { } // namespace framework } // namespace paddle -#endif +#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h index e7e227f222dfe..1f62429759e8d 100644 --- a/paddle/fluid/framework/fleet/metrics.h +++ b/paddle/fluid/framework/fleet/metrics.h @@ -124,6 +124,8 @@ class BasicAucCalculator { double mae() const { return _mae; } double nan_rate() const { return _nan_rate; } double inf_rate() const { return _inf_rate; } + double nan_cnt() const { return _nan_cnt; } + double inf_cnt() const { return _inf_cnt; } double nan_inf_rate() const { return _nan_inf_rate; } double actual_ctr() const { return _actual_ctr; } double predicted_ctr() const { return _predicted_ctr; } @@ -852,4 +854,4 @@ class Metric { }; } // namespace framework } // namespace paddle -#endif +#endif \ No newline at end of file diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 51aeed2e5d734..50524e11b46ff 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -34,6 +34,9 @@ enum class OpRole { kDist = 0x0008, // Tag all learning rate scheduler operators. kLRSched = 0x0010, + + // scale lr(for adam) + kScaleLr = 0x0012, kLoss = 0x0100, // The default value of op's role. This should be only used for unittests and diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9133489cb09c2..550997f3cdea6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -71,6 +71,9 @@ DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); +PADDLE_DEFINE_EXPORTED_bool(enable_check_input_var, + false, + "enable check input var"); namespace paddle { namespace framework { @@ -1795,7 +1798,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, os << "\n"; printf("%s", os.str().c_str()); } - PADDLE_ENFORCE(false, "ERROR: check INF and NAN: %s", + PADDLE_ENFORCE(false, + "ERROR: check INF and NAN: %s", DebugStringEx(&exec_scope).c_str()); } #else @@ -1960,7 +1964,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); - } else if (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) { + } else if (!paddle::platform::is_xpu_support_op(type_, + expected_kernel_key)) { VLOG(3) << "fluid XPU not support kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -2441,13 +2446,15 @@ void OperatorWithKernel::ParseInputDataType( } } if (t != nullptr) { - PADDLE_ENFORCE_EQ( - t->IsInitialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); + if (FLAGS_enable_check_input_var) { + PADDLE_ENFORCE_EQ( + t->IsInitialized(), + true, + platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " + "contains 
uninitialized Tensor.", + Type(), + name)); + } *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } diff --git a/paddle/fluid/framework/program_utils.cc b/paddle/fluid/framework/program_utils.cc index a32350a569db0..09468273fd780 100644 --- a/paddle/fluid/framework/program_utils.cc +++ b/paddle/fluid/framework/program_utils.cc @@ -186,6 +186,30 @@ void ProgramProcessor::AddDepToBlockOp(const BlockDesc &block) { } ProgramProcessor::ProgramProcessor() {} +// write to file +void WriteToFile(const std::string &file_path, const std::string &msg) { + FILE *fp = fopen(file_path.c_str(), "w"); + if (fp == NULL) { + LOG(WARNING) << "open write file path=" << file_path << " failed"; + return; + } + fwrite(msg.c_str(), 1, msg.length(), fp); + fclose(fp); +} +void DumpProgramDescFile(const std::string &name, const ProgramDesc &program) { + ProgramDesc *new_prog = const_cast(&program); + std::string print_str; + google::protobuf::TextFormat::Printer printer; + printer.SetUseShortRepeatedPrimitives(true); + printer.SetSingleLineMode(false); + const ::google::protobuf::Message *message = + reinterpret_cast(new_prog->Proto()); + printer.PrintToString(*message, &print_str); + + char filename[512] = {0}; + snprintf(filename, sizeof(filename), "./%s_%lu.proto", name.c_str(), time(0)); + WriteToFile(filename, print_str); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/program_utils.h b/paddle/fluid/framework/program_utils.h index 4a276e80112b7..a609181b9cf51 100644 --- a/paddle/fluid/framework/program_utils.h +++ b/paddle/fluid/framework/program_utils.h @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/program_desc.h" +#include + +#include +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { @@ -32,6 +35,21 @@ class ProgramProcessor { void AddDepToBlockOp(const BlockDesc &block); }; - +void WriteToFile(const std::string &file_path, const std::string &msg); +void DumpProgramDescFile(const std::string &name, const ProgramDesc &program); +template +void DumpV( + const V &v, + const char *path, + std::function f = + [](typename V::value_type it) -> std::string { return it; }) { + std::ostringstream str_os; + for (auto it : v) { + str_os << f(it) << std::endl; + } + std::ofstream ofs(path); + ofs << str_os.str(); + ofs.close(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index d930a64c533f7..c515646c3ef84 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -399,10 +399,9 @@ class BoxPSTrainer : public TrainerBase { void InitDumpEnv() override; virtual std::string GetDumpPath(int tid); virtual void DumpWork(int tid); - - protected: - void CopyParameters(const Scope& root_scope, int device_id); - void DumpParameters(void); + virtual void FinalizeDumpEnv(); + void RemoveOtherDeviceVars(const ProgramDesc& main_program, + Scope* root_scope); protected: int thread_num_; @@ -412,6 +411,7 @@ class BoxPSTrainer : public TrainerBase { // std::vector worker_threads_; std::vector> wait_futures_; std::vector readers_; + std::vector> dump_futures_; std::shared_ptr> param_need_sync_; std::vector persistable_vars_; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 907fd37e44205..4a8ded3e7acf0 100644 --- 
a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -23,6 +23,9 @@ limitations under the License. */ defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL) #define USE_DEVICE DECLARE_uint64(reallocate_gpu_memory_in_mb); +#elif defined(PADDLE_WITH_XPU) +#define USE_DEVICE +DECLARE_uint64(reallocate_xpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" @@ -55,8 +58,11 @@ BuddyAllocator::BuddyAllocator( }; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - init_allocate_size_func_ = &platform::GpuInitAllocSize; + init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_XPU) + init_allocate_size_func_ = &platform::XpuInitAllocSize; + re_allocate_size_func_ = &platform::XpuReallocSize; #elif defined(PADDLE_WITH_ASCEND_CL) init_allocate_size_func_ = &platform::NPUInitAllocSize; re_allocate_size_func_ = &platform::NPUReallocSize; @@ -97,7 +103,8 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { VLOG(10) << "alloc: " << unaligned_size << ", padding for desc: " << sizeof(MemoryBlock::Desc) << ", extra padding: " << extra_padding_size_ - << ", alignment: " << min_chunk_size_; + << ", alignment: " << min_chunk_size_ + << ", max_chunk_size: " << max_chunk_size_; // acquire the allocator lock std::lock_guard lock(mutex_); @@ -263,6 +270,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize( &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#elif defined(PADDLE_WITH_XPU) + allocate_bytes = DeviceAllocateSize(&platform::XpuInitAllocSize, &platform::XpuReallocSize, request_bytes); #endif #endif @@ -357,7 +366,12 @@ size_t BuddyAllocator::DeviceAllocateSize( } else { // Compute the re-allocation size, we store the re-allocation size when // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. 
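// Simplified sketch of the chunk-sizing policy touched in this hunk, under a
// stripped-down allocator state: the first refill uses an initial size, later
// refills reuse a cached re-allocation size unless the MB flag is 0 (which
// forces a recompute), and a larger request always wins. RefillPolicy and its
// callbacks are illustrative stand-ins for the BuddyAllocator members and
// platform::XpuInitAllocSize / platform::XpuReallocSize.
#include <algorithm>
#include <cstddef>
#include <functional>

struct RefillPolicy {
  std::function<std::size_t()> init_size;     // initial pool size in bytes
  std::function<std::size_t()> realloc_size;  // fraction/flag based refill size
  std::size_t total_allocated = 0;            // bytes already taken from device
  std::size_t cached_realloc = 0;             // cached across refills

  std::size_t NextChunkBytes(std::size_t request,
                             std::size_t flag_realloc_mb) {
    std::size_t chunk = 0;
    if (total_allocated == 0) {
      chunk = std::max(init_size(), request);
    } else {
      if (cached_realloc == 0 || flag_realloc_mb == 0) {
        cached_realloc = realloc_size();  // recompute when not pinned by flag
      }
      chunk = std::max(cached_realloc, request);
    }
    total_allocated += chunk;
    return chunk;
  }
};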
- if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { +#if defined(PADDLE_WITH_XPU) + auto flag_realloc_size = FLAGS_reallocate_xpu_memory_in_mb; +#else + auto flag_realloc_size = FLAGS_reallocate_gpu_memory_in_mb; +#endif + if (realloc_size_ == 0 || flag_realloc_size == 0ul) { realloc_size_ = re_allocate_size_func(); } allocate_bytes = std::max(realloc_size_, request_bytes); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0508bf211d832..fe8110219ca24 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -29,6 +29,14 @@ #include "paddle/phi/common/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" +#elif defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/xpu/xpu_info.h" + +DECLARE_bool(use_xpu_buddy_allocator); +DECLARE_double(fraction_of_xpu_memory_to_use); +DECLARE_uint64(initial_xpu_memory_in_mb); +DECLARE_uint64(reallocate_xpu_memory_in_mb); + #endif PADDLE_DEFINE_EXPORTED_bool( @@ -143,36 +151,132 @@ size_t Used(const platform::IPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// TODO@yaocheng05: Add xpu support GetXPUBuddyAllocator + +// For XPU +#if defined(PADDLE_WITH_XPU) +class XPUBuddyAllocatorList { + private: + XPUBuddyAllocatorList() : devices_(platform::GetXPUSelectedDevices()) { + auto xpu_num = devices_.size(); + allocators_.resize(xpu_num); + init_flags_.reserve(xpu_num); + for (size_t i = 0; i < xpu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static XPUBuddyAllocatorList *CreateNewInstance() { + return new XPUBuddyAllocatorList(); + } + + public: + static XPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int xpu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), xpu_id)); + PADDLE_ENFORCE_LT(pos, + devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), + pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetXPUDeviceId(devices_[pos]); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::XPUAllocator(devices_[pos])), + platform::XpuMinChunkSize(), + platform::XpuMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_xpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_xpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_xpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetXPUBuddyAllocator(int xpu_id) { + return XPUBuddyAllocatorList::Instance()->Get(xpu_id); +} +#endif + + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { -#ifdef PADDLE_WITH_XPU - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - void *p = nullptr; - platform::XPUDeviceGuard gurad(place.device); - int ret = xpu_malloc(reinterpret_cast(&p), size); - if (ret != XPU_SUCCESS) { - VLOG(10) << "xpu memory malloc(" << size << ") failed, try again"; - xpu_wait(); - ret = xpu_malloc(reinterpret_cast(&p), size); - } - PADDLE_ENFORCE_EQ( - ret, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], no enough memory", ret)); - if (FLAGS_init_allocated_mem) { - PADDLE_THROW(platform::errors::Unimplemented( - "xpu memory FLAGS_init_allocated_mem is not implemented.")); +#ifdef PADDLE_WITH_XPU + if (FLAGS_use_xpu_buddy_allocator) { + auto *buddy_allocator = GetXPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::XPUMLHandler handler; + auto re = handler.getMemoryUsageTuple(place.device); + + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(std::get<2>(re)), + string::HumanReadableSize(std::get<0>(re)), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(std::get<1>(re)))); + } + return ptr; + } else { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void *p = nullptr; + + platform::XPUDeviceGuard gurad(place.device); + // platform::inc_malloc_cnt(place.device); + int ret = xpu_malloc(reinterpret_cast(&p), size); + if (ret != XPU_SUCCESS) { + VLOG(10) << "xpu memory malloc(" << size << ") failed, try again"; + xpu_wait(); + // platform::inc_malloc_cnt(place.device); + ret = xpu_malloc(reinterpret_cast(&p), size); + } + PADDLE_ENFORCE_EQ( + ret, + XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], no enough memory", ret)); + if (FLAGS_init_allocated_mem) { + PADDLE_THROW(platform::errors::Unimplemented( + "xpu memory FLAGS_init_allocated_mem is not implemented.")); + } + VLOG(10) << " pointer=" << p; + return p; } - VLOG(10) << " pointer=" << p; - return p; #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); - return nullptr; #endif + return nullptr; } template <> @@ -182,9 +286,12 @@ void Free(const platform::XPUPlace &place, #ifdef PADDLE_WITH_XPU VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - - platform::XPUDeviceGuard gurad(place.device); - xpu_free(p); + if (FLAGS_use_xpu_buddy_allocator) { + GetXPUBuddyAllocator(place.device)->Free(p); + } else { + platform::XPUDeviceGuard gurad(place.device); + xpu_free(p); + } #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc 
b/paddle/fluid/memory/allocation/system_allocator.cc index fcfece978cb7f..91a6e3ffd4a62 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -39,6 +39,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#if defined(PADDLE_WITH_XPU) +DECLARE_double(fraction_of_xpu_memory_to_use); +#endif #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" @@ -289,6 +292,82 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#if defined(PADDLE_WITH_XPU) +void* XPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + // platform::inc_malloc_cnt(xpu_id_); + auto result = platform::RecordedXpuMalloc(&p, size, xpu_id_); + + if (result == XPU_SUCCESS) { + *index = 0; + xpu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedXpuMemGetInfo( + &avail, &total, &actual_avail, &actual_total, xpu_id_); + size_t allocated = total - avail; + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, + limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU %d. " + "Cannot allocate %s memory on XPU %d, %s memory has been allocated and " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. 
If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + xpu_id_, + string::HumanReadableSize(size), + xpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + xpu_id_, + FLAGS_fraction_of_xpu_memory_to_use, + err_msg)); + return nullptr; + } +} + +void XPUAllocator::Free(void* p, size_t size, size_t index) { + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(xpu_alloc_size_, + size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, + xpu_alloc_size_)); + xpu_alloc_size_ -= size; + + platform::RecordedXpuFree(p, size, xpu_id_); +} + +bool XPUAllocator::UseGpu() const { return true; } + +#endif + + #ifdef PADDLE_WITH_ASCEND_CL void* NPUAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 18c2e278f99c5..8d84a130a1b23 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -68,6 +68,21 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#if defined(PADDLE_WITH_XPU) +class XPUAllocator : public SystemAllocator { + public: + explicit XPUAllocator(int xpu_id) : xpu_id_(xpu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t xpu_alloc_size_ = 0; + int xpu_id_; +}; +#endif + #ifdef PADDLE_WITH_ASCEND_CL class NPUAllocator : public SystemAllocator { diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index 9ee4bad1d73b7..7cc1844393b03 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -44,6 +44,61 @@ class BatchFCOp : public framework::OperatorWithKernel { auto w_dims = ctx->GetInputDim("W"); int batchcount = ctx->Attrs().Get("batchcount"); + int transpose_weight = ctx->Attrs().Get("transpose_weight"); + + if (transpose_weight) { + // Input_dim: [batch_count, ?, in_dim] + // W_dim: [in_dim, batch_count * out_dim] + // Bias_dim: [1, batch_count * out_dim] + // Out_dim: [batch_count, ?, out_dim] + PADDLE_ENFORCE_GT( + batchcount, + 0, + platform::errors::PreconditionNotMet( + "with transpose weight, batchcount should > 0")); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 2, + platform::errors::InvalidArgument( + "W of BatchFCOp should have 2D.")); + + int out_dim = w_dims[1] / batchcount; + PADDLE_ENFORCE_EQ( + input_dims.size(), + 3, + platform::errors::InvalidArgument( + "Input of BatchFCOp should have 3D.")); + PADDLE_ENFORCE_EQ( + input_dims[2], + w_dims[0], + platform::errors::InvalidArgument( + "Input.dim[2] and w_dims[0] of BatchFCOp should be same.")); + PADDLE_ENFORCE_EQ( + input_dims[0], + batchcount, + platform::errors::InvalidArgument( + "Input.dim[0] and batchcount of BatchFCOp should be same.")); + PADDLE_ENFORCE_EQ( + input_dims[2], + w_dims[0], + platform::errors::InvalidArgument( + "Input.dim[2] and W.dim[1] of BatchFCOp should be same.")); + + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ( + bias_dims.size(), + 
2, + platform::errors::InvalidArgument("Bias of BatchFCOp should have 2D.")); + PADDLE_ENFORCE_EQ( + bias_dims[1], + w_dims[1], + platform::errors::InvalidArgument( + "Bias.dim[1] should be same as input.dim[2].")); + + ctx->SetOutputDim("Out", {input_dims[0], input_dims[1], out_dim}); + ctx->ShareLoD("Input", /*->*/ "Out"); + return; + } if (batchcount > 0) { int feature_dim = input_dims[1] / batchcount; PADDLE_ENFORCE_EQ(feature_dim, w_dims[0], @@ -139,6 +194,7 @@ class BatchFCOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Bias", "(Tensor) Input tensor of batch_fc_op operator."); AddOutput("Out", "Output tensor of batch_fc_op operator."); AddAttr("batchcount", "(int64_t) the batchcount").SetDefault(0); + AddAttr("transpose_weight", "(bool) the transpose_weight").SetDefault(false); AddComment(R"DOC( BatchFC Operator. Notice: It currently supports GPU device. diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index f9fac45ef6e5e..652eddb560099 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -171,11 +171,96 @@ void transpose_split_row(cudaStream_t stream, const unsigned int rown, stream>>>(rown, coln, num_block, source, dest); } +template +__global__ void transpose_weight_kernel(const T* source, T* dest, + const unsigned int rown, const unsigned int coln, const int64_t batch_count) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < rown && y < coln) { + int dst_coln = coln / batch_count; + int dst_x = x + y / dst_coln * rown; + int dst_y = y % dst_coln; + dest[dst_x * dst_coln + dst_y] = source[x * coln + y]; + } +} + +template +void transpose_weight_impl(cudaStream_t stream, const T* source, T* dest, + const unsigned int rown, const unsigned int coln, const int64_t batch_count) { + dim3 grid((rown + 15) / 16, (coln + 15) / 16); + dim3 block(16, 16); + transpose_weight_kernel<<>>(source, dest, rown, coln, batch_count); +} + template class BatchFCCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int batchcount = ctx.Attr("batchcount"); + auto transpose_weight = ctx.Attr("transpose_weight"); + if (transpose_weight) { + // Input_dim: [batch_count, ?, in_dim] + // W_dim: [in_dim, batch_count * out_dim] + // Bias_dim: [1, batch_count * out_dim] + // Out_dim: [batch_count, ?, out_dim] + auto* input = ctx.Input("Input"); + auto* w = ctx.Input("W"); + auto* bias = ctx.Input("Bias"); + auto* output = ctx.Output("Out"); + auto input_dims = input->dims(); + auto w_dims = w->dims(); + auto slot_pairs_num = input_dims[0]; + auto ins_num = input_dims[1]; + auto in_dim = input_dims[2]; + auto out_dim = w_dims[1] / batchcount; + + // get data ptr + const T* in_data = input->data(); + const T* w_data = w->data(); + const T* bias_data = bias->data(); + + output->Resize({slot_pairs_num, ins_num, out_dim}); + T* out_data = output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + + Tensor w_help; + w_help = + ctx.AllocateTmpTensor({batchcount, w_dims[0], w_dims[1] / batchcount}, dev_ctx); + T* w_help_data = w_help.data(); + + transpose_weight_impl(ctx.cuda_device_context().stream(), w_data, w_help_data, w_dims[0], w_dims[1], batchcount); + + CBLAS_TRANSPOSE transA = CblasNoTrans; + CBLAS_TRANSPOSE transB = CblasNoTrans; + + T alpha = 1; + T beta = 0; + int64_t strideA = ins_num * in_dim; + int64_t strideB = in_dim * out_dim; + + auto blas = 
phi::funcs::GetBlas(dev_ctx); + blas.BatchedGEMM(transA, + transB, + ins_num, + out_dim, + in_dim, + alpha, + in_data, + w_help_data, + beta, + out_data, + slot_pairs_num, + strideA, + strideB); + add_bias(ctx.cuda_device_context().stream(), + out_data, + slot_pairs_num, + ins_num, + out_dim, + bias_data); + return; + } if (batchcount > 0) { auto* input = ctx.Input("Input"); auto* w = ctx.Input("W"); diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index 8425dcb521ab6..4dc83c9717ae7 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -36,25 +36,33 @@ using framework::ConvSearchCache; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -// As the basic for SearchAlgorithm struct. -template -struct SearchAlgorithm {}; - // As the container of searchAlgorithm::Find() result. template struct SearchResult { SearchResult() {} + explicit SearchResult(AlgoT a) : algo(a) {} + explicit SearchResult(AlgoT a, float t, size_t size) + : algo(a), time(t), workspace_size(size) {} AlgoT algo = static_cast(0); float time = -1.f; size_t workspace_size = 0; + bool exhaustive_search = false; }; template static std::ostream& operator<<(std::ostream& out, const std::vector& v) { out << "["; - for (auto const& tmp : v) out << tmp << ","; + bool is_first = true; + for (auto const& tmp : v) { + if (is_first) { + out << tmp; + is_first = false; + } else { + out << ", " << tmp; + } + } out << "]"; return out; } @@ -76,28 +84,50 @@ struct ConvArgsBase { // dilations std::vector d; + // groups + int group; + + // data foramt + DataLayout data_layout; + ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w, const framework::Tensor* o, const std::vector s, const std::vector p, const std::vector d, - DataT dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} + DataT dtype, + int g, + DataLayout layout) + : x(x), + w(w), + o(o), + s(s), + p(p), + d(d), + cudnn_dtype(dtype), + group(g), + data_layout(layout) {} template - size_t GetCacheKey() const { + phi::autotune::ConvCacheKey Convert2ConvCacheKey() const { auto x_shape = phi::vectorize(x->dims()); auto w_shape = phi::vectorize(w->dims()); VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape - << ", strides=" << s << ", paddings=" << p << ", dilations=" << d; - return phi::autotune::ConvKey( + << ", strides=" << s << ", paddings=" << p << ", dilations=" << d + << ", data=" << paddle::experimental::CppTypeToDataType::Type() + << ", group=" << group + << ", data layout=" << static_cast(data_layout); + + return phi::autotune::ConvCacheKey( x_shape, w_shape, p, s, d, - paddle::experimental::CppTypeToDataType::Type()); + paddle::experimental::CppTypeToDataType::Type(), + group, + static_cast(data_layout)); } }; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 1b8d421d133f1..2fa1683833c33 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -146,79 +146,21 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -static void SetConvMathType(const phi::GPUContext& ctx, - cudnnDataType_t dtype, - const platform::ConvolutionDescriptor& cdesc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), 
CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#if CUDA_VERSION >= 11000 -#if CUDNN_VERSION_MIN(8, 1, 0) - } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); -#endif // CUDNN_VERSION_MIN(8, 1, 0) - } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; - } -#endif -} +template +struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching // modes, namely: deterministic, heuristic and exhaustive_search mode. // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvForward; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. - size_t key = args.GetCacheKey(); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvForward"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionFwdAlgo_t algo) { @@ -235,9 +177,10 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(static_cast(1)); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = GetWorkspaceSize(args, static_cast(1)); + return SearchResult(static_cast(1), -1.0, workspace_size); } // Heuristic search mode, calling the cudnnGetXxxAlgorithm. 
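// Illustrative sketch of the workspace-limited selection that
// ChooseAlgoByWorkspace performs: among the perf results reported by cuDNN,
// keep the fastest candidate whose workspace fits under the limit. PerfEntry
// and PickAlgoUnderLimit are hypothetical stand-ins for
// cudnnConvolutionFwdAlgoPerf_t and the real helper.
#include <cstddef>
#include <vector>

struct PerfEntry {
  int algo = 0;               // algorithm enum value
  float time = -1.f;          // negative time marks an unusable result
  std::size_t workspace = 0;  // bytes required by this algorithm
};

// Returns the index of the chosen entry, or -1 if nothing fits.
int PickAlgoUnderLimit(const std::vector<PerfEntry>& results,
                       std::size_t workspace_limit) {
  int best = -1;
  float best_time = 0.f;
  for (std::size_t i = 0; i < results.size(); ++i) {
    const PerfEntry& r = results[i];
    if (r.time < 0.f || r.workspace > workspace_limit) {
      continue;
    }
    if (best < 0 || r.time < best_time) {
      best = static_cast<int>(i);
      best_time = r.time;
    }
  }
  return best;
}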
@@ -266,6 +209,10 @@ struct SearchAlgorithm { if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 + VLOG(4) << GetPerfResultString("[Heuristic] FwdAlgo Perf result", + perf_results, + actual_perf_count, + workspace_size_limit); // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); @@ -298,6 +245,7 @@ struct SearchAlgorithm { workspace_size_limit, &(result.algo))); #endif + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -343,6 +291,7 @@ struct SearchAlgorithm { ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -380,49 +329,13 @@ struct SearchAlgorithm { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardData; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
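// Small sketch of the SearchResult change visible in this refactor: the
// deterministic path now also reports the workspace required by its fixed
// algorithm instead of leaving workspace_size at 0, so a complete
// (algo, workspace) pair can be cached. SearchResultSketch, DeterministicResult
// and query_workspace are illustrative names; the real code queries
// cudnnGetConvolution*WorkspaceSize.
#include <cstddef>

template <typename AlgoT>
struct SearchResultSketch {
  AlgoT algo{};
  float time = -1.f;              // unmeasured in deterministic mode
  std::size_t workspace_size = 0;
};

template <typename AlgoT, typename WorkspaceFn>
SearchResultSketch<AlgoT> DeterministicResult(AlgoT fixed_algo,
                                              WorkspaceFn query_workspace) {
  SearchResultSketch<AlgoT> result;
  result.algo = fixed_algo;
  result.workspace_size = query_workspace(fixed_algo);  // bytes for this algo
  return result;
}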
- size_t key = args.GetCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardData"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdDataAlgo_t algo) { @@ -439,9 +352,12 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = + GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + return SearchResult( + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size); } static SearchResult FindAlgoHeuristic(const ConvArgs& args, @@ -513,7 +429,7 @@ struct SearchAlgorithm { workspace_size_limit, &(result.algo))); #endif - + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -559,6 +475,7 @@ struct SearchAlgorithm { ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -594,50 +511,13 @@ struct SearchAlgorithm { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardFilter; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - platform::CUDAGraphCaptureModeGuard guard; - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
- size_t key = args.GetCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardFilter"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { @@ -655,9 +535,12 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = + GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + return SearchResult( + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size); } static SearchResult FindAlgoHeuristic(const ConvArgs& args, @@ -718,6 +601,7 @@ struct SearchAlgorithm { &(result.algo))); #endif + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -786,6 +670,7 @@ struct SearchAlgorithm { ChooseAlgo(perf_results, workspace_size_limit, &result); } + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -867,5 +752,103 @@ struct SearchAlgorithm { } }; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; + + template + static SearchResult Find(const phi::GPUContext& ctx, + const ConvArgs& args, + bool exhaustive_search, + bool deterministic, + bool enable_autotune = true) { + SearchResult result; + bool use_autotune = false; + auto dtype = platform::CudnnDataType::type; + SetConvMathType(ctx, dtype, args.cdesc); + + if (deterministic) { + result = SearchAlgorithmBase::FindAlgoDeterministic(args); + } else { + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, run heuristic (default) before + // auto-tune process, run exhaustive_search during mentioned process. + // Auto tune is only enabled between specified range. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + auto key = args.Convert2ConvCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( + SearchAlgorithmBase::kAlgoType); + bool find_in_cache = cache.Find(key); + if (find_in_cache) { + auto t = cache.Get(key); + result.algo = static_cast(t.algo); + result.workspace_size = t.workspace_size; + result.exhaustive_search = t.exhaustive_search; + } + if (!result.exhaustive_search) { + bool need_update_cache = false; + // In conv2d_tranpose, enable_autotune is set to false because some + // algorithm picked by exhaustive search method produce wrong result. + use_autotune = enable_autotune && + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + // Once autotune is enabled, the autotuned result can rewrite the + // previous result in cache found by heuristic method. 
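// Condensed sketch of the caching policy in the unified Find() above, assuming
// a plain std::map keyed by a string in place of phi::autotune::ConvCacheKey:
// a heuristic result is cached but may later be overwritten by an
// exhaustive/autotuned one, while an exhaustive entry is treated as final.
// CachedAlgo and FindWithCache are hypothetical names.
#include <cstddef>
#include <functional>
#include <map>
#include <string>

struct CachedAlgo {
  int algo = 0;
  std::size_t workspace_size = 0;
  bool exhaustive_search = false;  // true once an exhaustive result is stored
};

CachedAlgo FindWithCache(const std::string& key, bool exhaustive_or_autotune,
                         std::map<std::string, CachedAlgo>* cache,
                         const std::function<CachedAlgo()>& exhaustive_search,
                         const std::function<CachedAlgo()>& heuristic_search) {
  CachedAlgo result;
  auto it = cache->find(key);
  const bool found = (it != cache->end());
  if (found) {
    result = it->second;
  }
  if (!result.exhaustive_search) {   // exhaustive entries are never redone
    if (exhaustive_or_autotune) {
      result = exhaustive_search();
      result.exhaustive_search = true;
      (*cache)[key] = result;        // may overwrite a heuristic entry
    } else if (!found) {
      result = heuristic_search();   // cached so later calls skip the search
      (*cache)[key] = result;
    }
  }
  return result;
}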
+ result = + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + args, ctx); + need_update_cache = true; + } else if (!find_in_cache) { + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + need_update_cache = true; + } + if (need_update_cache) { + phi::autotune::ConvAutoTuneResult node( + static_cast(result.algo), + result.workspace_size, + exhaustive_search || use_autotune); + cache.Set(key, node); + } + } + } + VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() + << "] exhaustive_search=" << exhaustive_search + << ", use_autotune=" << use_autotune + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo + << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; + return result; + } + + static void SetConvMathType(const phi::GPUContext& ctx, + cudnnDataType_t dtype, + const platform::ConvolutionDescriptor& cdesc) { +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "Enable Tensor Core for FLOAT16"; +#if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + VLOG(5) << "Enable Tensor Core for BFLOAT16"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { + VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_FMA_MATH)); +#endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_DEFAULT_MATH)); + } +#endif + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cross_norm_hadamard_op.cu b/paddle/fluid/operators/cross_norm_hadamard_op.cu index df643de164ffe..4594421565770 100644 --- a/paddle/fluid/operators/cross_norm_hadamard_op.cu +++ b/paddle/fluid/operators/cross_norm_hadamard_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
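The SetConvMathType helper added above selects the cuDNN math mode from the device compute capability, the data type, and the TF32 flag. A minimal stand-alone restatement of that decision follows, assuming simplified enums in place of cudnnDataType_t / cudnnMathType_t and with no actual cudnnSetConvolutionMathType call; it is a sketch of the branch order only.

#include <cstdio>

enum class DType { kFloat32, kFloat16, kBFloat16 };
enum class MathMode { kDefault, kTensorOp, kFmaOnly };

// Mirrors the branches above: FP16 on SM70+ and BF16 on SM80+ get Tensor Core
// math; FP32 with TF32 disabled is pinned to FMA; everything else uses the
// default math mode.
MathMode PickConvMathMode(int compute_capability, DType dtype, bool allow_tf32) {
  if (compute_capability >= 70 && dtype == DType::kFloat16) return MathMode::kTensorOp;
  if (compute_capability >= 80 && dtype == DType::kBFloat16) return MathMode::kTensorOp;
  if (dtype == DType::kFloat32 && !allow_tf32) return MathMode::kFmaOnly;
  return MathMode::kDefault;
}

int main() {
  std::printf("%d\n", static_cast<int>(PickConvMathMode(80, DType::kBFloat16, true)));
  return 0;
}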
*/ -#include #include #include #include "paddle/fluid/framework/eigen.h" diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.cc b/paddle/fluid/operators/fused/fused_seq_tensor_op.cc new file mode 100644 index 0000000000000..7430d0d32ca37 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.cc @@ -0,0 +1,132 @@ +#include "paddle/fluid/operators/fused/fused_seq_tensor_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include + +namespace paddle { +namespace operators { + +class FusedSeqTensorOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasInput("ADInput"), "ADInput", "ADInput", "FusedSeqTensorOp"); + + OP_INOUT_CHECK(ctx->HasOutput("DINOut"), "DINOut", "DINOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("MaskOut"), "MaskOut", "MaskOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("SideInfoOut"), "SideInfoOut", "SideInfoOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("ADSlotSessionOut"), "ADSlotSessionOut", "ADSlotSessionOut", "FusedSeqTensorOp"); + + const framework::DDim input_dims = ctx->GetInputDim("Input"); + const framework::DDim ad_input_dims = ctx->GetInputDim("ADInput"); + + auto ad_slot_num = ctx->Attrs().Get("ad_slot_num"); + auto batch_count = ctx->Attrs().Get("batch_count"); + auto max_length = ctx->Attrs().Get("max_length"); + auto slot_num = ctx->Attrs().Get("slot_num"); + auto fea_emb_dim = ctx->Attrs().Get("fea_emb_dim"); + auto ad_slot_offset = ctx->Attrs().Get("ad_slot_offset"); + + int64_t one_ins_dim = batch_count * max_length * slot_num * fea_emb_dim; + PADDLE_ENFORCE_EQ( + input_dims[1], one_ins_dim, + platform::errors::InvalidArgument( + "input dims error, %ld != %ld", input_dims[1], one_ins_dim)); + + int64_t one_ins_ad_dim = batch_count * 1 * ad_slot_num * fea_emb_dim; + PADDLE_ENFORCE_EQ( + ad_input_dims[1], one_ins_ad_dim, + platform::errors::InvalidArgument( + "input dims error, %ld != %ld", ad_input_dims[1], one_ins_ad_dim)); + PADDLE_ENFORCE_LT( + ad_slot_num, slot_num, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] > slot_num [%ld]", ad_slot_num, slot_num)); + PADDLE_ENFORCE_GT( + ad_slot_num, 0, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] <= 0", ad_slot_num)); + PADDLE_ENFORCE_LE( + ad_slot_offset, slot_num - 1, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] > slot_num - 1 [%ld]", ad_slot_offset, slot_num)); + PADDLE_ENFORCE_GE( + ad_slot_offset, 0, + platform::errors::InvalidArgument( + "ad_slot_offset [%ld] < 0", ad_slot_offset)); + if (ad_slot_offset != 0) { + PADDLE_ENFORCE_EQ( + ad_slot_num + ad_slot_offset, slot_num, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] + ad_slot_offset [%ld] != slot_num [%ld]", ad_slot_num, ad_slot_offset, slot_num)); + } + + auto ins_num = input_dims[0]; + if (batch_count > 1) { + ctx->SetOutputDim("DINOut", {batch_count, ins_num * max_length, ad_slot_num * fea_emb_dim * 4}); + ctx->SetOutputDim("MaskOut", {batch_count, ins_num, max_length}); + ctx->SetOutputDim("SideInfoOut", {batch_count, ins_num * max_length, (slot_num - ad_slot_num) * fea_emb_dim}); + ctx->SetOutputDim("ADSlotSessionOut", {batch_count, ins_num * max_length, ad_slot_num, fea_emb_dim}); + } else { + ctx->SetOutputDim("DINOut", {ins_num, max_length, ad_slot_num * fea_emb_dim * 4}); + ctx->SetOutputDim("MaskOut", 
{ins_num, max_length}); + ctx->SetOutputDim("SideInfoOut", {ins_num, max_length, (slot_num - ad_slot_num) * fea_emb_dim}); + ctx->SetOutputDim("ADSlotSessionOut", {ins_num, max_length, ad_slot_num * fea_emb_dim}); + } + ctx->ShareLoD("Input", "DINOut"); + ctx->ShareLoD("Input", "MaskOut"); + ctx->ShareLoD("Input", "SideInfoOut"); + ctx->ShareLoD("Input", "ADSlotSessionOut"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class FusedSeqTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "The input tensors of operator."); + AddInput("ADInput", + "The input ad tensors of operator. "); + AddOutput("DINOut", + "DINOut"); + AddOutput("MaskOut", + "MaskOut"); + AddOutput("SideInfoOut", + "SideInfoOut"); + AddOutput("ADSlotSessionOut", + "ADSlotSessionOut"); + + AddAttr("batch_count", "(int, default 1)"); + AddAttr("max_length", "(int, default 1)"); + AddAttr("slot_num", "(int, default 1)"); + AddAttr("fea_emb_dim", "(int, default 1)"); + AddAttr("ad_slot_num", "(int, default 1)"); + AddAttr("ad_slot_offset", "(int, default 1)"); + + AddComment(R"DOC( +Fuse seq tensor. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(fused_seq_tensor, + ops::FusedSeqTensorOp, ops::FusedSeqTensorOpMaker); + +REGISTER_OP_CPU_KERNEL( + fused_seq_tensor, + ops::FusedSeqTensorCPUKernel); diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.cu b/paddle/fluid/operators/fused/fused_seq_tensor_op.cu new file mode 100644 index 0000000000000..8210cd43808c3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.cu @@ -0,0 +1,290 @@ +#include +#include +#include +#include "paddle/fluid/operators/fused/fused_seq_tensor_op.h" // don't remove this +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +template +__global__ void cal_ad_slot_session_kernel(const T* input, + const T* ad_input, + T* din_output, + T* ad_slot_session_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t ad_slot_num, + const size_t ad_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + const size_t one_slot_dim = max_length * fea_emb_dim; + const size_t one_seq_dim = slot_num * one_slot_dim; + const size_t ad_seq_dim = ad_slot_num * one_slot_dim; + + const size_t piece_of_ad_seq_dim = ad_slot_num * fea_emb_dim; + for (size_t idx = threadIdx.x; idx < piece_of_ad_seq_dim; idx += blockDim.x) { + size_t slot_idx = idx / fea_emb_dim + ad_slot_offset; + size_t out_slot_idx = idx / fea_emb_dim; + size_t fea_dim_idx = idx % fea_emb_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * one_seq_dim) + batch_idx * one_seq_dim + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t ad_fea_begin_idx = + ins_idx * (1 * batch_num * piece_of_ad_seq_dim) + batch_idx * piece_of_ad_seq_dim + + out_slot_idx * fea_emb_dim; + + const T input_val = input[input_fea_begin_idx + 
fea_dim_idx]; + const T ad_val = ad_input[ad_fea_begin_idx + fea_dim_idx]; + + size_t fea_concat_start_idx = + batch_idx * (ins_num * ad_seq_dim * 4) + ins_idx * (ad_seq_dim * 4) + + fea_idx * (piece_of_ad_seq_dim * 4) + out_slot_idx * fea_emb_dim; + + din_output[fea_concat_start_idx + fea_dim_idx] = input_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim] = ad_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim * 2] = input_val - ad_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim * 3] = input_val * ad_val; + + size_t ad_slot_session_out_start_idx = + batch_idx * (ins_num * ad_seq_dim) + ins_idx * ad_seq_dim + + fea_idx * piece_of_ad_seq_dim + out_slot_idx * fea_emb_dim; + ad_slot_session_output[ad_slot_session_out_start_idx + fea_dim_idx] = input_val; + } +} + +template +__global__ void cal_sideinfo_kernel(const T* input, + T* side_info_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t sideinfo_slot_num, + const size_t sideinfo_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + const size_t one_slot_dim = max_length * fea_emb_dim; + const size_t input_one_seq_dim = slot_num * one_slot_dim; + const size_t sideinfo_seq_dim = sideinfo_slot_num * one_slot_dim; + + const size_t piece_of_sideinfo_seq_dim = sideinfo_slot_num * fea_emb_dim; + for (size_t idx = threadIdx.x; idx < piece_of_sideinfo_seq_dim; idx += blockDim.x) { + size_t out_slot_idx = idx / fea_emb_dim; + size_t slot_idx = out_slot_idx + sideinfo_slot_offset; + size_t fea_dim_idx = idx % fea_emb_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * input_one_seq_dim) + batch_idx * input_one_seq_dim + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t fea_transpose_start_idx = + batch_idx * (ins_num * sideinfo_seq_dim) + ins_idx * sideinfo_seq_dim + + fea_idx * (sideinfo_slot_num * fea_emb_dim) + out_slot_idx * fea_emb_dim; + + side_info_output[fea_transpose_start_idx + fea_dim_idx] = input[input_fea_begin_idx + fea_dim_idx]; + } +} + +template +__global__ void cal_sideinfo_kernel_without_loop(const T* input, + T* side_info_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t sideinfo_slot_num, + const size_t sideinfo_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + size_t slot_idx = threadIdx.y + sideinfo_slot_offset; + size_t out_slot_idx = threadIdx.y; + size_t fea_dim_idx = threadIdx.x; + + const size_t one_slot_dim = max_length * fea_emb_dim; + size_t input_one_seq_dim = slot_num * one_slot_dim; + size_t out_one_seq_dim = sideinfo_slot_num * one_slot_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * input_one_seq_dim) + batch_idx * (input_one_seq_dim) + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t fea_transpose_start_idx = + batch_idx * (ins_num * out_one_seq_dim) + ins_idx * out_one_seq_dim + + fea_idx * (sideinfo_slot_num * fea_emb_dim) + out_slot_idx * fea_emb_dim; + + side_info_output[fea_transpose_start_idx + fea_dim_idx] = input[input_fea_begin_idx + fea_dim_idx]; +} + +template +__device__ void warpReduce(volatile T* cache, int tid) { + cache[tid] += cache[tid+32]; + cache[tid] += cache[tid+16]; + cache[tid] += cache[tid+8]; + cache[tid] += cache[tid+4]; + cache[tid] += 
cache[tid+2]; + cache[tid] += cache[tid+1]; +} + +#define THREAD_PER_BLOCK 128 +template +__global__ void reduce_sum_max_length(const T* input, + T* mask_output, + const size_t batch_count, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim) { + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + size_t data_len_per_block = slot_num * fea_emb_dim; + + __shared__ T sdata[THREAD_PER_BLOCK]; + //each thread loads one element from global memory to shared mem + size_t input_start_idx = ins_idx * (batch_count * slot_num * max_length * fea_emb_dim) + + batch_idx * (slot_num * max_length * fea_emb_dim); + + size_t tid = threadIdx.x; + // memset shared mem + sdata[tid] = 0; + for (size_t idx = tid; idx < data_len_per_block; idx += blockDim.x) { + size_t slot_idx = idx / fea_emb_dim; + size_t fea_dim_idx = idx % fea_emb_dim; + size_t offset = slot_idx * (max_length * fea_emb_dim) + fea_idx * fea_emb_dim + fea_dim_idx; + sdata[tid] += input[input_start_idx + offset]; + } + __syncthreads(); + + for(size_t s = blockDim.x / 2; s > 32; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + // When s < 32, we have only one warp left, no need to sync threads, no need to if (tid < s) + if(tid < 32) { + warpReduce(sdata, tid); + } + + if(tid == 0) { + // [batch_count, ins_num, max_length] + size_t out_idx = batch_idx * (ins_num * max_length) + + ins_idx * (max_length) + + fea_idx; + if (fabs(sdata[tid]) > 1e-8) { + mask_output[out_idx] = 1; + } else { + mask_output[out_idx] = 0; + } + } +} + +template +class FusedSeqTensorCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + PADDLE_ENFORCE_NOT_NULL(input, platform::errors::NotFound("Input not found")); + auto ad_input = ctx.Input("ADInput"); + PADDLE_ENFORCE_NOT_NULL(ad_input, platform::errors::NotFound("Input not found")); + + auto din_output = ctx.Output("DINOut"); + PADDLE_ENFORCE_NOT_NULL(din_output, + platform::errors::NotFound("DINOut not found")); + T* din_output_data = din_output->mutable_data(ctx.GetPlace()); + auto mask_output = ctx.Output("MaskOut"); + PADDLE_ENFORCE_NOT_NULL(mask_output, + platform::errors::NotFound("MaskOut not found")); + T* mask_output_output_data = mask_output->mutable_data(ctx.GetPlace()); + auto side_info_output = ctx.Output("SideInfoOut"); + PADDLE_ENFORCE_NOT_NULL(side_info_output, + platform::errors::NotFound("Output not found")); + T* side_info_output_data = + side_info_output->mutable_data(ctx.GetPlace()); + auto ad_slot_session_output = + ctx.Output("ADSlotSessionOut"); + PADDLE_ENFORCE_NOT_NULL(ad_slot_session_output, + platform::errors::NotFound("Output not found")); + T* ad_slot_session_output_data = + ad_slot_session_output->mutable_data(ctx.GetPlace()); + + auto batch_count = ctx.Attr("batch_count"); + auto max_length = ctx.Attr("max_length"); + auto slot_num = ctx.Attr("slot_num"); + auto fea_emb_dim = ctx.Attr("fea_emb_dim"); + auto ad_slot_num = ctx.Attr("ad_slot_num"); + auto ad_slot_offset = ctx.Attr("ad_slot_offset"); + + auto& dev_ctx = ctx.template device_context(); + auto stream = ctx.cuda_device_context().stream(); + + auto input_dims = input->dims(); + size_t ins_num = input_dims[0]; + + dim3 ad_grid(batch_count, ins_num, max_length); + dim3 ad_block(std::min(static_cast(1024), static_cast(ad_slot_num * fea_emb_dim))); + + cal_ad_slot_session_kernel<<>>( + 
input->data(), ad_input->data(), din_output_data, + ad_slot_session_output_data, + batch_count, ins_num, slot_num, max_length, fea_emb_dim, + ad_slot_num, ad_slot_offset); + + size_t sideinfo_slot_offset = 0; + if (ad_slot_offset == 0) { + sideinfo_slot_offset = ad_slot_num; + } + size_t fea_padding_dim = ((fea_emb_dim + 31) / 32) * 32; + size_t sideinfo_slot_num = slot_num - ad_slot_num; + + if (sideinfo_slot_num * fea_emb_dim < 1024) { + dim3 sideinfo_grid(batch_count, ins_num, max_length); + dim3 sideinfo_block(fea_emb_dim, sideinfo_slot_num); + cal_sideinfo_kernel_without_loop<<>>( + input->data(), side_info_output_data, batch_count, ins_num, + slot_num, max_length, fea_emb_dim, + sideinfo_slot_num, sideinfo_slot_offset); + } else { + dim3 sideinfo_grid(batch_count, ins_num, max_length); + dim3 sideinfo_block(sideinfo_slot_num * fea_emb_dim); + cal_sideinfo_kernel<<>>( + input->data(), side_info_output_data, batch_count, ins_num, + slot_num, max_length, fea_emb_dim, + sideinfo_slot_num, sideinfo_slot_offset); + } + + dim3 reduce_grid(batch_count, ins_num, max_length); + dim3 reduce_block(THREAD_PER_BLOCK); + reduce_sum_max_length<<>>( + input->data(), mask_output_output_data, batch_count, + ins_num, slot_num, max_length, fea_emb_dim); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + fused_seq_tensor, + ops::FusedSeqTensorCUDAKernel); \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.h b/paddle/fluid/operators/fused/fused_seq_tensor_op.h new file mode 100644 index 0000000000000..d7bbadd72e3b5 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.h @@ -0,0 +1,16 @@ +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class FusedSeqTensorCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext&) const override { + PADDLE_THROW(platform::errors::Unimplemented("fused_seq_tensor supports only GPU")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index 474863cad18b9..4b1eda15dfcfc 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -48,6 +48,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { bool clk_filter = ctx->Attrs().Get("clk_filter"); const int embed_thres_size = ctx->Attrs().Get("embed_thres_size"); const int embedx_concate_size = ctx->Attrs().Get("embedx_concate_size"); + //const bool fill_zero = ctx->Attrs().Get("fill_zero"); // need filter quant_ratio more than zero if (ctx->Attrs().Get("need_filter")) { @@ -142,6 +143,7 @@ class FusedSeqpoolCVMOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("embed_thres_size", "(int, default 0)").SetDefault(0); AddAttr("embedx_concate_size", "(int, default 1)").SetDefault(1); AddAttr("embedx_concate_filter", "(bool, default false)").SetDefault(false); + AddAttr("fill_zero", "(bool, default true)").SetDefault(true); AddAttr("fix_ctr_to_click", "(bool, default false)").SetDefault(false); AddComment(R"DOC( diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index d7ba888aa1dd5..76c02b4a6c93f 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ 
b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -177,7 +177,7 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( size_t **lods_values, const int batch_size, const int embedding_size, const float pad_value, const int cvm_offset, const float show_coeff, const float clk_coeff, const float threshold, const int quant_ratio, - const float embed_threshold, const int embedx_concate_size, bool embedx_concate_filter) { + const float embed_threshold, const int embedx_concate_size, bool embedx_concate_filter, bool fill_zero) { CUDA_KERNEL_LOOP(i, N) { int key = i / embedding_size; int offset = i % embedding_size; // embedx id @@ -188,11 +188,17 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( double val = pad_value; int concate_index = 0; + bool val_use_zero = false; for (auto k = start; k < end; ++k) { + val_use_zero = false; T &show = *(input_values[x] + k * embedding_size); T &click = *(input_values[x] + k * embedding_size + 1); - if (!embedx_concate_filter&&(show - click) * show_coeff + click * clk_coeff < threshold) { - continue; + if (embedx_concate_filter && (show - click) * show_coeff + click * clk_coeff < threshold) { + if (fill_zero) { + val_use_zero = true; + } else { + continue; + } } T &embedw = *(input_values[x] + k * embedding_size + cvm_offset); T embedx_weight_score = 0.0; @@ -202,16 +208,28 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( } embedx_weight_score = std::sqrt(embedx_weight_score) + std::abs(embedw); if (embedx_concate_filter && embedx_weight_score < embed_threshold) { - continue; + if (fill_zero) { + val_use_zero = true; + } else { + continue; + } } if (offset < cvm_offset) { // show & click - val = *(input_values[x] + k * embedding_size + offset); + if (val_use_zero) { + val = pad_value; + } else { + val = *(input_values[x] + k * embedding_size + offset); + } } else { - val = ((static_cast( + if (val_use_zero) { + val = pad_value; + } else { + val = ((static_cast( *(input_values[x] + k * embedding_size + offset) * quant_ratio + 0.5)) / static_cast(quant_ratio)); + } } if (concate_index == embedx_concate_size) { *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + (embedx_concate_size-1) * embedding_size + offset) += val; @@ -356,7 +374,9 @@ void FusedSeqpoolCVM(const paddle::platform::Place &place, float clk_coeff, float threshold, float embed_threshold, const int quant_ratio, const bool clk_filter, const int embed_thres_size, const int embedx_concate_size, - bool embedx_concate_filter, bool fix_ctr_to_click) { + bool embedx_concate_filter, + bool fill_zero, + bool fix_ctr_to_click) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -399,7 +419,7 @@ void FusedSeqpoolCVM(const paddle::platform::Place &place, 0, stream>>>( N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, embedding_size, padding_value, cvm_offset, show_coeff, clk_coeff, - threshold, quant_ratio, embed_threshold, embedx_concate_size, embedx_concate_filter); + threshold, quant_ratio, embed_threshold, embedx_concate_size, embedx_concate_filter, fill_zero); } } else if (need_filter) { // quant need filter FusedSeqpoolKernelQuantFilter<< { const int embed_thres_size = ctx.Attr("embed_thres_size"); const int embedx_concate_size = ctx.Attr("embedx_concate_size"); bool embedx_concate_filter = ctx.Attr("embedx_concate_filter"); + bool fill_zero = ctx.Attr("fill_zero"); bool fix_ctr_to_click = ctx.Attr("fix_ctr_to_click"); framework::GPULodVector 
gpu_lods[slot_size]; @@ -742,8 +763,8 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { embedding_size, padding_value, use_cvm, cvm_offset, need_filter, embed_threshold_filter, show_coeff, clk_coeff, threshold, embed_threshold, quant_ratio, clk_filter, - embed_thres_size, embedx_concate_size, embedx_concate_filter, - fix_ctr_to_click); + embed_thres_size, embedx_concate_size, embedx_concate_filter, + fill_zero, fix_ctr_to_click); } }; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu index e552d60a24c8b..0e01eb1785132 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu @@ -52,6 +52,74 @@ __global__ void FusedSeqpoolWithConvKernelNormal(const size_t N, T **input_value *(seqpool_output_values[x] + y * embedding_size + offset) = val; } } + +// Filter +template +__global__ void FusedSeqpoolWithConvKernelFilter(const size_t N, T **input_values, + T **seqpool_output_values, + size_t **lods_values, + const int batch_size, + const int embedding_size, + const float pad_value, + const float show_coeff, + const float clk_coeff, + const float threshold) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + + double val = pad_value; + for (auto k = start; k < end; ++k) { + T &show = *(input_values[x] + k * embedding_size); + T &click = *(input_values[x] + k * embedding_size + 1); + if ((show - click) * show_coeff + click * clk_coeff < threshold) { + continue; + } + val += *(input_values[x] + k * embedding_size + offset); + } + *(seqpool_output_values[x] + y * embedding_size + offset) = val; + } +} + +// normal & expand slot's feasign +template +__global__ void FusedSeqpoolWithConvKernelNormalEmbedxConcate(const size_t N, T **input_values, + T **seqpool_output_values, + size_t **lods_values, + const int batch_size, + const int embedding_size, + const float pad_value, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + + double val = pad_value; + for (auto k = start; k < end; ++k) { + val = *(input_values[x] + k * embedding_size + offset); + if (concate_index == embedx_concate_size) { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + (embedx_concate_size-1) * embedding_size + offset) += val; + } else { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + concate_index * embedding_size + offset) = val; + concate_index += 1; + } + } + while (concate_index < embedx_concate_size) { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + concate_index * embedding_size + offset) = pad_value; + concate_index += 1; + } + } +} + // join only need show input template __global__ void FusedCVMWithConvKernelNormal(const size_t N, T **output_values, @@ -81,6 +149,38 @@ __global__ void FusedCVMWithConvKernelNormal(const size_t N, T **output_values, } } +// join only need show input, and expand slot's feasign +template +__global__ void FusedCVMWithConvKernelNormalConcate(const size_t N, T **output_values, + T 
**seqpool_output_values, + const int batch_size, + const int embedding_size, + const int noclk_embedding_size, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / noclk_embedding_size; + int offset = i % noclk_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + for (int k=0; k < embedx_concate_size; k++) { + if (offset == 0) { // show + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size) + 1); + } else if (offset == 1) { // click + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) + 1); + } else if (offset == 2) { // conv + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 2) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 2) + 1) - + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) + 1); + } else { // filter show, offset - 1 + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * embedding_size + offset) = + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + offset); + } + } + } +} + // join only need show input template __global__ void FusedCVMWithConvKernelWithOutShow(const size_t N, T **output_values, @@ -109,6 +209,37 @@ __global__ void FusedCVMWithConvKernelWithOutShow(const size_t N, T **output_val } } +// join only need show input, and expand slot's feasign +template +__global__ void FusedCVMWithConvKernelWithOutShowConcate(const size_t N, T **output_values, + T **seqpool_output_values, + const int batch_size, + const int embedding_size, + const int noclk_embedding_size, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / noclk_embedding_size; + int offset = i % noclk_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + for (int k=0; k < embedx_concate_size; k++) { + if (offset == 0) { // show + // do nothing + } else if (offset == 1) { // click + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size) = + log(*(seqpool_output_values[x] + y * embedding_size + 1) + 1); + } else if (offset == 2) { // conv + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size + 1) = + log(*(seqpool_output_values[x] + y * embedding_size + 2) + 1) - + log(*(seqpool_output_values[x] + y * embedding_size + 1) + 1); + } else { // filter show, offset - 1 + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size + offset) = + *(seqpool_output_values[x] + y * embedding_size + offset); + } + } + } +} + // update not need show click input template __global__ void FusedCVMWithConvKernelNoCVM(const size_t N, T **output_values, @@ -128,6 +259,28 @@ __global__ void FusedCVMWithConvKernelNoCVM(const size_t N, T **output_values, } } +// update not need show click input, expand slot's feasign +template +__global__ void FusedCVMWithConvKernelNoCVMConcate(const size_t N, T **output_values, + T **seqpool_output_values, + const int batch_size, + const int no_cvm_embedding_size, + const int cvm_offset, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / no_cvm_embedding_size; + 
int offset = i % no_cvm_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + // no cvm + for (int k=0; k < embedx_concate_size; k++) { + *(output_values[x] + y * no_cvm_embedding_size * embedx_concate_size + k * no_cvm_embedding_size + offset) = + *(seqpool_output_values[x] + y * (no_cvm_embedding_size + cvm_offset) * embedx_concate_size + + k * (no_cvm_embedding_size + cvm_offset) + offset + cvm_offset); + } + } +} + template void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, const std::vector &input_data, @@ -136,7 +289,10 @@ void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, std::vector lods, const int batch_size, const int slot_num, const int embedding_size, const float padding_value, const bool use_cvm, - const int cvm_offset, bool show_filter) { + float need_filter, float show_coeff, + float clk_coeff, float threshold, + const int cvm_offset, bool show_filter, + const int embedx_concate_size) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -167,81 +323,169 @@ void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, size_t N = static_cast(batch_size * slot_num * embedding_size); // first sum pool - FusedSeqpoolWithConvKernelNormal<<>>( - N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, - embedding_size, padding_value); + if (embedx_concate_size == 1){ + if (need_filter) { //filter + FusedSeqpoolWithConvKernelFilter<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value, show_coeff, clk_coeff, threshold); + } else { //normal + FusedSeqpoolWithConvKernelNormal<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value); + } + } else { + FusedSeqpoolWithConvKernelNormalEmbedxConcate<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value, embedx_concate_size); + } // second log if (use_cvm) { if (show_filter) { N = static_cast(batch_size * slot_num * (embedding_size - 1)); - FusedCVMWithConvKernelWithOutShow<<>>(N, gpu_output_values, - gpu_seqpool_output_values, batch_size, - embedding_size, embedding_size - 1); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelWithOutShow<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size - 1); + } else { + FusedCVMWithConvKernelWithOutShowConcate<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size - 1, embedx_concate_size); + } } else { - FusedCVMWithConvKernelNormal<<>>(N, gpu_output_values, - gpu_seqpool_output_values, batch_size, - embedding_size, embedding_size); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelNormal<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size); + } else { + FusedCVMWithConvKernelNormalConcate<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size, embedx_concate_size); + } } } else { // not need show click input N = static_cast(batch_size * slot_num * (embedding_size - cvm_offset)); - FusedCVMWithConvKernelNoCVM<<>>( - N, gpu_output_values, gpu_seqpool_output_values, batch_size, - (embedding_size - cvm_offset), cvm_offset); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelNoCVM<<>>( + N, gpu_output_values, gpu_seqpool_output_values, batch_size, + (embedding_size - cvm_offset), cvm_offset); + } 
else { + FusedCVMWithConvKernelNoCVMConcate<<>>( + N, gpu_output_values, gpu_seqpool_output_values, batch_size, + (embedding_size - cvm_offset), cvm_offset, embedx_concate_size); + } + } +} + +// join grad +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithCVM( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * embedding_size + offset); + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + *(in_grads_values[x] + k * embedding_size + offset) = val; + } + } +} + +// join grad, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithCVMConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + int concate_index = 0; + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * embedding_size * embedx_concate_size + + embedding_size * concate_index + offset); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? concate_index : concate_index + 1; + } } } - // join grad - template - __global__ void FusedSeqpoolCVMWithConvGradKernelWithCVM( - const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, - size_t **lods_values, const int batch_size, const int embedding_size, - const int cvm_offset) { - CUDA_KERNEL_LOOP(i, N) { - int key = i / embedding_size; - int offset = i % embedding_size; // embedx offset - int x = key / batch_size; // slot id - int y = key % batch_size; // ins id - - T &val = (offset < cvm_offset) - ? *(cvm_values[x] + y * cvm_offset + offset) - : *(out_grads_values[x] + y * embedding_size + offset); - - auto &start = *(lods_values[x] + y); - auto &end = *(lods_values[x] + y + 1); - for (auto k = start; k < end; ++k) { - *(in_grads_values[x] + k * embedding_size + offset) = val; - } - } - } - - // join only show not has click - template - __global__ void FusedSeqpoolCVMWithConvGradKernelWithShow( - const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, - size_t **lods_values, const int batch_size, const int embedding_size, - const int cvm_offset) { - CUDA_KERNEL_LOOP(i, N) { - int key = i / embedding_size; - int offset = i % embedding_size; // embedx offset - int x = key / batch_size; // slot id - int y = key % batch_size; // ins id - - T &val = - (offset < cvm_offset) - ? 
*(cvm_values[x] + y * cvm_offset + offset) - : *(out_grads_values[x] + y * (embedding_size - 1) + offset - 1); - auto &start = *(lods_values[x] + y); - auto &end = *(lods_values[x] + y + 1); - for (auto k = start; k < end; ++k) { - *(in_grads_values[x] + k * embedding_size + offset) = val; - } - } - } +// join only show not has click +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithShow( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + T &val = + (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - 1) + offset - 1); + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + *(in_grads_values[x] + k * embedding_size + offset) = val; + } + } +} + +// join only show not has click, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithShowConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + for (auto k = start; k < end; ++k) { + T &val = + (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - 1) * embedx_concate_size + + (embedding_size - 1) * concate_index + offset - 1); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? concate_index : concate_index + 1; + } + } +} // update grad template @@ -267,6 +511,33 @@ __global__ void FusedSeqpoolCVMWithConvGradKernelNoCVM( } } } + +// update grad, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelNoCVMConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + for (auto k = start; k < end; ++k) { + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - cvm_offset) * embedx_concate_size + + (embedding_size - cvm_offset) * concate_index + offset - cvm_offset); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? 
concate_index : concate_index + 1; + } + } +} + template void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, const std::vector &out_grads_data, @@ -275,7 +546,8 @@ void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, const std::vector &lods, const int batch_size, const int slot_num, const int embedding_size, const bool use_cvm, - const int cvm_offset, bool show_filter) { + const int cvm_offset, bool show_filter, + const int embedx_concate_size) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -307,23 +579,43 @@ void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, size_t N = static_cast(batch_size * slot_num * embedding_size); if (use_cvm) { if (show_filter) { + if (embedx_concate_size == 1) { FusedSeqpoolCVMWithConvGradKernelWithShow<<>>( N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, lods_values, batch_size, embedding_size, cvm_offset); - + } else { + FusedSeqpoolCVMWithConvGradKernelWithShowConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } else { + if (embedx_concate_size == 1) { FusedSeqpoolCVMWithConvGradKernelWithCVM<<>>( N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, lods_values, batch_size, embedding_size, cvm_offset); + } else { + FusedSeqpoolCVMWithConvGradKernelWithCVMConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } } else { // update grad - FusedSeqpoolCVMWithConvGradKernelNoCVM<<>>( - N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, - lods_values, batch_size, embedding_size, cvm_offset); + if (embedx_concate_size == 1) { + FusedSeqpoolCVMWithConvGradKernelNoCVM<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset); + } else { + FusedSeqpoolCVMWithConvGradKernelNoCVMConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } } @@ -344,9 +636,14 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { auto padding_value = ctx.Attr("pad_value"); auto use_cvm = ctx.Attr("use_cvm"); + bool need_filter = ctx.Attr("need_filter"); + float show_coeff = ctx.Attr("show_coeff"); + float clk_coeff = ctx.Attr("clk_coeff"); + float threshold = ctx.Attr("threshold"); const int cvm_offset = ctx.Attr("cvm_offset"); bool show_filter = ctx.Attr("show_filter"); - + const int embedx_concate_size = ctx.Attr("embedx_concate_size"); + framework::GPULodVector gpu_lods[slot_size]; auto place = ctx.GetPlace(); @@ -368,13 +665,13 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { if (use_cvm) { if (show_filter) { // show will filtered - output->Resize({batch_size, embedding_size - 1}); + output->Resize({batch_size, (embedding_size - 1) * embedx_concate_size}); } else { // show will filtered - output->Resize({batch_size, embedding_size}); + output->Resize({batch_size, embedding_size * embedx_concate_size}); } } else { - output->Resize({batch_size, embedding_size - cvm_offset}); + output->Resize({batch_size, (embedding_size - cvm_offset) * embedx_concate_size}); } output_data[i] = reinterpret_cast(output->mutable_data(ctx.GetPlace())); @@ -382,11 +679,13 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { 
seqpool_output_data[i] = reinterpret_cast(seqpool_outputs[i].mutable_data( - {batch_size, embedding_size}, ctx.GetPlace())); + {batch_size, embedding_size * embedx_concate_size}, ctx.GetPlace())); } FusedSeqpoolCVMWithConv(ctx.GetPlace(), input_data, output_data, seqpool_output_data, lods_data, batch_size, slot_size, - embedding_size, padding_value, use_cvm, cvm_offset, show_filter); + embedding_size, padding_value, use_cvm, + need_filter, show_coeff, clk_coeff, threshold, + cvm_offset, show_filter, embedx_concate_size); } }; @@ -402,6 +701,7 @@ class FusedSeqpoolCVMWithConvGradCUDAKernel : public framework::OpKernel { auto use_cvm = ctx.Attr("use_cvm"); const int cvm_offset = ctx.Attr("cvm_offset"); bool show_filter = ctx.Attr("show_filter"); + const int embedx_concate_size = ctx.Attr("embedx_concate_size"); const auto slot_size = in_grads.size(); std::vector out_grads_data(slot_size); @@ -436,7 +736,7 @@ class FusedSeqpoolCVMWithConvGradCUDAKernel : public framework::OpKernel { } FusedSeqpoolCVMGradWithConv(ctx.GetPlace(), out_grads_data, in_grads_data, cvm_data, lods_data, batch_size, slot_size, embedding_size, - use_cvm, cvm_offset, show_filter); + use_cvm, cvm_offset, show_filter, embedx_concate_size); } }; diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index c6bed95e83dc5..25ed83844a079 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -104,8 +104,11 @@ class IndexSelectGradMaker : public framework::SingleGradOpMaker { } }; +#ifdef PADDLE_ON_INFERENCE DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, "X"); +#endif + } // namespace operators } // namespace paddle @@ -119,6 +122,12 @@ REGISTER_OPERATOR(index_select, ops::IndexSelectGradMaker, ops::IndexSelectGradMaker, IndexSelectInferShapeFunctor); + +#ifdef PADDLE_ON_INFERENCE REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); +#else +REGISTER_OPERATOR(index_select_grad, + ops::IndexSelectGradOp); +#endif diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f18053e297e55..50a7a3414dc52 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -13,9 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
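The Resize() calls in FusedSeqpoolCVMWithConvCUDAKernel above scale the output's last dimension by embedx_concate_size, with the base width depending on use_cvm and show_filter. A small self-contained helper restating that arithmetic follows; the function name and signature are illustrative, not part of the Paddle API.

#include <cassert>

// Last-dimension calculation implied by the Resize() calls above.
int OutputLastDim(int embedding_size, int cvm_offset, bool use_cvm,
                  bool show_filter, int embedx_concate_size) {
  int base = 0;
  if (use_cvm) {
    base = show_filter ? embedding_size - 1   // show column is dropped
                       : embedding_size;      // show/click/conv columns kept
  } else {
    base = embedding_size - cvm_offset;       // CVM columns stripped entirely
  }
  return base * embedx_concate_size;          // each slot expands into N concatenated segments
}

int main() {
  assert(OutputLastDim(11, 3, true, true, 1) == 10);
  assert(OutputLastDim(11, 3, false, false, 2) == 16);
  return 0;
}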
*/ #pragma once -#include -#include - #include #include diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 73d1131655aca..defea15e022c8 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -326,6 +326,7 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); } }; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(RankAttentionGradOpNoNeedBufferVarsInference, "X", "RankOffset", diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 76999815e8eaf..99a954673da16 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -264,9 +264,9 @@ __global__ void kernel_rank_back_propagate(const int para_row, // rank offset 2:1:46:2:44:3:45 CUDA_KERNEL_LOOP(idx, ins_num * ins_col * para_col * max_rank) { int ins_id = idx / para_col / ins_col / max_rank; - int para_col_id = (idx / ins_col / max_rank) % para_col; + int para_col_id = (idx / ins_col / ins_num) % para_col; int ins_col_id = (idx / para_col / max_rank) % ins_col; - int k = (idx / para_col / ins_col) % max_rank; + int k = idx % max_rank; int lower = rank_offset[ins_id * rank_cols] - 1; if (lower < 0) { diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index a9a44861c11e1..eeb284d167883 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -37,12 +37,9 @@ template class SaveCombineOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); auto filename = ctx.Attr("file_path"); auto overwrite = ctx.Attr("overwrite"); - auto save_as_fp16 = ctx.Attr("save_as_fp16"); auto save_to_memory = ctx.Attr("save_to_memory"); - auto output = ctx.Output("Y"); bool is_present = FileExists(filename); if (is_present && !overwrite) { @@ -52,8 +49,31 @@ class SaveCombineOpKernel : public framework::OpKernel { filename, overwrite)); } + if (save_to_memory) { + auto output = ctx.Output("Y"); + PADDLE_ENFORCE_NE(output, + nullptr, + platform::errors::InvalidArgument( + "Cannot find variable Y for save_combine_op")); + std::ostringstream ss; + SaveCombineVars(ctx, reinterpret_cast(&ss)); + *output = ss.str(); + } else { + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), + true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", filename)); + SaveCombineVars(ctx, reinterpret_cast(&fout)); + fout.close(); + } + } - std::ostringstream ss; + protected: + void SaveCombineVars(const framework::ExecutionContext &ctx, std::ostream *os) const { + auto place = ctx.GetPlace(); + auto save_as_fp16 = ctx.Attr("save_as_fp16"); auto inp_var_names = ctx.InputNames("X"); auto &inp_vars = ctx.MultiInputVar("X"); PADDLE_ENFORCE_GT(inp_var_names.size(), @@ -102,9 +122,9 @@ class SaveCombineOpKernel : public framework::OpKernel { out.set_lod(tensor.lod()); framework::TransDataType( in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(ss, out, dev_ctx); + framework::SerializeToStream(*os, out, dev_ctx); } else { - framework::SerializeToStream(ss, tensor, dev_ctx); + framework::SerializeToStream(*os, tensor, dev_ctx); } } else { auto &tensor = inp_vars[i]->Get(); @@ -114,25 +134,9 @@ class SaveCombineOpKernel : public 
framework::OpKernel { framework::ConvertWstrToStr(it->first, &t); data.emplace(t, it->second); } - framework::StringMapToStream(ss, data); + framework::StringMapToStream(*os, data); } } - if (save_to_memory) { - PADDLE_ENFORCE_NE(output, - nullptr, - platform::errors::InvalidArgument( - "Cannot find variable Y for save_combine_op")); - *output = ss.str(); - } else { - MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), - true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - fout << ss.str(); - fout.close(); - } } }; diff --git a/paddle/fluid/operators/scaled_fc_op.cu b/paddle/fluid/operators/scaled_fc_op.cu index 20bd9dbf07361..bf920093ff794 100644 --- a/paddle/fluid/operators/scaled_fc_op.cu +++ b/paddle/fluid/operators/scaled_fc_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/scaled_fc_op.h" diff --git a/paddle/fluid/operators/scaled_int8fc_op.cu b/paddle/fluid/operators/scaled_int8fc_op.cu index c03bbf61d67fb..347640fadd68f 100644 --- a/paddle/fluid/operators/scaled_int8fc_op.cu +++ b/paddle/fluid/operators/scaled_int8fc_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/scaled_int8fc_op.h" diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 7185d2356aae5..4ff874c3e89f5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" namespace paddle { namespace platform { @@ -70,11 +71,6 @@ namespace platform { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) - class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 291dd6c7ce1c7..a49d9013fb6d0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -17,9 +17,10 @@ namespace paddle { namespace platform { -void CudaProfilerInit(std::string output_file, - std::string output_mode, - std::string config_file) { +void CudaProfilerInit(const std::string& output_file, + const std::string& output_mode, + const std::string& config_file) { +#if CUDA_VERSION < 11000 PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", platform::errors::InvalidArgument( "Unsupported cuda profiler output mode, expect `kvp` or " @@ -28,6 +29,7 @@ void CudaProfilerInit(std::string output_file, cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +#endif } void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } @@ -35,8 +37,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } #ifndef _WIN32 -void CudaNvtxRangePush(std::string name) { - dynload::nvtxRangePushA(name.c_str()); +void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { + nvtxEventAttributes_t eventAttrib; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = static_cast(color); + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + + dynload::nvtxRangePushEx(&eventAttrib); } void CudaNvtxRangePop() { dynload::nvtxRangePop(); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h index 6c7cf0fd8dd94..555a83a0210f2 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h @@ -23,16 +23,26 @@ limitations under the License. */ namespace paddle { namespace platform { -void CudaProfilerInit(std::string output_file, - std::string output_mode, - std::string config_file); +void CudaProfilerInit(const std::string& output_file, + const std::string& output_mode, + const std::string& config_file); void CudaProfilerStart(); void CudaProfilerStop(); #ifndef _WIN32 -void CudaNvtxRangePush(std::string name); +enum class NvtxRangeColor : uint32_t { + Black = 0x00000000, + Red = 0x00ff0000, + Green = 0x0000ff00, + Blue = 0x000000ff, + White = 0x00ffffff, + Yellow = 0x00ffff00, +}; + +void CudaNvtxRangePush(const std::string& name, + const NvtxRangeColor color = NvtxRangeColor::Green); void CudaNvtxRangePop(); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index c6f367228e114..edcd29b88d0c2 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -21,6 +21,25 @@ limitations under the License. 
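The CudaNvtxRangePush change above switches from nvtxRangePushA to nvtxRangePushEx so a color can be attached to each range. A minimal RAII wrapper over the same NVTX calls is sketched below; ScopedNvtxRange is our own illustrative name, not a Paddle type, and the example assumes the usual nvToolsExt header and -lnvToolsExt link flag.

#include <nvToolsExt.h>
#include <cstdint>
#include <string>

class ScopedNvtxRange {
 public:
  explicit ScopedNvtxRange(const std::string& name,
                           uint32_t argb = 0x0000ff00 /* green, as in the default above */) {
    nvtxEventAttributes_t attr = {};  // zero-init so unused fields are well defined
    attr.version = NVTX_VERSION;
    attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    attr.colorType = NVTX_COLOR_ARGB;
    attr.color = argb;
    attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
    attr.message.ascii = name.c_str();
    nvtxRangePushEx(&attr);
  }
  ~ScopedNvtxRange() { nvtxRangePop(); }
};

int main() {
  ScopedNvtxRange range("forward_pass", 0x00ff0000 /* red */);
  // ... profiled work goes here ...
  return 0;
}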
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/memory/stats.h" + +DECLARE_uint64(xpu_memory_limit_mb); +DECLARE_double(fraction_of_xpu_memory_to_use); +DECLARE_uint64(initial_xpu_memory_in_mb); +DECLARE_uint64(reallocate_xpu_memory_in_mb); + +constexpr static float fraction_reserve_xpu_memory = 0.05f; + +PADDLE_DEFINE_EXPORTED_bool(enable_xpu_memory_usage_log, + false, + "Whether to print the message of xpu memory usage " + "at exit, mainly used for UT and CI."); +PADDLE_DEFINE_EXPORTED_bool(enable_xpu_memory_usage_log_mb, + true, + "Whether to print the message of xpu memory usage " + "MB as a unit of measurement."); namespace paddle { namespace platform { @@ -199,5 +218,333 @@ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { return phi::backends::xpu::get_xpu_version(dev_id); } +std::once_flag XPUMLHandler::init_flag_; + +XPUMLHandler::XPUMLHandler() { + std::call_once(XPUMLHandler::init_flag_, &XPUMLHandler::init_ml); + xpumlDeviceGetCount(&device_nums_); + device_handlers_.resize(device_nums_); + mem_infos_.resize(device_nums_); + for (unsigned int i = 0; i < device_nums_; ++i) { + xpumlDeviceGetHandleByIndex(i, &device_handlers_[i]); + } +} + + +/**************************** Memory Management **************************/ +// == Memory monitor == +void XPUMLHandler::init_ml() { + xpumlInit(); +} + +bool XPUMLHandler::getMemoryUsageInfo(int dev_id, unsigned long long *total, + unsigned long long* used, unsigned long long *free) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return false; + } + *total = mem_infos_[dev_id].totalGlobalMemory; + *free = mem_infos_[dev_id].freeGlobalMemory; + *used = mem_infos_[dev_id].usedGlobalMemory; + return true; +} + +bool XPUMLHandler::getL3UsageInfo(int dev_id, unsigned long long *total, + unsigned long long *used, unsigned long long *free) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return false; + } + *total = mem_infos_[dev_id].totalL3Memory; + *free = mem_infos_[dev_id].freeL3Memory; + *used = mem_infos_[dev_id].usedL3Memory; + return true; +} + +std::tuple XPUMLHandler::getMemoryUsageTuple(int dev_id) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return {0, 0, 0}; + } + return {mem_infos_[dev_id].totalGlobalMemory, + mem_infos_[dev_id].usedGlobalMemory, + mem_infos_[dev_id].freeGlobalMemory}; + +} + +std::tuple XPUMLHandler::getL3UsageTuple(int dev_id) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return {0, 0, 0}; + } + return {mem_infos_[dev_id].totalL3Memory, + mem_infos_[dev_id].usedL3Memory, + mem_infos_[dev_id].freeL3Memory}; +} + + +// == Memory malloc & free == + + + +class RecordedXpuMallocHelper { + private: + explicit RecordedXpuMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + + if (FLAGS_enable_xpu_memory_usage_log) { + // A fake UPDATE to trigger the construction of memory stat instances, + // make sure that they are destructed after RecordedXpuMallocHelper. 
+ DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedXpuMallocHelper); + + public: + ~RecordedXpuMallocHelper() { + if (FLAGS_enable_xpu_memory_usage_log) { + if (FLAGS_enable_xpu_memory_usage_log_mb) { + std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 + << std::endl; + } else { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; + } + } + } + + static RecordedXpuMallocHelper *Instance(int dev_id) { + static std::vector> instances_; + + std::call_once(once_flag_, [] { + int dev_cnt = GetXPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + instances_.emplace_back( + new RecordedXpuMallocHelper(i, FLAGS_xpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, + 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, + instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", + dev_id, + instances_.size())); + return instances_[dev_id].get(); + } + + XPUError_t Malloc(void **ptr, size_t size, bool malloc_managed_memory = false) { + // CHECK(malloc_managed_memory == false) << "xpu not supported yet"; + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return XPUERR_NOMEM; + } + + XPUDeviceGuard guard(dev_id_); + + int result = xpu_malloc(ptr, size); + VLOG(10) << "[xpu_malloc] size=" << static_cast(size) / (1 << 20) + << " MB, result=" << result; + + if (result == 0) { + if (UNLIKELY(NeedRecord())) { + cur_size_.fetch_add(size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent(ptr, + XPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); + } + + return XPU_SUCCESS; + } else { + return XPUERR_NOMEM; + } + } + + void Free(void *ptr, size_t size) { + XPUDeviceGuard guard(dev_id_); + xpu_free(ptr); + if (UNLIKELY(NeedRecord())) { + cur_size_.fetch_sub(size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + XPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); + } + } + + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total) { + unsigned long long uint64_total = 0, used = 0, free = 0; + + CHECK(ml_handler.getMemoryUsageInfo(dev_id_, &uint64_total, &used, &free) == true) << "get mem usage info failed"; + *actual_avail = uint64_total - free; + *actual_total = uint64_total; + + if (UNLIKELY(NeedRecord())) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + static std::once_flag once_flag_; + + XPUMLHandler 
ml_handler; +}; + +std::once_flag RecordedXpuMallocHelper::once_flag_; + +XPUError_t RecordedXpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory) { + return RecordedXpuMallocHelper::Instance(dev_id)->Malloc(ptr, size, malloc_managed_memory); +} + +void RecordedXpuFree(void *p, size_t size, int dev_id) { + return RecordedXpuMallocHelper::Instance(dev_id)->Free(p, size); +} + +bool RecordedXpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { + return RecordedXpuMallocHelper::Instance(dev_id)->GetMemInfo(avail, total, actual_avail, actual_total); +} + +size_t XpuAvailableMemToAlloc() { + XPUMLHandler handler; + unsigned long long total = 0; + unsigned long long used = 0; + unsigned long long free = 0; + bool re = handler.getMemoryUsageInfo(GetXPUCurrentDeviceId(), &total, &used, &free); + CHECK(re == true) << "query mem info failed"; + + size_t reserving = static_cast(fraction_reserve_xpu_memory * free); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = free - reserving; + size_t min_chunk_size = XpuMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + return available_to_alloc; +} + +static size_t XpuAllocSize(bool realloc) { + size_t available_to_alloc = XpuAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, + 0, + platform::errors::ResourceExhausted("Not enough available XPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_xpu_memory_in_mb + : FLAGS_initial_xpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_xpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, + alloc_bytes, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + + +size_t XpuInitAllocSize() { + return XpuAllocSize(false); +} + +size_t XpuReallocSize() { + return XpuAllocSize(true); +} + +size_t XpuMaxAllocSize() { + return std::max(XpuInitAllocSize(), XpuReallocSize()); +} + +size_t XpuMinChunkSize() { + return 1 << 8; +} + + +size_t XpuMaxChunkSize() { + + size_t max_chunk_size = XpuMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; + +} + +// for test +class MallocCnter { +public: + static MallocCnter & getInstance() { + static MallocCnter instance; + return instance; + } + + void inc_malloc_cnt(int dev_id) { + CHECK(dev_id >= 0 && dev_id < 8); + malloc_cnts[dev_id]++; + } + + int get_malloc_cnt(int dev_id) { + CHECK(dev_id >= 0 && dev_id < 8); + return malloc_cnts[dev_id].load(); + } + +private: + MallocCnter() {} + std::atomic malloc_cnts[8]; +}; + +int get_malloc_cnt(int dev_id) { + return MallocCnter::getInstance().get_malloc_cnt(dev_id); +} + +int inc_malloc_cnt(int dev_id) { + MallocCnter::getInstance().inc_malloc_cnt(dev_id); + return 0; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 85445df0bd762..9e25e82677e10 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,10 +12,12 @@ limitations under the License. 
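As a hedged usage note for the recorded allocation path above: the limit is only enforced when FLAGS_xpu_memory_limit_mb is non-zero, in which case the helper also updates the Reserved memory stats. A minimal sketch of the intended call pattern (the wrapper functions and error handling are illustrative, not part of the patch):

#include <cstddef>
#include "paddle/fluid/platform/device/xpu/xpu_info.h"

// Sketch only, assuming PADDLE_WITH_XPU: allocate and release device memory
// through the recorded helpers so the per-device counters stay balanced.
void* AllocOnXpu(int dev_id, size_t bytes) {
  void* ptr = nullptr;
  // Fails with XPUERR_NOMEM either when xpu_malloc itself fails or when the
  // recorded usage would exceed FLAGS_xpu_memory_limit_mb (if that flag is set).
  if (paddle::platform::RecordedXpuMalloc(&ptr, bytes, dev_id) != XPU_SUCCESS) {
    return nullptr;
  }
  return ptr;
}

void FreeOnXpu(void* ptr, size_t bytes, int dev_id) {
  // Pass the original request size so the recorded usage and the Reserved
  // stat are decremented by the same amount.
  paddle::platform::RecordedXpuFree(ptr, bytes, dev_id);
}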
*/ #ifdef PADDLE_WITH_XPU #include +#include #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "xpu/runtime.h" +#include "xpu/xpuml.h" namespace paddle { @@ -106,6 +108,48 @@ using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); +class XPUMLHandler { +public: + XPUMLHandler(); + // total, used, free + bool getMemoryUsageInfo(int dev_id, unsigned long long *total, unsigned long long* used, unsigned long long *free); + bool getL3UsageInfo(int dev_id, unsigned long long *total, unsigned long long *used, unsigned long long *free); + + // (total, used, free) + std::tuple getMemoryUsageTuple(int dev_id); + std::tuple getL3UsageTuple(int dev_id); + +private: + static void init_ml(); + + static std::once_flag init_flag_; + + std::vector device_handlers_; + std::vector mem_infos_; + unsigned int device_nums_; +}; + +XPUError_t RecordedXpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory = false); + +void RecordedXpuFree(void *p, size_t size, int dev_id); + +bool RecordedXpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id); + +size_t XpuMinChunkSize(); +size_t XpuMaxChunkSize(); + +size_t XpuInitAllocSize(); +size_t XpuReallocSize(); +size_t XpuMaxAllocSize(); + +// for calculate malloc times +int get_malloc_cnt(int dev_id); +int inc_malloc_cnt(int dev_id); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index 3a1d28072c591..5bf92876f4fd0 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -39,7 +39,34 @@ namespace dynload { extern DynLoad__##__name __name // APIs available after CUDA 10.1 -// #if CUDA_VERSION >= 10100 +#if CUDA_VERSION >= 11010 +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ + __macro(cublasLtMatmulAlgoCheck); \ + __macro(cublasLtGetCudartVersion); +#else #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ __macro(cublasLtDestroy); \ @@ -60,6 +87,7 @@ namespace dynload { __macro(cublasLtMatrixTransformDescCreate); \ __macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescSetAttribute); +#endif CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h index c3dc9e31df354..e5816e240e6d2 100644 --- 
a/paddle/fluid/platform/dynload/nvtx.h +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef _WIN32 -#include -#include - -#include // NOLINT - #include "paddle/phi/backends/dynload/nvtx.h" namespace paddle { @@ -28,11 +23,12 @@ namespace dynload { using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name -#define NVTX_ROUTINE_EACH(__macro) \ - __macro(nvtxRangePushA); \ +#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \ + __macro(nvtxRangePushA); \ + __macro(nvtxRangePushEx); \ __macro(nvtxRangePop); -NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); +PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP } // namespace dynload diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index e78abbf80a8d0..d507837ec915f 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -630,6 +630,56 @@ PADDLE_DEFINE_EXPORTED_uint64( #endif +#if defined(PADDLE_WITH_XPU) + +PADDLE_DEFINE_EXPORTED_bool( + use_xpu_buddy_allocator, + true, + "If set to true, use the buddy allocator to manage xpu memory allocation"); + +constexpr static float fraction_of_xpu_memory_to_use = 0.1f; +PADDLE_DEFINE_EXPORTED_double( + fraction_of_xpu_memory_to_use, + fraction_of_xpu_memory_to_use, + "Allocate a trunk of xpu memory that is this fraction of the " + "total xpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough xpu memory, " + "additional trunks of the same size will be requested from xpu " + "until the xpu has no memory left for another trunk."); + +PADDLE_DEFINE_EXPORTED_uint64( + initial_xpu_memory_in_mb, + 0ul, + "Allocate a trunk of xpu memory whose byte size is specified by " + "the flag. Future memory usage will be allocated from the " + "trunk. If the trunk doesn't have enough xpu memory, additional " + "trunks of xpu memory will be requested from xpu with size " + "specified by FLAGS_reallocate_xpu_memory_in_mb until the xpu has " + "no memory left for the additional trunk. Note: if you set this " + "flag, the memory size set by " + "FLAGS_fraction_of_xpu_memory_to_use will be overridden by this " + "flag. If you don't set this flag, PaddlePaddle will use " + "FLAGS_fraction_of_xpu_memory_to_use to allocate xpu memory"); + +PADDLE_DEFINE_EXPORTED_uint64( + reallocate_xpu_memory_in_mb, + 0ul, + "If this flag is set, Paddle will reallocate the xpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_xpu_memory_to_use"); + +PADDLE_DEFINE_EXPORTED_uint64( + xpu_memory_limit_mb, + 0UL, + "The maximum xpu memory limit that the process can allocate. " + "If it is equal to 0, there would be no limit and all xpu memory " + "would be available to the process. If it is larger than 0, " + "the process would raise out of memory error if the allocated " + "memory exceeds the limit even though there is available " + "memory on the xpu card.
The unit is MB and default value is 0."); + +#endif + /** * Scope related FLAG * Name: local_exe_sub_scope_limit @@ -1027,6 +1077,10 @@ PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, true, "enable load_node_list_into_hbm, default true"); +PADDLE_DEFINE_EXPORTED_bool(enable_dump_main_program, + false, + "enable dump main program, default false"); + /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index ae5e5696a80f7..f6a89a0090e79 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -132,7 +132,8 @@ set(PYBIND_SRCS op_function5.cc op_function6.cc op_function7.cc - op_function8.cc) + op_function8.cc + ) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) @@ -206,6 +207,10 @@ if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() +if(WITH_XPU) + list(APPEND PYBIND_SRCS xpu_info_py.cc) +endif() + if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. if(WITH_ASCEND_CL) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 89a3904d0003f..5503c47197a26 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,7 +49,8 @@ void BindConstValue(pybind11::module* m) { .value("Loss", framework::OpRole::kLoss) .value("RPC", framework::OpRole::kRPC) .value("Dist", framework::OpRole::kDist) - .value("LRSched", framework::OpRole::kLRSched); + .value("LRSched", framework::OpRole::kLRSched) + .value("ScaleLr", framework::OpRole::kScaleLr); op_proto_and_checker_maker.def( "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2649165eb1d3d..cdb193b928490 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -147,8 +147,9 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +// #include "paddle/fluid/platform/device/xpu/xpu_info.h" +// #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#include "paddle/fluid/pybind/xpu_info_py.h" #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -531,6 +532,10 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif +#ifdef PADDLE_WITH_XPU + BindXPUInfo(&m); +#endif + BindImperative(&m); BindEager(&m); BindEagerStringTensor(&m); @@ -2420,7 +2425,10 @@ All parameter, weight, gradient are variables in Paddle. BindNeighborSampleResult(&m); BindGraphGpuWrapper(&m); #endif + #endif } + + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index ba646816126c4..7e226b48a1d08 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -719,6 +719,18 @@ void BindTensor(pybind11::module &m) { // NOLINT } return dst; }) + .def("copy_from", + [](framework::Tensor &self, const framework::Tensor &src) { + // follow fetch_op's inplementation + if (src.IsInitialized() && src.numel() > 0) { + TensorCopySync(src, src.place(), &self); + } else { + // Not copy, if the src tensor is empty. 
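+                // Mirroring fetch_op: an empty source leaves the destination
+                // as an empty tensor with shape {0} instead of copying; the
+                // LoD is still taken from the source below.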
+ self.clear(); + self.Resize({0}); + } + self.set_lod(src.lod()); + }) .def("_copy", [](const framework::Tensor &self, const platform::Place &place) { // follow fetch_op's inplementation diff --git a/paddle/fluid/pybind/xpu_info_py.cc b/paddle/fluid/pybind/xpu_info_py.cc new file mode 100644 index 0000000000000..5c19fb8e7baee --- /dev/null +++ b/paddle/fluid/pybind/xpu_info_py.cc @@ -0,0 +1,17 @@ +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/pybind/xpu_info_py.h" +#include "tuple" + +namespace paddle { +namespace pybind { + +void BindXPUInfo(py::module* m) { + py::class_(*m, "XPUMLHandler") + .def(py::init<>()) + .def("getMemoryUsageTuple", &platform::XPUMLHandler::getMemoryUsageTuple) + .def("getL3UsageTuple", &platform::XPUMLHandler::getL3UsageTuple); +} + +} // namespace pybind +} // namespace paddle +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/pybind/xpu_info_py.h b/paddle/fluid/pybind/xpu_info_py.h new file mode 100644 index 0000000000000..ffa716b7b9d0f --- /dev/null +++ b/paddle/fluid/pybind/xpu_info_py.h @@ -0,0 +1,18 @@ +#pragma once +#ifdef PADDLE_WITH_XPU +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindXPUInfo(py::module* m); + +} // namespace pybind +} // namespace paddle +#endif // PADDLE_WITH_XPU diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h index 1e2a20ebdf440..8a005cb93b7d4 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -54,6 +54,34 @@ extern void *cublasLt_dso_handle; // APIs available after CUDA 10.1 // #if CUDA_VERSION >= 10100 +#if CUDA_VERSION >= 11010 +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ + __macro(cublasLtMatmulAlgoCheck); \ + __macro(cublasLtGetCudartVersion); +#else #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ __macro(cublasLtDestroy); \ @@ -74,6 +102,7 @@ extern void *cublasLt_dso_handle; __macro(cublasLtMatrixTransformDescCreate); \ __macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescSetAttribute); +#endif CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/phi/backends/dynload/cuda_driver.cc b/paddle/phi/backends/dynload/cuda_driver.cc index 2bd0a7bfea5c1..d9fd89a0c65a6 100644 --- a/paddle/phi/backends/dynload/cuda_driver.cc +++ b/paddle/phi/backends/dynload/cuda_driver.cc @@ 
-24,6 +24,7 @@ void* cuda_dso_handle = nullptr; #if CUDA_VERSION >= 10020 CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); +CUDA_ROUTINE_EACH_CUDA_GRAPH(DEFINE_WRAP); #endif CUDA_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h index f743a33a1866f..ba771afe09023 100644 --- a/paddle/phi/backends/dynload/cuda_driver.h +++ b/paddle/phi/backends/dynload/cuda_driver.h @@ -72,7 +72,13 @@ extern bool HasCUDADriver(); __macro(cuMemRelease); \ __macro(cuMemAddressFree) +#define CUDA_ROUTINE_EACH_CUDA_GRAPH(__macro) \ + __macro(cuGraphNodeGetType); \ + __macro(cuGraphKernelNodeGetParams); \ + __macro(cuGraphExecKernelNodeSetParams) + CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); +CUDA_ROUTINE_EACH_CUDA_GRAPH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); #endif CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 8aa3b623273d7..9bd38a89ab177 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index 7b9004308e95b..3292beb037110 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -194,6 +194,19 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnGetStream); \ + __macro(cudnnReorderFilterAndBias); +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h index a9a166b289e33..e51bbf2154a17 100644 --- a/paddle/phi/backends/dynload/nvtx.h +++ b/paddle/phi/backends/dynload/nvtx.h @@ -42,6 +42,7 @@ extern void *nvtx_dso_handle; #define NVTX_ROUTINE_EACH(__macro) \ __macro(nvtxRangePushA); \ + __macro(nvtxRangePushEx); \ __macro(nvtxRangePop); NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index c62addfd257ab..2d527dd526a0e 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -14,6 +14,13 @@ #pragma once +#include // NOLINT + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + namespace phi { namespace backends { namespace gpu { @@ -24,7 +31,7 @@ namespace gpu { * [ Why need this macro? 
] * * The original looping in CUDA kernel is: - * + *p * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ * i += blockDim.x * gridDim.x)` * @@ -62,10 +69,37 @@ namespace gpu { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +template +cudaDataType_t ToCudaDataType() { + if (std::is_same::value) { + return CUDA_R_32F; + } else if (std::is_same::value) { + return CUDA_R_64F; + } else if (std::is_same::value) { + return CUDA_R_16F; +#if CUDA_VERSION >= 11000 + } else if (std::is_same::value) { + return CUDA_R_16BF; +#endif +#if CUDA_VERSION >= 11040 + } else if (std::is_same::value) { + return CUDA_R_8I; + } else if (std::is_same::value) { + return CUDA_R_32I; +#endif + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "DataType %d is unsupported for CUDA.", + paddle::experimental::CppTypeToDataType::Type())); + } +} } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 87d779f9194db..62082beac13a3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -57,6 +57,17 @@ limitations under the License. */ // TODO(phi): remove fluid header. #include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_ON_INFERENCE +PADDLE_DEFINE_EXPORTED_bool(enable_cublas_tf32_op_math, + false, + "enable tf32 for cublas."); +#else +PADDLE_DEFINE_EXPORTED_bool(enable_cublas_tf32_op_math, + true, + "enable tf32 for cublas."); +#endif +DECLARE_bool(enable_cublas_tensor_op_math); + namespace phi { namespace internal { @@ -216,6 +227,8 @@ struct GPUContext::Impl { stream_ = new CUDAStream(place_); InitEigenDevice(); InitDnnWorkspace(); + GetDnnHandle(); + GetBlasHandle(); } void PartialInitWithoutAllocator() { @@ -231,6 +244,8 @@ struct GPUContext::Impl { &max_threads_per_block_, &max_grid_dim_size_); stream_ = new CUDAStream(place_); + GetDnnHandle(); + GetBlasHandle(); } void PartialInitWithAllocator() { @@ -238,6 +253,8 @@ struct GPUContext::Impl { stream_owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); InitDnnWorkspace(); + GetDnnHandle(); + GetBlasHandle(); } explicit Impl(const GPUPlace& place) : place_(place) {} @@ -369,7 +386,7 @@ struct GPUContext::Impl { } #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { + if (FLAGS_enable_cublas_tensor_op_math && !blas_tensor_core_handle_) { if (!blas_tensor_core_handle_creator_) { phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); } else { @@ -380,7 +397,7 @@ struct GPUContext::Impl { } #endif #if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { + if (FLAGS_enable_cublas_tf32_op_math && !blas_tf32_tensor_core_handle_) { if (!blas_tf32_tensor_core_handle_creator_) { phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); } else { @@ -561,40 +578,6 @@ struct GPUContext::Impl { } inline void CublasCall(const std::function& callback) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) { - if (!blas_handle_creator_) { - phi::InitBlasHandle(&blas_handle_, stream()); - } else { - 
blas_handle_ = blas_handle_creator_(); - } - } -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { - if (!blas_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } else { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif -#if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { - if (!blas_tf32_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); - } else { - blas_tf32_tensor_core_handle_ = - blas_tf32_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); - } -#endif -#endif - }); if (blas_tf32_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tf32_mtx_); callback(blas_tf32_tensor_core_handle_); @@ -606,40 +589,6 @@ struct GPUContext::Impl { inline void TensorCoreCublasCallIfAvailable( const std::function& callback) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) { - if (!blas_handle_creator_) { - phi::InitBlasHandle(&blas_handle_, stream()); - } else { - blas_handle_ = blas_handle_creator_(); - } - } -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { - if (!blas_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } else { - blas_tensor_core_handle_ = blas_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif -#if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { - if (!blas_tf32_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); - } else { - blas_tf32_tensor_core_handle_ = - blas_tf32_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); - } -#endif -#endif - }); if (blas_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tensor_core_mtx_); callback(blas_tensor_core_handle_); @@ -716,6 +665,14 @@ struct GPUContext::Impl { } } } + // get workspace ptr + void* GetWorkSpacePtr(const size_t& len) { + if (workspace_ptr_ == nullptr || len > workspace_ptr_->size()) { + workspace_ptr_.reset(); + workspace_ptr_ = allocator_->Allocate(len); + } + return workspace_ptr_->ptr(); + } // use one flag for all handles? // they should be accessed consistently @@ -780,6 +737,8 @@ struct GPUContext::Impl { Allocator* allocator_{nullptr}; // external resource. // A internal resouce to initinalize eigen_device. 
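A short usage sketch for the new GetWorkSpacePtr above: the context keeps a single grow-only allocation, so the returned pointer should be re-fetched for every launch rather than cached, and callers on the same context share (and may reallocate) the same buffer. The kernel call below is a placeholder, not part of the patch:

#include "paddle/phi/backends/gpu/gpu_context.h"

// Sketch only: borrow context-level scratch space for one launch.
void RunWithScratch(const phi::GPUContext& dev_ctx, size_t num_elems) {
  size_t workspace_bytes = num_elems * sizeof(float);
  // Reuses the cached allocation when it is already large enough, otherwise
  // drops it and allocates a bigger one from the context allocator.
  void* workspace = dev_ctx.GetWorkSpacePtr(workspace_bytes);
  // MyScratchKernel<<<grid, block, 0, dev_ctx.stream()>>>(workspace, ...);
  (void)workspace;  // placeholder: a real kernel launch would consume this
}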
std::unique_ptr eigen_stream_{nullptr}; + // work space + phi::Allocator::AllocationPtr workspace_ptr_{nullptr}; }; GPUContext::GPUContext(GPUContext&&) = default; @@ -1000,4 +959,9 @@ void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } +// Get Work Space +void* GPUContext::GetWorkSpacePtr(const size_t& len) const { + return impl_->GetWorkSpacePtr(len); +} + } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 989bbbcbbf5f8..c76d8549c284c 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -199,6 +199,9 @@ class PADDLE_API GPUContext : public DeviceContext { // clear: whether clear the original CUDAStream or not void SetCUDAStream(CUDAStream*, bool clear = true); + // Get Work Space + void* GetWorkSpacePtr(const size_t& len) const; + protected: // NOTE: External users manage resources. Used in inference scenarios. // The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 552f60783c8b2..fd712baf75480 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -34,18 +34,16 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" -#ifdef __HIPCC__ -// HIP results in error or nan if > 256 -#define PREDEFINED_BLOCK_SIZE 256 -#else // CUDA performs better when thread_per_block is between [64, 512] #define PREDEFINED_BLOCK_SIZE 512 -#endif namespace phi { namespace backends { namespace gpu { +// Limitation of the setting in one dimension of cuda grid. +constexpr int kMultiDimslimit = 65536; + template inline T DivUp(T a, T b) { return (a + b - 1) / b; @@ -53,20 +51,21 @@ inline T DivUp(T a, T b) { // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 // for round integer value into next highest power of 2. -inline int64_t RoundToPowerOfTwo(int64_t n) { +inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) { n--; n |= (n >> 1); n |= (n >> 2); n |= (n >> 4); n |= (n >> 8); n |= (n >> 16); - int64_t min_val = 32; -#ifdef __HIPCC__ - int64_t max_val = 256; -#else + return std::max(min_val, (n + 1)); +} + +inline int64_t RoundToPowerOfTwo(int64_t n) { + constexpr int64_t min_val = 32; + int64_t num = RoundToNextHighPowOfTwo(n, min_val); int64_t max_val = 1024; -#endif - return std::min(max_val, std::max(min_val, (n + 1))); + return std::min(max_val, num); } #ifdef WITH_NV_JETSON @@ -162,8 +161,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, } inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, - int x_dim, - int y_dim) { + int64_t x_dim, + int64_t y_dim) { PADDLE_ENFORCE_GT( x_dim, 0, @@ -178,7 +177,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, y_dim)); const int kThreadsPerBlock = 256; - int block_cols = std::min(x_dim, kThreadsPerBlock); + int block_cols = std::min(x_dim, kThreadsPerBlock); int block_rows = std::max(kThreadsPerBlock / block_cols, 1); int max_physical_threads = context.GetMaxPhysicalThreadCount(); @@ -188,8 +187,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, // Noticed, block size is not align to 32, if needed do it yourself. 
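To make the new rounding helpers in gpu_launch_config.h below concrete, a small worked check; the values follow from the bit trick plus the 32/1024 clamps in RoundToPowerOfTwo (sketch only, not part of the patch):

#include <cassert>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"

void CheckRounding() {
  using phi::backends::gpu::RoundToNextHighPowOfTwo;
  using phi::backends::gpu::RoundToPowerOfTwo;
  assert(RoundToNextHighPowOfTwo(1) == 1);     // already a power of two
  assert(RoundToNextHighPowOfTwo(33) == 64);   // next power of two above 33
  assert(RoundToNextHighPowOfTwo(3, 8) == 8);  // min_val clamps small inputs
  assert(RoundToPowerOfTwo(5) == 32);          // lower bound of 32 threads
  assert(RoundToPowerOfTwo(5000) == 1024);     // capped at 1024 threads
}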
config.thread_per_block = dim3(block_cols, block_rows, 1); - int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks); - int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1)); + int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks); + int grid_y = std::min(max_blocks / grid_x, + std::max(y_dim / block_rows, 1)); config.block_per_grid = dim3(grid_x, grid_y, 1); return config; @@ -229,6 +229,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, return config; } +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; + grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2]; +} } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 0257139914384..2a8dbb85e8035 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -58,16 +58,20 @@ void InitGpuProperties(Place place, *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); // TODO(wilber): glog may be replaced in the future? - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " - << static_cast(place.device) - << ", GPU Compute Capability: " - << *compute_capability / 10 << "." - << *compute_capability % 10 - << ", Driver API Version: " << *driver_version / 1000 - << "." << (*driver_version % 100) / 10 - << ", Runtime API Version: " - << *runtime_version / 1000 << "." - << (*runtime_version % 100) / 10; + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place.device) + << ", GPU Compute Capability: " << *compute_capability / 10 << "." + << *compute_capability % 10 + << ", Driver API Version: " << *driver_version / 1000 << "." + << (*driver_version % 100) / 10 + << ", Runtime API Version: " << *runtime_version / 1000 << "." + << (*runtime_version % 100) / 10 << ", Build Date " +#ifdef PADDLE_BRANCH_NAME + << __DATE__ << " Time " << __TIME__ + << ", Git Version: " PADDLE_BRANCH_NAME ":" PADDLE_COMMIT_HASH; +#else + << __DATE__ << " Time " << __TIME__; +#endif #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h new file mode 100644 index 0000000000000..045fdf9daa568 --- /dev/null +++ b/paddle/phi/common/memory_utils.h @@ -0,0 +1,107 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
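For the LimitGridDim helper added to gpu_launch_config.h above, a hedged sketch of a typical launch site; the kernel is a placeholder, and because the grid is clamped the kernel is expected to cover the remaining work with a grid-stride loop:

#include "paddle/phi/backends/gpu/gpu_launch_config.h"

// Sketch only: clamp a 2-D grid to the device limits before launching.
void LaunchClamped(const phi::GPUContext& ctx, int rows, int cols) {
  dim3 block(256, 1, 1);
  dim3 grid((cols + block.x - 1) / block.x, rows, 1);
  // Without the clamp a very large `rows` can exceed the hardware limit on
  // grid.y (cf. kMultiDimslimit above) and the launch would fail.
  phi::backends::gpu::LimitGridDim(ctx, &grid);
  // MyElementwiseKernel<<<grid, block, 0, ctx.stream()>>>(rows, cols, ...);
}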
+ +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/stream.h" + +namespace phi { + +/* + NOTE(YuanRisheng) Why should we add the following code? + We need this because MemoryUtils::instance() is a singleton object and we + don't recommend using singleton object in kernels. So, we wrap it using a + function and if we delete this singleton object in future, it will be easy to + change code. +*/ + +namespace memory_utils { +class Buffer { + public: + explicit Buffer(const phi::Place& place) : place_(place) {} + + template + T* Alloc(size_t size) { + using AllocT = typename std:: + conditional::value, uint8_t, T>::type; + if (UNLIKELY(size == 0)) return nullptr; + size *= sizeof(AllocT); + if (allocation_ == nullptr || allocation_->size() < size) { + allocation_ = paddle::memory::Alloc(place_, size); + } + return reinterpret_cast(allocation_->ptr()); + } + + template + const T* Get() const { + return reinterpret_cast( + allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr); + } + + template + T* GetMutable() { + return reinterpret_cast( + allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr); + } + + size_t Size() const { return allocation_ ? allocation_->size() : 0; } + + phi::Place GetPlace() const { return place_; } + + private: + Allocator::AllocationPtr allocation_; + phi::Place place_; +}; + +template +struct ThrustAllocator { + typedef char value_type; + ThrustAllocator(phi::Place place, StreamType stream) { + place_ = place; + stream_ = stream; + } + ~ThrustAllocator() {} + char* allocate(std::ptrdiff_t num_bytes) { + auto storage = + paddle::memory::AllocShared(place_, + num_bytes, + phi::Stream(reinterpret_cast(stream_))); + char* ptr = reinterpret_cast(storage->ptr()); + busy_allocation_.emplace(std::make_pair(ptr, storage)); + return ptr; + } + void deallocate(char* ptr, size_t) { + allocation_map_type::iterator iter = busy_allocation_.find(ptr); + // CHECK(iter != busy_allocation_.end()); + busy_allocation_.erase(iter); + } + + private: + typedef std::unordered_map> + allocation_map_type; + allocation_map_type busy_allocation_; + phi::Place place_; + StreamType stream_; +}; + +} // namespace memory_utils + +} // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index 838f2dd265eb3..ad7a2b134a20c 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -21,21 +21,6 @@ namespace phi { namespace autotune { -// Define the cache key of operator -size_t ConvKey(const std::vector& x_dims, - const std::vector& w_dims, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - phi::DataType dtype) { - return GetKey(x_dims, - w_dims, - strides, - paddings, - dilations, - static_cast(dtype)); -} - size_t TransposeKey(const std::vector& x_dims, const std::vector& perm, phi::DataType dtype) { @@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() { cache_hits += v.second.CacheHits(); cache_misses += v.second.CacheMisses(); } + + for (auto& v : cudnn_auto_tune_map_) { + VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << 
v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } + total_size_ = size; total_cache_hits_ = cache_hits; total_cache_misses_ = cache_misses; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 1263cf40e567e..54c9508571c69 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -32,6 +32,7 @@ template inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) { std::hash hasher; *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); + *seed *= 0x00000100000001B3; HashCombine(seed, rest...); } @@ -41,7 +42,7 @@ namespace std { template struct hash> { std::size_t operator()(std::vector const& vec) const noexcept { - std::size_t seed = 0; + std::size_t seed = 0xcbf29ce484222325; for (auto val : vec) { HashCombine(&seed, val); } @@ -53,6 +54,16 @@ struct hash> { namespace phi { namespace autotune { +struct ConvAutoTuneResult { + ConvAutoTuneResult() {} + ConvAutoTuneResult(int64_t a, size_t size, bool search) + : algo(a), workspace_size(size), exhaustive_search(search) {} + + int64_t algo; + size_t workspace_size = 0; + bool exhaustive_search = false; +}; + template size_t GetKey(Args&&... args) { size_t seed = 0; @@ -60,24 +71,147 @@ size_t GetKey(Args&&... args) { return seed; } -// Define the cache key of operator -size_t ConvKey(const std::vector& x_dims, - const std::vector& w_dims, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - phi::DataType dtype); +struct ConvCacheKey { + ConvCacheKey() {} + ConvCacheKey(const std::vector& arg_x_dims, + const std::vector& arg_w_dims, + const std::vector& arg_strides, + const std::vector& arg_paddings, + const std::vector& arg_dilations, + phi::DataType arg_dtype, + int arg_groups, + int64_t arg_data_layout) + : x_dims(arg_x_dims), + w_dims(arg_w_dims), + strides(arg_strides), + paddings(arg_paddings), + dilations(arg_dilations), + dtype(arg_dtype), + groups(arg_groups), + data_layout(arg_data_layout) {} + size_t hash_value() const { + return GetKey(x_dims, + w_dims, + strides, + paddings, + dilations, + static_cast(dtype), + groups, + data_layout); + } + + std::vector x_dims; + std::vector w_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + phi::DataType dtype; + int groups; + int64_t data_layout; +}; + +struct ConvCacheKeyHash { + size_t operator()(const ConvCacheKey& cache) const { + return cache.hash_value(); + } +}; + +struct ConvCacheKeyEqual { + size_t operator()(const ConvCacheKey& first, + const ConvCacheKey& second) const { + if (first.x_dims != second.x_dims) return false; + if (first.w_dims != second.w_dims) return false; + if (first.strides != second.strides) return false; + if (first.paddings != second.paddings) return false; + if (first.dilations != second.dilations) return false; + if (first.dtype != second.dtype) return false; + if (first.groups != second.groups) return false; + if (first.data_layout != second.data_layout) return false; + + return true; + } +}; + +class CudnnAlgorithmsCacheMap { + public: + CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); } + + ConvAutoTuneResult Get(const ConvCacheKey& key) { + std::lock_guard lock(*cache_mutex_); + PADDLE_ENFORCE_NE( + hash_.find(key), + hash_.end(), + phi::errors::PreconditionNotMet("The key does not exist.")); + return hash_[key]; + } + + bool Find(const ConvCacheKey& key) { + bool ret = false; + 
std::lock_guard lock(*cache_mutex_); + if (hash_.find(key) != hash_.end()) { + cache_hits_++; + ret = true; + } else { + cache_misses_++; + } + return ret; + } + + void Clean() { + std::lock_guard lock(*cache_mutex_); + hash_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + + void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) { + std::lock_guard lock(*cache_mutex_); + if (hash_.size() > static_cast(1000000)) { + hash_.clear(); + } + hash_[key] = algo; + } + + int64_t CacheMisses() const { return cache_misses_; } + + int64_t CacheHits() const { return cache_hits_; } + + float CacheHitRate() const { + int64_t num_accesses = cache_hits_ + cache_misses_; + float cache_hit_rate = 0.; + if (num_accesses != 0) { + cache_hit_rate = + static_cast(cache_hits_) / static_cast(num_accesses); + } + return cache_hit_rate; + } + + int64_t Size() const { return hash_.size(); } + + private: + std::unordered_map + hash_; + std::shared_ptr cache_mutex_; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; size_t TransposeKey(const std::vector& x_dims, const std::vector& perm, phi::DataType dtype); - -template +template , + typename KeyEqualT = std::equal_to> class AlgorithmsCache { public: - AlgorithmsCache() : cache_mutex_(new std::mutex()) { hash_.clear(); } + AlgorithmsCache() : cache_mutex_(new std::mutex()) {} - AlgorithmT Get(size_t key) { + AlgorithmT Get(const KeyT& key) { std::lock_guard lock(*cache_mutex_); PADDLE_ENFORCE_NE( hash_.find(key), @@ -86,7 +220,7 @@ class AlgorithmsCache { return hash_[key]; } - bool Find(size_t key) { + bool Find(const KeyT& key) { bool ret = false; std::lock_guard lock(*cache_mutex_); if (hash_.find(key) != hash_.end()) { @@ -105,7 +239,7 @@ class AlgorithmsCache { cache_misses_ = 0; } - void Set(size_t key, AlgorithmT algo) { + void Set(const KeyT& key, AlgorithmT algo) { std::lock_guard lock(*cache_mutex_); hash_[key] = algo; } @@ -126,14 +260,43 @@ class AlgorithmsCache { int64_t Size() const { return hash_.size(); } - private: - std::unordered_map hash_; + protected: + std::unordered_map hash_; std::shared_ptr cache_mutex_; int64_t cache_hits_{0}; int64_t cache_misses_{0}; }; +template +class MatmulAlgorithmsCache : public AlgorithmsCache { + public: + MatmulAlgorithmsCache() : AlgorithmsCache() {} + + bool FindSubKey(const KeyT& sub_key) { + std::lock_guard lock(*(this->cache_mutex_)); + bool ret = (sub_hash_.find(sub_key) != sub_hash_.end()) ? true : false; + return ret; + } + + void SetSubKey(const KeyT& sub_key, void* algo) { + std::lock_guard lock(*(this->cache_mutex_)); + sub_hash_[sub_key] = algo; + } + + void* GetSubKey(const KeyT& sub_key) { + std::lock_guard lock(*(this->cache_mutex_)); + PADDLE_ENFORCE_NE( + sub_hash_.find(sub_key), + sub_hash_.end(), + phi::errors::PreconditionNotMet("The key does not exist.")); + return sub_hash_[sub_key]; + } + + private: + std::unordered_map sub_hash_; +}; + enum class AlgorithmType { kConvForward = 1, kConvBackwardData = 2, @@ -143,9 +306,13 @@ enum class AlgorithmType { }; // AlgorithmsConfigKey -> AlgorithmsID -using AlgorithmsCacheMap = AlgorithmsCache; +// (todo. 
hong) use cudnnConvolutionFwdAlgo_t +using AlgorithmsCacheMap = AlgorithmsCache; // AlgorithmType -> AlgorithmsCache using AlgorithmsTypeMap = std::unordered_map; +using CudnnAlgorithmsTypeMap = + std::unordered_map; +using MatmulAlgorithmsCacheMap = MatmulAlgorithmsCache; class AutoTuneCache { public: @@ -158,24 +325,22 @@ class AutoTuneCache { return auto_tune_map_[static_cast(algo_type)]; } - AlgorithmsCacheMap& GetConvForward() { - return Get(AlgorithmType::kConvForward); - } - - AlgorithmsCacheMap& GetConvBackwardData() { - return Get(AlgorithmType::kConvBackwardData); - } - - AlgorithmsCacheMap& GetConvBackwardFilter() { - return Get(AlgorithmType::kConvBackwardFilter); + CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) { + return cudnn_auto_tune_map_[static_cast(algo_type)]; } AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } + MatmulAlgorithmsCacheMap& GetMatmul() { return matmul_auto_tune_map_; } + void Clean() { for (auto& v : auto_tune_map_) { v.second.Clean(); } + + for (auto& v : cudnn_auto_tune_map_) { + v.second.Clean(); + } } void UpdateStatus(); @@ -206,14 +371,26 @@ class AutoTuneCache { void Register(const AlgorithmType& algo_type) { std::lock_guard lock(*autotune_cache_mutex_); - int64_t key = static_cast(algo_type); - if (auto_tune_map_.find(key) == auto_tune_map_.end()) { - AlgorithmsCacheMap cache; - auto_tune_map_[key] = cache; + if (algo_type == AlgorithmType::kConvForward || + algo_type == AlgorithmType::kConvBackwardData || + algo_type == AlgorithmType::kConvBackwardFilter) { + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + CudnnAlgorithmsCacheMap cache; + cudnn_auto_tune_map_[key] = cache; + } + } else { + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + AlgorithmsCacheMap cache; + auto_tune_map_[key] = cache; + } } } AlgorithmsTypeMap auto_tune_map_; + CudnnAlgorithmsTypeMap cudnn_auto_tune_map_; + MatmulAlgorithmsCacheMap matmul_auto_tune_map_; std::shared_ptr autotune_cache_mutex_; int64_t total_cache_hits_{0}; int64_t total_cache_misses_{0}; diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 53574c3d0c9ac..18454ad3e1997 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.GetConvForward(); + auto& cache = + autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; @@ -34,20 +35,24 @@ TEST(AlgosCache, AlgosCache) { std::vector dilations = {1, 1}; phi::DataType dtype = paddle::experimental::CppTypeToDataType::Type(); - auto key = phi::autotune::ConvKey( - x_shape, w_shape, paddings, strides, dilations, dtype); + phi::autotune::ConvCacheKey key( + x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0); EXPECT_EQ(cache.Find(key), false); - cache.Set(key, ConvAlgos::GEMMKernel); + phi::autotune::ConvAutoTuneResult node( + static_cast(ConvAlgos::GEMMKernel), 0, false); + cache.Set(key, node); EXPECT_EQ(cache.Size(), 1); EXPECT_EQ(cache.Find(key), true); auto algo = cache.Get(key); - EXPECT_EQ(algo, ConvAlgos::GEMMKernel); + EXPECT_EQ(algo.algo, ConvAlgos::GEMMKernel); x_shape = {4, 128, 128, 3}; - key = 
phi::autotune::ConvKey( - x_shape, w_shape, paddings, strides, dilations, dtype); - EXPECT_EQ(cache.Find(key), false); - cache.Set(key, ConvAlgos::CuDNNKernel_1); + phi::autotune::ConvCacheKey key1( + x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1); + EXPECT_EQ(cache.Find(key1), false); + phi::autotune::ConvAutoTuneResult node1( + static_cast(ConvAlgos::CuDNNKernel_1), 0, false); + cache.Set(key1, node1); EXPECT_EQ(cache.Size(), 2); EXPECT_EQ(cache.CacheHits(), 1); EXPECT_EQ(cache.CacheMisses(), 2); diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 509d824ca0553..459a701b5115b 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -14,6 +14,9 @@ #pragma once +#if defined(__NVCC__) +#include +#endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -55,6 +58,17 @@ struct CUBlas { PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasSgemv(args...)); } + template + static void GEMM_BATCH(ARGS... args) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasSgemmBatched(args...)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "SgemmBatched is not supported on cuda <= 7.5")); +#endif + } + template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 @@ -181,6 +195,17 @@ struct CUBlas { PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasDgemv(args...)); } + template + static void GEMM_BATCH(ARGS... args) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasDgemmBatched(args...)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "DgemmBatched is not supported on cuda <= 7.5")); +#endif + } + template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 @@ -235,40 +260,69 @@ struct CUBlas { }; template <> struct CUBlas { - //int8_t call func: - //CUBlas::GEMM_EX( - // &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, - // CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); + // int8_t call func: + // CUBlas::GEMM_EX( + // &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, + // CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, cublasOperation_t transb, int m, - int n, int k, const void *alpha, const void *A, - cudaDataType_t Atype, int lda, const void *B, - cudaDataType_t Btype, int ldb, const void *beta, void *C, - cudaDataType_t Ctype, int ldc, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + cudaDataType_t Atype, + int lda, + const void *B, + cudaDataType_t Btype, + int ldb, + const void *beta, + void *C, + cudaDataType_t Ctype, + int ldc, cudaDataType_t computeType) { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { - //algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - //VLOG(5) << "2. CUBlas int8_t, algo is CUBLAS_GEMM_DFALT_TENSOR_OP."; - algo = CUBLAS_GEMM_DFALT; // only for int8 gemm + // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + // VLOG(5) << "2. 
CUBlas int8_t, algo is CUBLAS_GEMM_DFALT_TENSOR_OP."; + algo = CUBLAS_GEMM_DFALT; // only for int8 gemm } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - VLOG(5) << "3. use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + VLOG(5) << "3. use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); algo = CUBLAS_GEMM_DFALT; #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc, computeType, algo)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); }); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -311,6 +365,69 @@ struct CUBlas { ldc)); } +#if defined(__NVCC__) + static void GEMM_BATCH(phi::GPUContext *dev_ctx, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, + const float16 **A, + cudaDataType_t Atype, + int lda, + const float16 **B, + cudaDataType_t Btype, + int ldb, + const float *beta, + float16 **C, + cudaDataType_t Ctype, + int ldc, + int batchCount, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmBatchedEx is not supported on cuda <= 7.5")); +#endif + } +#endif + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, @@ -961,20 +1078,20 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } -//int8_t matmul +// int8_t matmul template <> template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const int8_t *A, - const int8_t *B, - float beta, - float *C, int flag) const { +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const int8_t *A, + const int8_t *B, + float beta, + float *C, + int flag) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -985,7 +1102,8 @@ inline void Blas::GEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), 53, + context_.GetComputeCapability(), + 53, phi::errors::InvalidArgument( "cublas int8_t gemm requires GPU compute capability >= 53," "but received %d", @@ -1001,17 +1119,32 @@ inline void Blas::GEMM( // using tensor cores in volta GPUs. auto &cuda_ctx = const_cast(context_); VLOG(3) << "1. call int8_t GEMM_EX."; - CUBlas::GEMM_EX( - &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, - CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_8I, + ldb, + A, + CUDA_R_8I, + lda, + &h_beta, + C, + CUDA_R_32F, + N, + CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - //context_.CublasCall([&](cublasHandle_t handle) { - // CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - // &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, - // N); - //}); + // context_.CublasCall([&](cublasHandle_t handle) { + // CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + // &h_alpha, h_B, ldb, h_A, lda, &h_beta, + // h_C, N); + // }); #endif // CUDA_VERSION >= 8000 } @@ -1428,6 +1561,75 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -1708,6 +1910,97 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } +#if defined(__NVCC__) +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double **A, + const double **B, + double beta, + double **C, + int batchCount) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float **A, + const float **B, + float beta, + float **C, + int batchCount) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1721,10 +2014,45 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::dtype::float16 beta, phi::dtype::float16 **C, int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 53, + phi::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + float f_alpha = static_cast(alpha); + float f_beta = static_cast(beta); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_BATCH(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B, + CUDA_R_16F, + ldb, + A, + CUDA_R_16F, + lda, + &f_beta, + C, + CUDA_R_16F, + ldc, + batchCount, + CUDA_R_32F); } template <> @@ -1740,11 +2068,69 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 **C, int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? 
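// ---------------------------------------------------------------------------
// Why every wrapper above passes B before A and the dimensions as (N, M, K):
// cuBLAS is column-major while phi tensors are row-major, and a row-major
// M x N matrix is bit-identical to a column-major N x M matrix. So row-major
// C = A * B is computed as column-major C^T = B^T * A^T with ldc = N.
// Minimal float sketch under that assumption (valid handle, row-major device
// buffers d_A of M x K, d_B of K x N, d_C of M x N; names are illustrative):
#include <cublas_v2.h>

inline void RowMajorSgemmSketch(cublasHandle_t handle, int M, int N, int K,
                                const float* d_A, const float* d_B,
                                float* d_C) {
  const float alpha = 1.f;
  const float beta = 0.f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/N, /*n=*/M, /*k=*/K,
              &alpha, d_B, /*lda=*/N, d_A, /*ldb=*/K,
              &beta, d_C, /*ldc=*/N);
}
// ---------------------------------------------------------------------------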
CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float f_alpha = static_cast(alpha); + float f_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmBatchedEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 } +#endif template <> template diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index e322fba39a481..60d0b4ff3c0ef 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -999,6 +999,68 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = + transA ? rocblas_operation_none : rocblas_operation_transpose; + rocblas_operation cuTransB = + transB ? 
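// ---------------------------------------------------------------------------
// The fp16 batched path above insists on compute capability >= 53 and the
// bf16 paths on >= 80 plus a CUDA 11+ toolkit (CUDA_R_16BF). A small sketch
// of how such a guard can be derived straight from the device properties
// (assumes the current device is the one behind the GPU context; the function
// name is illustrative):
#include <cuda_runtime.h>

inline bool GpuSupportsBf16Gemm() {
  int dev = 0;
  cudaDeviceProp prop;
  if (cudaGetDevice(&dev) != cudaSuccess ||
      cudaGetDeviceProperties(&prop, dev) != cudaSuccess) {
    return false;
  }
  return prop.major >= 8;  // Ampere (SM 8.0) or newer.
}
// ---------------------------------------------------------------------------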
rocblas_operation_none : rocblas_operation_transpose; + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "rocblas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + C, + rocblas_datatype_bf16_r, + ldc, + rocblas_datatype_f32_r, + algo, + 0, + 0)); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -1128,6 +1190,159 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float16 alpha, + const float16 *A, + const float16 *B, + float16 beta, + float16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_hgemm_strided_batched( + handle, + cuTransB, + cuTransA, + N, + M, + K, + reinterpret_cast(&alpha), + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(&beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); + }); +} + +// note(wangran16): unknown bug. parameters dislocation when calling +// GEMM_STRIDED_BATCH and GEMM_STRIDED_BATCH +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float *A, + const float *B, + float beta, + float *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
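// ---------------------------------------------------------------------------
// Addressing used by the strided-batched rocBLAS/cuBLAS calls above: batch i
// reads A + i * strideA and B + i * strideB and writes C + i * strideC, which
// is why a contiguous row-major output simply uses strideC = M * N, e.g.
//
//   const float* A_i = A + static_cast<int64_t>(i) * strideA;
//   const float* B_i = B + static_cast<int64_t>(i) * strideB;
//   float*       C_i = C + static_cast<int64_t>(i) * M * N;
//
// (i, A, B, C and the strides refer to the surrounding BatchedGEMM arguments.)
// ---------------------------------------------------------------------------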
rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_sgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double *A, + const double *B, + double beta, + double *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_dgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); + }); +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h new file mode 100644 index 0000000000000..37229fc0daff1 --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -0,0 +1,1149 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + +#include "glog/logging.h" + +#include // NOLINT +#include "cuda.h" // NOLINT +#include "paddle/phi/backends/dynload/cublasLt.h" +#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/gpu_timer.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" + +DECLARE_int64(cublaslt_exhaustive_search_times); +#endif + +namespace phi { +namespace funcs { + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + +// Set this enum according to +// https://docs.nvidia.com/cuda/cublas/index.html#cublasltepilogue-t +// While kMatmul, kMatmulGrad, kMatmulGradWithoutBias share the same +// enum value, but if all elements for MatmulPlanner->GetKey() is same, +// no matter forward or backward, they could share the same descriptor +// cache, in that the descriptor is for description of matmul operation. 
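// ---------------------------------------------------------------------------
// Sketch of the caching idea described above: the descriptor cache is indexed
// by a single size_t derived from every field that changes the matmul
// configuration (shapes, transposes, dtype, fused type, ...). In this header
// phi::autotune::GetKey plays that role; the hash-combine below only
// illustrates the concept and is not the exact implementation.
#include <cstdint>
#include <functional>
#include <vector>

inline size_t MatmulKeySketch(const std::vector<int64_t>& x_dims,
                              const std::vector<int64_t>& y_dims,
                              bool trans_x, bool trans_y,
                              int dtype, int fused_type) {
  size_t seed = 0;
  auto combine = [&seed](size_t v) {
    seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  };
  for (int64_t d : x_dims) combine(std::hash<int64_t>()(d));
  for (int64_t d : y_dims) combine(std::hash<int64_t>()(d));
  combine(static_cast<size_t>(trans_x));
  combine(static_cast<size_t>(trans_y));
  combine(static_cast<size_t>(dtype));
  combine(static_cast<size_t>(fused_type));
  return seed;
}
// ---------------------------------------------------------------------------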
+enum MatmulFusedType { + kMatmul = 0, + kMatmulGrad = 1, + kMatmulGradWithoutBias = 2, + kMatmulBias = 3, + kMatmulRelu = 4, + kMatmulGelu = 5, + kMatmulBiasRelu = 6, + kMatmulBiasGelu = 7, + kMatmulBiasReluWithReservedData = 8, + kMatmulBiasGeluWithReservedData = 9, + kMatmulReluGrad = 10, + kMatmulGeluGrad = 11, + kMatmulBiasGradToA = 12, + kMatmulBiasGradToB = 13, +}; + +static cublasLtEpilogue_t ConvertFusedType(MatmulFusedType fused_type) { + static std::map fused_type_map = { + {MatmulFusedType::kMatmul, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulGrad, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulGradWithoutBias, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulBias, CUBLASLT_EPILOGUE_BIAS}, + {MatmulFusedType::kMatmulRelu, CUBLASLT_EPILOGUE_RELU}, + {MatmulFusedType::kMatmulGelu, CUBLASLT_EPILOGUE_GELU}, + {MatmulFusedType::kMatmulBiasRelu, CUBLASLT_EPILOGUE_RELU_BIAS}, + {MatmulFusedType::kMatmulBiasGelu, CUBLASLT_EPILOGUE_GELU_BIAS}, + {MatmulFusedType::kMatmulBiasReluWithReservedData, + CUBLASLT_EPILOGUE_RELU_AUX_BIAS}, + {MatmulFusedType::kMatmulBiasGeluWithReservedData, + CUBLASLT_EPILOGUE_GELU_AUX_BIAS}, +#if CUDA_VERSION >= 11060 + {MatmulFusedType::kMatmulReluGrad, CUBLASLT_EPILOGUE_DRELU}, + {MatmulFusedType::kMatmulGeluGrad, CUBLASLT_EPILOGUE_DGELU}, + {MatmulFusedType::kMatmulBiasGradToA, CUBLASLT_EPILOGUE_BGRADA}, + {MatmulFusedType::kMatmulBiasGradToB, CUBLASLT_EPILOGUE_BGRADB} +#endif + }; + + return fused_type_map[fused_type]; +} + +enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; + +template +struct FusedGEMMGradTrait; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = false; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = false; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = true; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = true; +}; + +// To tell any matmul or fused matmul operation from each 
other. +struct MatmulPlanner { + public: + const void* bias{nullptr}; + void* aux_data{nullptr}; + + MatmulPlanner() {} + MatmulPlanner(const std::vector& x_dims, + const std::vector& y_dims, + const bool trans_x, + const bool trans_y, + phi::DataType dtype, + MatmulFusedType fused_type, + const void* bias_data = nullptr, + void* reserve_data = nullptr, // Commonly for ReLu bit-mask. + bool use_addto = false, + bool no_exchange = true) + : bias(bias_data), aux_data(reserve_data), fused_type_(fused_type) { + use_addto_ = use_addto; + key_ = phi::autotune::GetKey(x_dims, + y_dims, + static_cast(trans_x), + static_cast(trans_y), + static_cast(dtype), + static_cast(fused_type_), + static_cast(use_addto_), + static_cast(no_exchange)); + } + + bool UseAddTo() const { return use_addto_; } + size_t GetKey() const { return key_; } + MatmulFusedType GetFusedType() const { return fused_type_; } + + size_t GenSubKey() const { return key_; } + + private: + MatmulFusedType fused_type_; + bool use_addto_; + size_t key_; +}; + +template +cublasComputeType_t GetCudaComputeType() { + if (std::is_same::value) { + return CUBLAS_COMPUTE_64F; + } else if (std::is_same::value) { + return CUBLAS_COMPUTE_32I; + } else { + return CUBLAS_COMPUTE_32F; + } +} + +struct MatmulDescriptor { + public: + cublasLtMatmulDesc_t op_desc{nullptr}; + cublasLtMatrixLayout_t x_desc{nullptr}; + cublasLtMatrixLayout_t y_desc{nullptr}; + cublasLtMatrixLayout_t out_desc{nullptr}; + cublasLtMatmulAlgo_t* algo{nullptr}; + bool is_cached{false}; + + MatmulDescriptor() {} + MatmulDescriptor(const MatmulDescriptor& obj) { + algo = obj.algo; + x_desc = obj.x_desc; + y_desc = obj.y_desc; + op_desc = obj.op_desc; + out_desc = obj.out_desc; + is_cached = obj.is_cached; + } + + MatmulDescriptor& operator=(const MatmulDescriptor& obj) { + algo = obj.algo; + x_desc = obj.x_desc; + y_desc = obj.y_desc; + op_desc = obj.op_desc; + out_desc = obj.out_desc; + is_cached = obj.is_cached; + + return *this; + } + + ~MatmulDescriptor() PADDLE_MAY_THROW { + if (!is_cached) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutDestroy(out_desc)); + delete algo; + + op_desc = nullptr; + x_desc = nullptr; + y_desc = nullptr; + out_desc = nullptr; + algo = nullptr; + } + } + + // x_desc, y_desc, op_desc are allocated in heap memory. 
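  // Lifetime note implied by is_cached: a descriptor starts out on the stack
  // inside DescriptorSetter; when auto-tuning promotes it, SetAlgo() marks it
  // is_cached = true and a shallow copy is stored in AutoTuneCache. Original
  // and copy then share the same cublasLt handles, so only descriptors that
  // were never cached release those handles in the destructor above.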
+ template + void Create(const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner, + const int batch_size = 1, + const int64_t stride_x = 0, + const int64_t stride_y = 0, + const int64_t stride_out = 0, + bool grad_for_dx = true) { + using MT = typename phi::dtype::MPTypeTrait::Type; + cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t out_mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); + cublasComputeType_t compute_type = GetCudaComputeType(); + + if (std::is_same::value) { + out_mat_type = phi::backends::gpu::ToCudaDataType(); + scale_type = phi::backends::gpu::ToCudaDataType(); + } + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t for + // details about defaults; just need to set the transforms for A and B + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); + SetFusedEpilogueOpDescriptor(planner, trans_x, trans_y, N); + + // Create matrix descriptors + CreateMatrixLayout(&x_desc, mat_type, M, K, trans_x); + CreateMatrixLayout(&y_desc, mat_type, K, N, trans_y); + CreateMatrixLayout(&out_desc, out_mat_type, M, N, false); + + // Config batch size and stride. + if (batch_size > 1) { + SetBatchAndStride(x_desc, batch_size, stride_x); + SetBatchAndStride(y_desc, batch_size, stride_y); + SetBatchAndStride(out_desc, batch_size, stride_out); + } + } + + cublasLtMatmulAlgo_t* SetAlgo() { + // while entering this function, the desc shall be cached. + is_cached = true; + algo = new cublasLtMatmulAlgo_t; + return algo; + } + + template + void SetFusedEpiloguePtr(phi::funcs::MatmulPlanner* planner) { + if (planner->bias != nullptr) { + const T* bias_data = static_cast(planner->bias); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias_data, + sizeof(bias_data))); + } + if (planner->aux_data != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &(planner->aux_data), + sizeof(planner->aux_data))); + } + } + + std::string GetDescResultString(std::string prefix, + bool has_algo = true) const { + std::ostringstream out; + out << prefix << " \n"; +#define GET_DESC_DATA_STRING(src) \ + do { \ + out << " " << #src << " = ["; \ + int num = sizeof((*src)) / sizeof(src->data[0]); \ + for (int i = 0; i < num; ++i) { \ + if (i == 0) { \ + out << src->data[i]; \ + } else { \ + out << ", " << src->data[i]; \ + } \ + } \ + out << "]\n"; \ + } while (0); + + if (has_algo) { + GET_DESC_DATA_STRING(algo); + } + GET_DESC_DATA_STRING(x_desc); + GET_DESC_DATA_STRING(y_desc); + GET_DESC_DATA_STRING(out_desc); + GET_DESC_DATA_STRING(op_desc); +#undef GET_DESC_DATA_STRING + return out.str(); + } + + void ExchangeXYDesc(bool no_exchange) {} + + protected: + void SetFusedEpilogueOpDescriptor(phi::funcs::MatmulPlanner* planner, + const bool trans_x, + const bool trans_y, + int64_t lead_dim) { + cublasOperation_t cublas_trans_x = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cublas_trans_y = trans_y ? 
CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &cublas_trans_x, + sizeof(cublas_trans_x))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_TRANSA, + &cublas_trans_y, + sizeof(cublas_trans_y))); + MatmulFusedType fused_type = planner->GetFusedType(); + if (fused_type != MatmulFusedType::kMatmul) { + cublasLtEpilogue_t cublaslt_fused_type = ConvertFusedType(fused_type); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE, + &cublaslt_fused_type, + sizeof(fused_type))); + } + if (planner->aux_data) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, + &lead_dim, + sizeof(lead_dim))); + } + } + + void CreateMatrixLayout(cublasLtMatrixLayout_t* desc, + cudaDataType type, + uint64_t rows, + uint64_t cols, + bool trans) { + if (trans) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutCreate(desc, type, rows, cols, rows)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutCreate(desc, type, cols, rows, cols)); + } + } + + void SetBatchAndStride(cublasLtMatrixLayout_t desc, + int batch_size, + int64_t stride) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &batch_size, + sizeof(batch_size))); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &stride, + sizeof(stride))); + } +}; + +struct MatmulGradDescriptor : MatmulDescriptor { + public: + MatmulGradDescriptor() {} + + template + void Create(const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner, + const int batch_size = 1, + int64_t stride_x = 0, + int64_t stride_y = 0, + int64_t stride_out = 0, + bool grad_for_dx = true) { + using MT = typename phi::dtype::MPTypeTrait::Type; + cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); + cublasComputeType_t compute_type = GetCudaComputeType(); + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); + this->SetFusedEpilogueOpDescriptor( + planner, trans_x, trans_y, TransX ? 
M : K); + + // Create operation desciriptor; see cublasLtMatmulDescAttributes_t for + // details about defaults; just need to set the transforms for A and B + this->CreateMatrixLayout(&x_desc, mat_type, N, M, true); + if (grad_for_dx) { + this->CreateMatrixLayout(&y_desc, mat_type, K, N, TransY); + this->CreateMatrixLayout( + &out_desc, phi::backends::gpu::ToCudaDataType(), M, K, TransX); + } else { + this->CreateMatrixLayout(&y_desc, mat_type, M, K, TransX); + this->CreateMatrixLayout( + &out_desc, phi::backends::gpu::ToCudaDataType(), K, N, TransY); + } + } + + void ExchangeXYDesc(bool no_exchange) { + if (no_exchange) { + return; + } + auto* temp = y_desc; + y_desc = x_desc; + x_desc = temp; + } +}; + +template +struct CublasLtBase { + public: + using MT = typename phi::dtype::MPTypeTrait::Type; + static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, + size_t workspace_size) { + return paddle::memory::Alloc( + ctx.GetPlace(), + workspace_size, + phi::Stream(reinterpret_cast(ctx.stream()))); + } + + static void RunImpl(const phi::GPUContext& ctx, + MatmulDescT* desc, + const size_t sub_key, + const T* x_ptr, + const T* y_ptr, + OutT* out_ptr, + phi::funcs::MatmulPlanner* planner) { + MT alpha = static_cast(1); + MT beta = planner->UseAddTo() ? static_cast(1) : static_cast(0); + cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); + + // NOTE(limingshu): As workspace_size varies from different DL framework, + // I wonder is there any smarter idea for workspace setting, currently I + // just followed the settings from the NVIDIA colleague`s setting. + size_t workspace_size = static_cast(4) * 1024 * 1024; + // phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, + // workspace_size); + void* workspace_ptr = ctx.GetWorkSpacePtr(workspace_size); + + if (planner != nullptr) { + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && + (!desc->is_cached)) { + SearchBestAlgo(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace_ptr, + workspace_size); + MatmulDescT* best_desc = new MatmulDescT(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } + } + + VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmul(cublaslt_handle, + desc->op_desc, + static_cast(&alpha), + y_ptr, + desc->y_desc, + x_ptr, + desc->x_desc, + static_cast(&beta), + out_ptr, + desc->out_desc, + out_ptr, + desc->out_desc, + desc->algo, + workspace_ptr, + workspace_size, + ctx.stream())); + } + + static void SearchBestAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescT* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size) { + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( + preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, + sizeof(workspace_size))); + + int returned_results = 0; + constexpr int requested_algo_count = 10; + std::vector heuristic_results( + requested_algo_count); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, + desc->op_desc, + 
desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + preference, + requested_algo_count, + heuristic_results.data(), + &returned_results)); + PADDLE_ENFORCE_GT(returned_results, + 0, + phi::errors::Unavailable("No GEMM algorithm avaliable.")); + int best_algo_idx = -1; + if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { + best_algo_idx = 0; + } else { + float min_time_cost = std::numeric_limits::max(); + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float cur_time_cost = + RunAndMeasureAlgo(ctx, + lt_handle, + desc, + alpha, + beta, + y_data, + x_data, + out_data, + workspace_ptr, + workspace_size, + &(heuristic_results[algo_idx].algo)); + VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx + << "] time: " << cur_time_cost << " s"; + + if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || + (cur_time_cost < min_time_cost)) { + best_algo_idx = algo_idx; + min_time_cost = cur_time_cost; + } + } + } + VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; + + cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); + *best_algo = heuristic_results[best_algo_idx].algo; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceDestroy(preference)); + } + + static float RunAndMeasureAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescT* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size, + cublasLtMatmulAlgo_t* algo) { + int repeats = FLAGS_cublaslt_exhaustive_search_times; + if (repeats <= 0) { + return std::numeric_limits::max(); + } + + phi::GpuTimer timer; + float time_cost = 0.f; + const auto& stream = ctx.stream(); + + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, + desc->op_desc, + alpha, + y_data, + desc->y_desc, + x_data, + desc->x_desc, + beta, + out_data, + desc->out_desc, + out_data, + desc->out_desc, + algo, + workspace_ptr, + workspace_size, + stream)); + timer.Stop(stream); + ctx.Wait(); + auto time = timer.ElapsedTime(); + if (i > 0) { + // Exclude the warmup runtime. + time_cost += time; + } + } + return (time_cost / (repeats - 1)); + } +}; + +template <> +struct CublasLtBase { + public: + static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, + size_t workspace_size) { + return paddle::memory::Alloc( + ctx.GetPlace(), + workspace_size, + phi::Stream(reinterpret_cast(ctx.stream()))); + } + + static void RunImpl(const phi::GPUContext& ctx, + MatmulDescriptor* desc, + const size_t sub_key, + const int8_t* x_ptr, + const int8_t* y_ptr, + int32_t* out_ptr, + phi::funcs::MatmulPlanner* planner) { + int32_t alpha = 1; + int32_t beta = + planner->UseAddTo() ? 
static_cast(1) : static_cast(0); + cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); + + size_t workspace_size = static_cast(4) * 1024 * 1024; + // phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, + // workspace_size); + void* workspace_ptr = ctx.GetWorkSpacePtr(workspace_size); + + if (planner != nullptr) { + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && + (!desc->is_cached)) { + SearchBestAlgo(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace_ptr, + workspace_size); + MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } + } + + VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmul(cublaslt_handle, + desc->op_desc, + static_cast(&alpha), + y_ptr, + desc->y_desc, + x_ptr, + desc->x_desc, + static_cast(&beta), + out_ptr, + desc->out_desc, + out_ptr, + desc->out_desc, + desc->algo, + workspace_ptr, + workspace_size, + ctx.stream())); + } + + static void SearchBestAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescriptor* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size) { + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( + preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, + sizeof(workspace_size))); + + int returned_results = 0; + constexpr int requested_algo_count = 10; + std::vector heuristic_results( + requested_algo_count); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, + desc->op_desc, + desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + preference, + requested_algo_count, + heuristic_results.data(), + &returned_results)); + PADDLE_ENFORCE_GT(returned_results, + 0, + phi::errors::Unavailable("No GEMM algorithm avaliable.")); + int best_algo_idx = -1; + if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { + best_algo_idx = 0; + } else { + float min_time_cost = std::numeric_limits::max(); + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float cur_time_cost = + RunAndMeasureAlgo(ctx, + lt_handle, + desc, + alpha, + beta, + y_data, + x_data, + out_data, + workspace_ptr, + workspace_size, + &(heuristic_results[algo_idx].algo)); + VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx + << "] time: " << cur_time_cost << " s"; + + if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || + (cur_time_cost < min_time_cost)) { + best_algo_idx = algo_idx; + min_time_cost = cur_time_cost; + } + } + } + VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; + + cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); + *best_algo = heuristic_results[best_algo_idx].algo; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceDestroy(preference)); + } + + static float RunAndMeasureAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescriptor* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + 
void* workspace_ptr, + size_t workspace_size, + cublasLtMatmulAlgo_t* algo) { + int repeats = FLAGS_cublaslt_exhaustive_search_times; + if (repeats <= 0) { + return std::numeric_limits::max(); + } + + phi::GpuTimer timer; + float time_cost = 0.f; + const auto& stream = ctx.stream(); + + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, + desc->op_desc, + alpha, + y_data, + desc->y_desc, + x_data, + desc->x_desc, + beta, + out_data, + desc->out_desc, + out_data, + desc->out_desc, + algo, + workspace_ptr, + workspace_size, + stream)); + timer.Stop(stream); + ctx.Wait(); + auto time = timer.ElapsedTime(); + if (i > 0) { + // Exclude the warmup runtime. + time_cost += time; + } + } + return (time_cost / (repeats - 1)); + } +}; + +// To judge if desc is cached or not. +template +struct DescriptorSetter { + public: + DescT desc; + size_t sub_key{std::numeric_limits::min()}; + + DescriptorSetter(phi::funcs::MatmulPlanner* planner, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + const int batch_size = 1, + int64_t stride_x = 0, + int64_t stride_y = 0, + int64_t stride_out = 0, + const bool no_exchange = true, + bool grad_for_dx = true) { + if (std::is_same::value) { + if (!trans_x && !trans_y) { + PADDLE_ENFORCE_EQ( + (N % 4 == 0 || N == 1), + true, + phi::errors::InvalidArgument( + "The dimension size N used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + N)); + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } else if (!trans_x && trans_y) { + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } else if (trans_x && !trans_y) { + PADDLE_ENFORCE_EQ( + (M % 4 == 0 || M == 1), + true, + phi::errors::InvalidArgument( + "The dimension size M used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + M)); + PADDLE_ENFORCE_EQ( + (N % 4 == 0 || N == 1), + true, + phi::errors::InvalidArgument( + "The dimension size N used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + N)); + } else { + PADDLE_ENFORCE_EQ( + (M % 4 == 0 || M == 1), + true, + phi::errors::InvalidArgument( + "The dimension size M used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + M)); + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } + } + + if (planner != nullptr) { + sub_key = planner->GenSubKey(); + } + + auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + if (mamtul_cache.FindSubKey(sub_key)) { + desc = *(reinterpret_cast(mamtul_cache.GetSubKey(sub_key))); + desc.template SetFusedEpiloguePtr(planner); + VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] "); + } else { + desc.template Create(M, + N, + K, + trans_x, + trans_y, + planner, + 
batch_size, + stride_x, + stride_y, + stride_out, + grad_for_dx); + desc.ExchangeXYDesc(no_exchange); + if (planner != nullptr) { + desc.template SetFusedEpiloguePtr(planner); + } + VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false); + } + } +}; + +// For matmul with kernels autotune +template +struct MatmulWithCublasLt : public CublasLtBase { + public: + static void Run(const phi::GPUContext& ctx, + const T* x_data, + const T* y_data, + OutT* out_data, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner = nullptr) { + auto setter = DescriptorSetter( + planner, M, N, K, trans_x, trans_y); + CublasLtBase::RunImpl( + ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + } + + static void RunWithBatch(const phi::GPUContext& ctx, + const T* x_data, + const T* y_data, + OutT* out_data, + const int64_t M, + const int64_t N, + const int64_t K, + bool trans_x, + bool trans_y, + int batch_size, + int64_t stride_x, + int64_t stride_y, + int64_t stride_out, + phi::funcs::MatmulPlanner* planner = nullptr) { + auto setter = DescriptorSetter(planner, + M, + N, + K, + trans_x, + trans_y, + batch_size, + stride_x, + stride_y, + stride_out); + CublasLtBase::RunImpl( + ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + } + + static void RunWithBatch(const phi::GPUContext& ctx, + const T** x_data, + const T** y_data, + OutT** out_data, + const int64_t M, + const int64_t N, + const int64_t K, + bool trans_x, + bool trans_y, + int batch_size, + phi::funcs::MatmulPlanner* planner = nullptr) { + for (int i = 0; i < batch_size; ++i) { + Run(ctx, + x_data[i], + y_data[i], + out_data[i], + M, + N, + K, + trans_x, + trans_y, + planner); + } + } +}; + +// As for just Linear fused ephilogue below: out = matmul(x, y) + bias. +template +struct LinearWithCublasLt : public CublasLtBase { + static void Run(const phi::GPUContext& ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* out, + const void* bias_data, + void* reserve_data, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + const MatmulFusedType fused_type) { + auto planner = phi::funcs::MatmulPlanner( + vectorize(x->dims()), + vectorize(y->dims()), + trans_x, + trans_y, + paddle::experimental::CppTypeToDataType::Type(), + fused_type, + bias_data, + reserve_data); + auto setter = DescriptorSetter( + &planner, M, N, K, trans_x, trans_y); + CublasLtBase::RunImpl(ctx, + &setter.desc, + setter.sub_key, + x->data(), + y->data(), + out->data(), + &planner); + } +}; + +template +struct LinearGradWithCublasLt : public CublasLtBase { + static void Run( + const phi::GPUContext& ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* out, + const void* bias_data, + void* reserve_data, + const int64_t M, + const int64_t N, + const int64_t K, + const MatmulFusedType fused_type, + const bool trans_x, + const bool trans_y, + const bool use_addto, + const bool no_exchange, // exchange x_desc and y_desc for grad. 
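// ---------------------------------------------------------------------------
// Hedged usage sketch for the MatmulWithCublasLt::Run entry point defined
// above. dev_ctx, x, y, out and M/N/K are assumed to exist in the calling
// kernel, and the explicit template argument and FLOAT16 dtype are
// illustrative; the argument order follows the signatures in this header.
//
//   phi::funcs::MatmulPlanner planner(phi::vectorize(x.dims()),
//                                     phi::vectorize(y.dims()),
//                                     /*trans_x=*/false, /*trans_y=*/false,
//                                     phi::DataType::FLOAT16,
//                                     phi::funcs::MatmulFusedType::kMatmul);
//   phi::funcs::MatmulWithCublasLt<phi::dtype::float16>::Run(
//       dev_ctx,
//       x.data<phi::dtype::float16>(),
//       y.data<phi::dtype::float16>(),
//       out->data<phi::dtype::float16>(),
//       M, N, K, /*trans_x=*/false, /*trans_y=*/false, &planner);
// ---------------------------------------------------------------------------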
+ bool grad_for_dx = true) { + auto planner = phi::funcs::MatmulPlanner( + vectorize(x->dims()), + vectorize(y->dims()), + trans_x, + trans_y, + paddle::experimental::CppTypeToDataType::Type(), + fused_type, + bias_data, + reserve_data, + use_addto, + no_exchange); + auto setter = + DescriptorSetter( + &planner, + M, + N, + K, + trans_x, + trans_y, + /*batch_size=*/1, + /*stride_x=*/0, + /*stride_y=*/0, + /*stride_out=*/0, + /*exchange_x_y_desc=*/no_exchange, + /*grad_for_dx=*/grad_for_dx); + + // To setting data type for different kinda out_data. + if (grad_for_dx) { + CublasLtBase::RunImpl( + ctx, + &setter.desc, + setter.sub_key, + no_exchange ? x->data() : y->data(), + no_exchange ? y->data() : x->data(), + out->data(), + &planner); + } else { + CublasLtBase::RunImpl( + ctx, + &setter.desc, + setter.sub_key, + no_exchange ? x->data() : y->data(), + no_exchange ? y->data() : x->data(), + out->data(), + &planner); + } + } +}; +#else +// A void structure just for successfully compile. +struct MatmulPlanner {}; +#endif // (PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 0458f0d83ed1a..1b1814ec0ae2b 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -101,7 +101,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, gpu_type); }); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCsrSetStridedBatch( *descriptor, batch_size, M + 1, batch_nnz); @@ -109,7 +109,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } } @@ -155,7 +155,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, }); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCooSetStridedBatch( *descriptor, batch_size, batch_nnz); @@ -163,7 +163,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } } @@ -241,7 +241,7 @@ class CuSparseDnMatDescriptor { PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseDnMatSetStridedBatch( descriptor_, batch_size, M * N); @@ -249,7 +249,7 @@ class CuSparseDnMatDescriptor { #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseDnMatSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_; @@ -379,7 +379,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif &buffer_size); }); @@ -395,7 +399,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else 
CUSPARSE_MV_ALG_DEFAULT, +#endif tmp_buffer_ptr); }); } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index 7ecf352ffe996..700ce21caf2ba 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -14,9 +14,7 @@ #include "paddle/phi/kernels/graph_send_recv_kernel.h" -#include -#include - +#include "paddle/phi/kernels/funcs/math_function.h" #include #include @@ -59,17 +57,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, cudaMemset(p_output, 0, memset_bytes); #endif } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, - p_output_ptr, - p_output_ptr + memset_size, - std::numeric_limits::min()); + phi::funcs::set_constant(ctx, out, std::numeric_limits::min()); } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, - p_output_ptr, - p_output_ptr + memset_size, - std::numeric_limits::max()); + phi::funcs::set_constant(ctx, out, std::numeric_limits::max()); } if (index_size == 0) return; diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 2753937eb7142..b6c13360cd404 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/matmul_grad_kernel.h" - #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" +#include "paddle/phi/kernels/matmul_grad_kernel.h" PD_REGISTER_KERNEL(matmul_grad, GPU, diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index e5de7966c2ec4..32d70ae0763f0 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/matmul_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" +#include "paddle/phi/kernels/matmul_kernel.h" PD_REGISTER_KERNEL(matmul, GPU, diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 3d44c9af03c07..c52555c38e5a3 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -21,12 +21,13 @@ #include #include #include - +#include #include #include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "cub/cub.cuh" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/unique_functor.h" @@ -194,22 +195,29 @@ static void UniqueFlattendCUDATensor(const Context& context, indices->Resize(phi::make_ddim({num_input})); auto* indices_data = context.template Alloc(indices); - thrust::sequence(thrust::device, indices_data, indices_data + num_input); +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif + + thrust::sequence(exec_policy, indices_data, indices_data + num_input); thrust::sort_by_key( - thrust::device, in_data_hat, in_data_hat + num_input, indices_data); + exec_policy, in_data_hat, in_data_hat + num_input, indices_data); // 1. Calculate op result: 'out' DenseTensor range; range.Resize(phi::make_ddim({num_input + 1})); auto* range_data_ptr = context.template Alloc(&range); - thrust::sequence( - thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + thrust::sequence(exec_policy, range_data_ptr, range_data_ptr + num_input + 1); phi::Copy(context, in_hat, context.GetPlace(), false, out); int num_out; auto out_data = context.template Alloc(out); num_out = thrust::unique_by_key( - thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + exec_policy, out_data, out_data + num_input, range_data_ptr, equal) .first - out_data; out->Resize(phi::make_ddim({num_out})); @@ -221,18 +229,32 @@ static void UniqueFlattendCUDATensor(const Context& context, DenseTensor inv_loc; inv_loc.Resize(phi::make_ddim({num_input})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, in_data_hat, in_data_hat + num_input, inv_loc_data_ptr, not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - inv_loc_data_ptr); - thrust::scatter(thrust::device, +#ifdef PADDLE_WITH_HIP + hipMemset(inv_loc_data_ptr, 0, sizeof(IndexT)); +#else + cudaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); +#endif + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(NULL, + temp_storage_bytes, + inv_loc_data_ptr, + inv_loc_data_ptr, + num_input, + context.stream()); + auto d_temp_storage = + paddle::memory::Alloc(context.GetPlace(), temp_storage_bytes); + cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + inv_loc_data_ptr, + 
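// ---------------------------------------------------------------------------
// The cub::DeviceScan call above follows CUB's two-phase protocol: call once
// with a null temp-storage pointer to obtain the workspace size, allocate
// that many bytes on the device, then repeat the identical call to run the
// scan. Standalone sketch (cudaMalloc stands in for paddle::memory::Alloc;
// the function name is illustrative):
#include <cuda_runtime.h>
#include <cub/cub.cuh>

inline void InclusiveSumSketch(int* d_data, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: only reports the required temporary-storage size.
  cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_data, d_data, n, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: the actual in-place inclusive scan on the stream.
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_data, d_data, n, stream);
  cudaFree(d_temp);
}
// ---------------------------------------------------------------------------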
inv_loc_data_ptr, + num_input, + context.stream()); + thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + num_input, indices_data, @@ -244,11 +266,11 @@ static void UniqueFlattendCUDATensor(const Context& context, DenseTensor tmp_indices; tmp_indices.Resize(phi::make_ddim({num_input})); auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); - thrust::copy(thrust::device, + thrust::copy(exec_policy, in_data_hat, in_data_hat + num_input, tmp_indices_data_ptr); - thrust::unique_by_key(thrust::device, + thrust::unique_by_key(exec_policy, tmp_indices_data_ptr, tmp_indices_data_ptr + num_input, indices_data, @@ -261,10 +283,10 @@ static void UniqueFlattendCUDATensor(const Context& context, counts->Resize(phi::make_ddim({num_out})); auto count_data = context.template Alloc(counts); // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::fill(exec_policy, count_data, count_data + num_out, 0); thrust::device_ptr range_data_ptr_dev(range_data_ptr); range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, range_data_ptr + 1, range_data_ptr + num_out + 1, count_data); @@ -290,24 +312,29 @@ static void ComputeUniqueDims(const Context& context, equal_T equal, not_equal_T not_equal, int64_t row) { +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif // 1. inverse indices: 'inverse' inverse->Resize(phi::make_ddim({row})); auto* inverse_data = context.template Alloc(inverse); DenseTensor inv_loc; inv_loc.Resize(phi::make_ddim({row})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, sorted_indices_data, sorted_indices_data + row, inv_loc_data_ptr, not_equal); thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - inv_loc_data_ptr); - thrust::scatter(thrust::device, + thrust::inclusive_scan( + exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + row, inv_loc_data_ptr); + thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + row, sorted_indices_data, @@ -317,9 +344,9 @@ static void ComputeUniqueDims(const Context& context, DenseTensor range; range.Resize(phi::make_ddim({row + 1})); auto range_data_ptr = context.template Alloc(&range); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + thrust::sequence(exec_policy, range_data_ptr, range_data_ptr + row + 1); int num_out; - num_out = thrust::unique_by_key(thrust::device, + num_out = thrust::unique_by_key(exec_policy, sorted_indices_data, sorted_indices_data + row, range_data_ptr, @@ -333,9 +360,9 @@ static void ComputeUniqueDims(const Context& context, // 3. 
counts: 'counts' counts->Resize(phi::make_ddim({num_out})); auto* count_data = context.template Alloc(counts); - thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::fill(exec_policy, count_data, count_data + row, 0); thrust::adjacent_difference( - thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); + exec_policy, range_data_ptr + 1, range_data_ptr + row + 1, count_data); } // Calculate unique when 'axis' is set @@ -384,9 +411,15 @@ static void UniqueDimsCUDATensor(const Context& context, // 2. Calculate 'indices', 'inverse', 'counts' // Init index and sort - thrust::sequence( - thrust::device, sorted_indices_data, sorted_indices_data + row); - thrust::sort(thrust::device, +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif + thrust::sequence(exec_policy, sorted_indices_data, sorted_indices_data + row); + thrust::sort(exec_policy, sorted_indices_data, sorted_indices_data + row, LessThan(col, in_trans_data)); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index ef70907b59a61..e61f58450b34f 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -254,6 +254,8 @@ void ConvCudnnGradGradKernel( auto dtype = paddle::platform::CudnnDataType::type; auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); paddle::operators::ConvArgs args1{&transformed_ddX, W, @@ -261,28 +263,36 @@ void ConvCudnnGradGradKernel( strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args2{&transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args3{&transformed_ddX, dW, &transformed_dO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args4{&transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult fwd_result1; @@ -298,9 +308,6 @@ void ConvCudnnGradGradKernel( filter_result; #endif - auto layout = paddle::platform::GetCudnnTensorFormat( - paddle::platform::DataLayout::kNCHW); - // ddo = conv(ddI, W) + conv(I, ddW) size_t workspace_size = 0; @@ -329,7 +336,7 @@ void ConvCudnnGradGradKernel( #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -357,7 +364,7 @@ void ConvCudnnGradGradKernel( #else using search2 = paddle::operators::SearchAlgorithm; - fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif @@ -387,7 +394,7 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; filter_result = 
- search3::Find(args3, exhaustive_search, deterministic, ctx); + search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -417,7 +424,7 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; data_result = - search4::Find(args4, exhaustive_search, deterministic, ctx); + search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 4e9c37879c002..2d61ec6e62c9c 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -251,27 +251,33 @@ void ConvCudnnGradKernel(const Context& ctx, T* input_grad_data = nullptr; T* transformed_input_grad_data = nullptr; + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + paddle::operators::ConvArgs args1{&transformed_input_grad, &transformed_filter_channel, &transformed_output_grad_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + layout}; paddle::operators::ConvArgs args2{&transformed_input, &transformed_filter_grad_channel, &transformed_output_grad_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + layout}; auto handle = ctx.cudnn_handle(); // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout - paddle::platform::DataLayout layout = - compute_format == paddle::platform::DataLayout::kNHWC - ? paddle::platform::DataLayout::kNHWC - : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { layout = compute_format == paddle::platform::DataLayout::kNHWC ? 
paddle::platform::DataLayout::kNDHWC @@ -367,9 +373,8 @@ void ConvCudnnGradKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max( - workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo)); + bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); + workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); #endif } @@ -397,11 +402,10 @@ void ConvCudnnGradKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; filter_result = - search2::Find(args2, exhaustive_search, deterministic, ctx); + search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " << filter_result.time; - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo)); + workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size); #endif } diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index bd95a32bc724f..80544025ff738 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -56,8 +56,7 @@ void ConvCudnnKernel(const Context& ctx, bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, false, phi::errors::InvalidArgument( "Cann't set exhaustive_search True and " @@ -213,7 +212,9 @@ void ConvCudnnKernel(const Context& ctx, strides, padding_common, dilations, - dtype}; + dtype, + groups, + compute_format}; auto handle = ctx.cudnn_handle(); auto workspace_handle = ctx.cudnn_workspace_handle(); @@ -313,8 +314,8 @@ void ConvCudnnKernel(const Context& ctx, paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, fwd_result.algo); + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); + workspace_size = fwd_result.workspace_size; #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 0ce16f66becfa..36a3caf97eb94 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -179,14 +179,18 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + layout}; paddle::operators::ConvArgs args2{&transformed_dout, &filter, &x_transpose, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + layout}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult fwd_result; @@ -226,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result = search1::Find(args1, false, deterministic, ctx); + fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif @@ -253,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else 
using search2 = paddle::operators::SearchAlgorithm; - filter_result = search2::Find(args2, false, deterministic, ctx); + filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif @@ -625,6 +629,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( auto dtype = paddle::platform::CudnnDataType::type; auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); paddle::operators::ConvArgs args1{&transformed_ddout_channel, &filter, @@ -632,14 +637,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args2{&transformed_ddout_channel, &ddfilter, &transformed_x, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args3{&transformed_dout, dfilter, @@ -647,14 +656,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args4{&transformed_dout, &ddfilter, &transformed_dx_channel, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult bwd_result1; paddle::operators::SearchResult bwd_result2; @@ -669,8 +682,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::operators::SearchResult fwd_result; #endif - auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); - // ddo = conv(ddI, filter) + conv(I, ddfilter) size_t workspace_size = 0; @@ -699,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result1 = search1::Find(args1, false, deterministic, ctx); + bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -723,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search2 = paddle::operators::SearchAlgorithm; - bwd_result2 = search2::Find(args2, false, deterministic, ctx); + bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif @@ -750,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search3 = paddle::operators::SearchAlgorithm; - filter_result = search3::Find(args3, false, deterministic, ctx); + filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -778,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search4 = paddle::operators::SearchAlgorithm; - fwd_result = search4::Find(args4, false, deterministic, ctx); + fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 58ead4c3287f8..5aa7bd60a0aa8 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -205,7 +205,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, strides, padding_common, dilations_, - dtype}; + 
dtype, + groups, + data_layout}; args.handle = handle; args.idesc.set(transformed_out, iwo_groups); args.wdesc.set(filter, layout_tensor, iwo_groups); @@ -228,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - bwd_result = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index be32f85fe99a4..6c75ab86d7c4c 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -100,7 +100,7 @@ void MatMul(const Context& dev_ctx, const DenseTensor& b, bool trans_b, DenseTensor* out, - bool flag = false) { + bool flag) { dev_ctx.template Alloc(out); auto blas = phi::funcs::GetBlas(dev_ctx); auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -120,6 +120,32 @@ void MatMul(const Context& dev_ctx, dev_ctx.template Alloc(out), static_cast(flag)); } +template +void MatMul(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + const DenseTensor& b, + bool trans_b, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto blas = phi::funcs::GetBlas(dev_ctx); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); + if (a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a.data(), + mat_dim_a, + b.data(), + mat_dim_b, + static_cast(1), + dev_ctx.template Alloc(out), + static_cast(false)); +} /** * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index 99257ce4a6adf..6e2e8e3634c6e 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -478,12 +478,24 @@ void MatMulFunction(const Context& dev_ctx, DenseTensor* Out, bool trans_x, bool trans_y, - bool flag = false) { + bool flag) { const std::vector x_dims = vectorize(X.dims()); const std::vector y_dims = vectorize(Y.dims()); MatMulFunction( dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); } +template +void MatMulFunction(const Context& dev_ctx, + const DenseTensor& X, + const DenseTensor& Y, + DenseTensor* Out, + bool trans_x, + bool trans_y) { + const std::vector x_dims = vectorize(X.dims()); + const std::vector y_dims = vectorize(Y.dims()); + MatMulFunction( + dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, false); +} template void MatmulKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index a8e88f351ccbc..389737037a38e 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/coalesce_kernel.h" - +#include +#include #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 1eae4be579aa7..9c35964587cd9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -27,6 +27,7 @@ from .lamb_optimizer import LambOptimizer from .fp16_allreduce_optimizer import FP16AllReduceOptimizer from .sharding_optimizer import ShardingOptimizer +from .sharding_optimizer import ThreadShardingOptimizer from .dygraph_optimizer import HybridParallelOptimizer from .dygraph_optimizer import HeterParallelOptimizer from .dygraph_optimizer import HybridParallelGradScaler diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py index 7002dfa2be514..33c4d01e4daea 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py @@ -51,6 +51,7 @@ def has_var(self, var_name): self._var_device_id(var_name) == self.worker_idx def _split_params(self, params_grads, worker_idx, worker_num): + """ param2device = {} total_param_mem = 0.0 param2mem = [] @@ -62,12 +63,29 @@ def _split_params(self, params_grads, worker_idx, worker_num): device_idx = 0 mem_accu = 0.0 for param_name, mem in param2mem: - if mem_accu > total_param_mem * 1.0 * (device_idx + 1) / worker_num: + if mem_accu > total_param_mem * (device_idx + 1) / worker_num: device_idx += 1 device2params[device_idx].append(param_name) param2device[param_name] = device_idx mem_accu += mem return param2device, device2params + """ + param2device = {} + device2params = {x: [] for x in range(worker_num)} + + sizes = [0] * worker_num + for param in [x[0] for x in params_grads]: + numel = get_var_size(param) + device_idx = sizes.index(min(sizes)) + device2params[device_idx].append(param.name) + param2device[param.name] = device_idx + sizes[device_idx] += numel + + for x in range(worker_num): + print("device id: %s, num: %s, mem: %s, names: %s" % ( + x, len(device2params[x]), sizes[x], device2params[x])) + + return param2device, device2params def _var_device_id(self, var_name): if var_name in self.global_param2device: diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 39f71be0cde76..605e94e94d9d6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -754,7 +754,7 @@ def get_first_optimize_op_idx(block): return first_opt_op_idx -def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): +def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root,use_calc_stream=False): """ _add_broadcast_ops """ @@ -767,6 +767,7 @@ def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): attrs={ 'ring_id': ring_id, 'root': root_device, + 'use_calc_stream': use_calc_stream, OP_ROLE_KEY: op_role }) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index fcecc3a9a671e..7916218cbbc11 100755 --- 
a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -71,6 +71,9 @@ def __init__(self, optimizer): self._reduced_grads_to_param = {} self._shard = Shard() self._verbose = False + self._thread_mode = False + self._use_calc_stream = False + # use sharding as outer parallelism (e.g. inner:Megatron & outer sharding) self.mp_degree = 1 @@ -576,10 +579,12 @@ def _apply_optimize_offload_pass(self, params_grads): def _dump_program_for_debug(self): main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() - with open("start_sharding_%d" % self.role_maker._worker_index(), + startup_id = str(id(startup_block.program)) + with open(("start_sharding_%d_%s" % (self.role_maker._worker_index(), startup_id)), 'w') as f: f.writelines(str(startup_block.program)) - with open("main_sharding_%d" % self.role_maker._worker_index(), + main_id = str(id(main_block.program)) + with open(("main_sharding_%d_%s" % (self.role_maker._worker_index(), main_id)), 'w') as f: f.writelines(str(main_block.program)) @@ -819,7 +824,7 @@ def collect_segment(self, segment, op_idx, block): def _split_program(self, block): for op_idx, op in reversed(list(enumerate(block.ops))): - if int(op.attr('op_role')) != int(OpRole.Optimize): + if int(op.attr('op_role')) != int(OpRole.Optimize) and int(op.attr('op_role'))!= int(OpRole.ScaleLr): last_backward_op_idx = op_idx + 1 break @@ -829,6 +834,7 @@ def _split_program(self, block): for op_idx in reversed(range(last_backward_op_idx)): op = block.ops[op_idx] assert (int(op.attr('op_role')) != int(OpRole.Optimize)) + assert (int(op.attr('op_role')) != int(OpRole.ScaleLr)) if self._sharding_segment_strategy == "segment_broadcast_MB": if segment._param_mem >= self._broadcast_MB: segment = self.collect_segment(segment, op_idx, block) @@ -874,7 +880,8 @@ def _split_program(self, block): else: broadcast_var_name = unique_name.generate(input_name + "@BroadCast") - segment._fill_constant_vars.append(broadcast_var_name) + if not self._thread_mode: + segment._fill_constant_vars.append(broadcast_var_name) # (JZ-LIANG) should use Param base name ? 
broadcast_var_base_name = input_name @@ -1094,24 +1101,26 @@ def _add_broadcast_allreduce(self, block): if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, shard_allredue_vars) insert_allreduce_ops( block, self._segments[-1]._end_idx, self.dp_ring_id, shard_allredue_vars, - user_defined_strategy=self.user_defined_strategy) + user_defined_strategy=self.user_defined_strategy, + use_calc_stream=self._use_calc_stream) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: self.create_persistable_gradients_and_insert_merge_ops( block, self._startup_program.global_block(), self._segments[-1]._end_idx, shard_allredue_vars, self._shard) - - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.sharding_ring_id, - self._segments[-1]._allreduce_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.sharding_ring_id, + self._segments[-1]._allreduce_vars) # allreduce --> reduce insert_reduce_ops(block, self._segments[-1]._end_idx, @@ -1119,7 +1128,8 @@ def _add_broadcast_allreduce(self, block): self._segments[-1]._allreduce_vars, self._shard, op_role=OpRole.Backward, - use_calc_stream=False) + use_calc_stream=self._use_calc_stream, + ) for idx, segment in reversed(list(enumerate(self._segments))): allreduce_vars = self._segments[ @@ -1162,11 +1172,12 @@ def _add_broadcast_allreduce(self, block): if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, segment._end_idx, - self.dp_ring_id, shard_allredue_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._end_idx, + self.dp_ring_id, shard_allredue_vars) broad_cast_vars = [x[0] for x in broadcast_vars] - if len(broad_cast_vars) > 0: + if not self._use_calc_stream and len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) @@ -1174,14 +1185,14 @@ def _add_broadcast_allreduce(self, block): comm_dep_vars = allreduce_vars + [ x[0] for x in broadcast_vars ] - if len(comm_dep_vars) > 0: + if not self._use_calc_stream and len(comm_dep_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, comm_dep_vars) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: broad_cast_vars = [x[0] for x in broadcast_vars] - if len(broad_cast_vars) > 0: + if not self._use_calc_stream and len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) @@ -1189,7 +1200,7 @@ def _add_broadcast_allreduce(self, block): k for k, v in cast_ops.items() ] + self._segments[idx]._allreduce_vars - if len(calc_dep_vars) > 0: + if not self._use_calc_stream and len(calc_dep_vars) > 0: insert_sync_calc_op(block, segment._end_idx, [calc_dep_vars[-1]]) @@ -1208,7 +1219,7 @@ def _add_broadcast_allreduce(self, block): segment._start_idx, shard_allredue_vars, self._shard) insert_broadcast_ops(block, segment._start_idx, - self.sharding_ring_id, broadcast_vars) + self.sharding_ring_id, broadcast_vars, 
self._use_calc_stream) # step6: add all_reduce ops # dp @@ -1220,13 +1231,17 @@ def _add_broadcast_allreduce(self, block): segment._start_idx, self.dp_ring_id, shard_allredue_vars, - user_defined_strategy=self.user_defined_strategy) - insert_sync_comm_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + user_defined_strategy=self.user_defined_strategy, + use_calc_stream=self._use_calc_stream, + ) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: - insert_sync_comm_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) # sharding # allreduce --> reduce # TODO temp change @@ -1237,17 +1252,19 @@ def _add_broadcast_allreduce(self, block): allreduce_vars, self._shard, op_role=OpRole.Backward, - use_calc_stream=False) + use_calc_stream=self._use_calc_stream) block._sync_with_cpp() if self._segments[0]._broadcast_vars: broadcast_vars = [x[0] for x in self._segments[0]._broadcast_vars] - insert_sync_comm_ops(block, self._segments[0]._start_idx, - self.sharding_ring_id, broadcast_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[0]._start_idx, + self.sharding_ring_id, broadcast_vars) insert_broadcast_ops(block, self._segments[0]._start_idx, self.sharding_ring_id, - self._segments[0]._broadcast_vars) + self._segments[0]._broadcast_vars, + self._use_calc_stream) fill_constant_vars = [] for x in self._segments[:2]: @@ -1260,7 +1277,7 @@ def _add_broadcast_allreduce(self, block): cast_ops[k] = v calc_deps_vars = fill_constant_vars + [k for k, v in cast_ops.items()] - if fill_constant_vars or cast_ops: + if not self._use_calc_stream and (fill_constant_vars or cast_ops): insert_sync_calc_op(block, self._segments[0]._start_idx, [calc_deps_vars[-1]]) @@ -1308,7 +1325,10 @@ def _build_groups(self): self.global_word_size = self.role_maker._worker_num() self.global_rank = self.role_maker._worker_index() self.global_endpoints = self.role_maker._get_trainer_endpoints() - self.current_endpoint = self.global_endpoints[self.global_rank] + if self._thread_mode: + self.current_endpoint = self.global_endpoints[self.role_maker._role_id()] + else: + self.current_endpoint = self.global_endpoints[self.global_rank] self._collective_helper = CollectiveHelper(self.role_maker, nrings=self._nrings_sharding) assert self.global_word_size % self.mp_degree == 0, \ @@ -1844,3 +1864,190 @@ def _sharding_gradient_merge(self): 'sub_block': cond_block, 'is_scalar_condition': True, }) +class ThreadShardingOptimizer(ShardingOptimizer): + """Thread-mode sharding optimizer.""" + def __init__(self, optimizer): + super().__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [ + "ParameterServerOptimizer", + "RecomputeOptimizer", + "AMPOptimizer", + "LarsOptimizer", + "LambOptimizer", + "ASPOptimizer", + # "ModelParallelOptimizer", + # "PipelineOptimizer", + ] + self._thread_mode = True + self._use_calc_stream = False + op_maker = core.op_proto_and_checker_maker + self.op_role_key = op_maker.kOpRoleAttrName() + + def _prune_main_program(self, block, shard, rings): + """ + rename @BroadCast vars back to their base param names + """ + var_names = set([]) + for idx, op in enumerate(block.ops): + for input_name in op.desc.input_arg_names(): + pos = input_name.find("@BroadCast") + if pos <= 0: + 
continue + new_name = input_name[0 : pos] + op.desc._rename_input( + input_name, new_name + ) + var_names.add(input_name) + for output_name in op.desc.output_arg_names(): + pos = output_name.find("@BroadCast") + if pos <= 0: + continue + new_name = output_name[0 : pos] + op.desc._rename_output( + output_name, new_name + ) + var_names.add(output_name) + + for var_name in var_names: + block._remove_var(var_name, sync=False) + + print("remove broadcast param count=", len(var_names)) + block._sync_with_cpp() + + def _prune_startup_program(self, block, shard): + """ + no pruning is needed in thread mode + """ + block._sync_with_cpp() + + def _insert_loss_grad_scale_op(self): + """ + PaddleBox gradients do not need to be scaled by the dp degree + """ + main_block = self._main_program.global_block() + # # step6: loss div dp_degree + # global_dp_degree = self.sharding_degree * self.dp_degree + # assert int(global_dp_degree) == global_dp_degree + # if global_dp_degree > 1: + # insert_scale_loss_grad_ops(main_block, scale=global_dp_degree) + main_block._sync_with_cpp() + + def minimize_impl( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): + """ + reset the startup program and main program after sharding + """ + sharding_configs = self.user_defined_strategy.sharding_configs + if "use_calc_stream" in sharding_configs: + self._use_calc_stream = sharding_configs["use_calc_stream"] + optimize_ops, params_grads = super().minimize_impl( + loss, startup_program, parameter_list, no_grad_set) + # main_block = self._main_program.global_block() + # startup_block = self._startup_program.global_block() + loss.block.program = self._main_program + from paddle import fluid + fluid.framework.switch_startup_program(self._startup_program) + return optimize_ops, params_grads + + def _init_comm(self): + # sync var + self.role_id = self.role_maker._role_id() + self.node_nums = self.role_maker._node_num() + startup_block = self._startup_program.global_block() + if self.node_nums > 1: + node_nums = len(self.global_endpoints) + assert ( + self.node_nums == node_nums + ), "number of endpoints does not equal node num" + self.current_endpoint = self.global_endpoints[self.role_id] + + # mp ring + if self.mp_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.mp_group_endpoints, + self.role_id, + self.mp_ring_id, + ) + + # sharding ring + if self.sharding_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.sharding_group_endpoints, + self.role_id, + self.sharding_ring_id, + ) + + # pure dp ring + if self.dp_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.dp_group_endpoints, + self.role_id, + self.dp_ring_id, + ) + + startup_block._sync_with_cpp() + + def _wait(self): + if self.node_nums <= 1: + return + endpoints = self.global_endpoints[:] + current_endpoint = endpoints[self.role_id] + if self.global_rank == 0: + from paddle.fluid.transpiler.details import wait_server_ready + endpoints.remove(current_endpoint) + wait_server_ready(endpoints) + + def _init_communicator( + self, + program, + current_endpoint, + endpoints, + role_id, + ring_id + ): + block = program.global_block() + # init multi-node nccl + if self.node_nums > 1: + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': role_id, + 
'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + self.op_role_key: OpRole.Forward, + }, + ) + block.append_op( + type='c_comm_init_multitrainer', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'ntrainers': self.node_nums, + 'trainer_id': role_id, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward, + }, + ) + else: + block.append_op( + type='c_comm_init_all', + attrs={'ring_id': ring_id} + ) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 1ff8a579de4cf..a0b4e6474fd0b 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -71,6 +71,7 @@ 'fused_seqpool_concat', 'fused_concat', 'rank_attention2', + 'fused_seq_tensor', ] @@ -1601,7 +1602,7 @@ def rank_attention2(input, return output -def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batchcount=0): +def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batchcount=0, transpose_weight=False): """ **Batch FC layer** This Op can calculate BatchFC. This is similar to matmul op, @@ -1666,7 +1667,10 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batc "W": w, "Bias": b }, - attrs={'batchcount': batchcount}, + attrs={ + 'batchcount': batchcount, + 'transpose_weight': transpose_weight + }, outputs={"Out": pre_act}) return helper.append_activation(pre_act) @@ -1760,6 +1764,7 @@ def fused_seqpool_cvm(input, embed_thres_size=0, embedx_concate_size=1, embedx_concate_filter=False, + fill_zero=True, fix_ctr_to_click=False): """ **Notes: The Op only receives List of LoDTensor as input, only support SUM pooling now. @@ -1820,6 +1825,7 @@ def fused_seqpool_cvm(input, "embed_thres_size": embed_thres_size, "embedx_concate_size": embedx_concate_size, "embedx_concate_filter": embedx_concate_filter, + "fill_zero": fill_zero, "fix_ctr_to_click": fix_ctr_to_click }) @@ -2827,3 +2833,66 @@ def fused_concat(input, start_index=0, length=-1, axis=1): "length": length}) return out +def fused_seq_tensor(input, + batch_count, + max_length, + slot_num, + ad_slot_num, + fea_emb_dim, + ad_slot_offset): + """ + **fused seq tensor** + Notice: It currently only supports GPU devices. + + Args: + input: [input, ad_input], input tensor list with data type float32. + batch_count: parallel num. + max_length: max sequence length. + slot_num: total slot num, the sum of ad_slot_num and the side-info slot num. + ad_slot_num: ad slot num. + fea_emb_dim: embedding dim. + ad_slot_offset: ad slot offset. 
+ + Returns: + Variable: + din_out, mask_out, side_info_out, ad_slot_session_out + """ + + helper = LayerHelper("fused_seq_tensor", **locals()) + + check_type(input, "input", list, 'fused_seq_tensor') + + dtype = helper.input_dtype() + check_dtype(dtype, 'input', ['float32', 'float64'], 'fused_seq_tensor') + + check_type(batch_count, 'batch_count', (int, Variable), 'fused_seq_tensor') + check_type(max_length, 'max_length', (int, Variable), 'fused_seq_tensor') + check_type(slot_num, 'slot_num', (int, Variable), 'fused_seq_tensor') + check_type(fea_emb_dim, 'fea_emb_dim', (int, Variable), 'fused_seq_tensor') + + din_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + mask_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + side_info_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + ad_slot_session_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + + helper.append_op( + type="fused_seq_tensor", + inputs={"Input": input[0], + "ADInput": input[1] + }, + attrs={ + 'batch_count': batch_count, + 'max_length': max_length, + 'slot_num': slot_num, + 'fea_emb_dim': fea_emb_dim, + 'ad_slot_num': ad_slot_num, + 'ad_slot_offset': ad_slot_offset + }, + outputs={ + "DINOut": din_out, + "MaskOut": mask_out, + "SideInfoOut": side_info_out, + "ADSlotSessionOut": ad_slot_session_out + }) + + return din_out, mask_out, side_info_out, ad_slot_session_out diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2acf005487bef..63eb3914e4bbd 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -401,7 +401,7 @@ def _is_enable_standalone_executor(): from ..distributed.fleet import fleet # use standalone_executor by default if not distributed if fleet._role_maker is None and framework._enable_standalone_executor_ is None: - framework._enable_standalone_executor_ = 1 + framework._enable_standalone_executor_ = 0 if framework._enable_standalone_executor_ in [1, '1', True, 'True', 'true']: flag = True diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 122c70f466722..7354d49a975e6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -290,7 +290,8 @@ def save_vars(executor, main_program=None, vars=None, predicate=None, - filename=None): + filename=None, + filter_func=None): """ :api_attr: Static Graph @@ -374,7 +375,8 @@ def name_has_fc(var): main_program=main_program, dirname=dirname, vars=list(filter(predicate, main_program.list_vars())), - filename=filename) + filename=filename, + filter_func=filter_func) else: params_var_name = "saved_params" # give warning when there is no var in model @@ -389,6 +391,8 @@ def name_has_fc(var): save_var_map = {} for each_var in vars: + if filter_func is not None and filter_func(each_var.name): + continue # NOTE: don't save the variable which type is RAW if each_var.type == core.VarDesc.VarType.RAW: continue @@ -668,7 +672,11 @@ def is_valid(var): @dygraph_not_support -def save_persistables(executor, dirname, main_program=None, filename=None): +def save_persistables(executor, + dirname, + main_program=None, + filename=None, + filter_func=None): """ :api_attr: Static Graph @@ -737,7 +745,8 @@ def save_persistables(executor, dirname, main_program=None, filename=None): main_program=main_program, vars=None, predicate=is_persistable, - filename=filename) + filename=filename, + filter_func=filter_func) def load_vars(executor, @@ -1245,7 +1254,8 @@ def save_inference_model(dirname, 
params_filename=None, export_for_deployment=True, program_only=False, - clip_extra=False): + clip_extra=False, + filter_func=None): """ :api_attr: Static Graph @@ -1454,7 +1464,7 @@ def save_inference_model(dirname, if params_filename is not None: params_filename = os.path.basename(params_filename) - save_persistables(executor, save_dirname, main_program, params_filename) + save_persistables(executor, save_dirname, main_program, params_filename, filter_func) return target_var_name_list diff --git a/python/setup.py.in b/python/setup.py.in index 6ef620f5f0784..1b423bcca695f 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -557,10 +557,14 @@ if '${WITH_XPU}' == 'ON': shutil.copy('${XPU_RT_LIB}', libs_path) shutil.copy('${XPU_API_PLUGIN}', libs_path) shutil.copy('${XPU_RT_ALIAS_LIB}', libs_path) + shutil.copy('${XPU_ML_LIB}', libs_path) + shutil.copy('${XPU_ML_ALIAS_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_RT_ALIAS_LIB_NAME}', - '${XPU_API_PLUGIN_NAME}'] + '${XPU_API_PLUGIN_NAME}', + '${XPU_ML_LIB_NAME}', + '${XPU_ML_ALIAS_LIB_NAME}'] if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path)
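
Note on the rewritten _split_params in sharding/shard.py: the old prefix-sum split over accumulated parameter memory is replaced by a greedy assignment that always places the next parameter on the worker currently holding the least total size. The standalone sketch below mirrors that strategy; the function name, the (name, size) input format, and the demo values are illustrative and not Paddle APIs.

# Greedy "least-loaded worker" split, mirroring the new _split_params logic.
# `params` is a list of (name, size_in_bytes) pairs; purely illustrative.
def split_params_greedy(params, worker_num):
    param2device = {}
    device2params = {i: [] for i in range(worker_num)}
    sizes = [0] * worker_num  # bytes currently assigned to each worker
    for name, size in params:
        device_idx = sizes.index(min(sizes))  # pick the least-loaded worker
        device2params[device_idx].append(name)
        param2device[name] = device_idx
        sizes[device_idx] += size
    return param2device, device2params

demo = [("emb.w_0", 65536), ("fc_0.w_0", 4096), ("fc_0.b_0", 64), ("fc_1.w_0", 8192)]
print(split_params_greedy(demo, worker_num=2))

Compared with the accumulated-memory split, this keeps the per-worker totals close to balanced even when a few very large parameters dominate.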
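
Note on the use_calc_stream guards added in _add_broadcast_allreduce: in Python, `and` binds tighter than `or`, so the guard on the sync-calc condition needs parentheses around `fill_constant_vars or cast_ops` (as written above) to skip the sync op consistently with the other guards. A short self-contained check of the two bindings, using placeholder values:

# Demonstrates why the guard needs parentheses: `and` binds tighter than `or`.
use_calc_stream = True
fill_constant_vars = []                 # empty list
cast_ops = {"x@CAST": "cast_op"}        # non-empty dict

without_parens = not use_calc_stream and fill_constant_vars or cast_ops
with_parens = not use_calc_stream and (fill_constant_vars or cast_ops)

print(bool(without_parens))  # True: a sync op would still be inserted
print(bool(with_parens))     # False: skipped, matching the other guards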
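
Note on the filter_func hook threaded through save_inference_model -> save_persistables -> save_vars: it receives a variable name and returns True for variables that should not be saved. A hypothetical usage sketch under the fluid static-graph API of this branch; the network, checkpoint directory, and predicate are made up for illustration.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    y = fluid.layers.fc(input=x, size=4)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)

def skip_optimizer_stats(var_name):
    # Return True to skip saving this variable (e.g. optimizer moment buffers).
    return "moment" in var_name or "beta" in var_name

fluid.io.save_persistables(exe, "./sharding_ckpt",
                           main_program=main_prog,
                           filter_func=skip_optimizer_stats)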