diff --git a/.gitignore b/.gitignore index bc8a817b4f2f01..4faaf162ba52b4 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,3 @@ paddle/infrt/tests/lit.cfg.py paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc paddle/fluid/pybind/eager_final_state_op_function_impl.h paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h -builder diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index cc7635649281cd..04f102818a9743 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -49,7 +49,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_PREFIX_DIR} DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} - DOWNLOAD_COMMAND cp /root/paddlejob/new1_code/ps/baidu/paddlepaddle/pslib/pslib.tar.gz ./ && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 4b15a6047c83f6..100fa9b776292e 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -160,8 +160,7 @@ class Dataset { virtual void SetFleetSendSleepSeconds(int seconds) = 0; virtual std::vector GetSlots() = 0; - virtual void SetPassId(uint32_t pass_id) = 0; - virtual uint32_t GetPassID() = 0; + protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg) = 0; @@ -250,13 +249,6 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - virtual void SetPassId(uint32_t pass_id) { - pass_id_ = pass_id; - } - virtual uint32_t GetPassID() { - return pass_id_; - } - /* for enable_heterps_ virtual void EnableHeterps(bool enable_heterps) { enable_heterps_ = enable_heterps; @@ -283,7 +275,6 @@ class DatasetImpl : public Dataset { // TODO(yaoxuefeng) for SlotRecordDataset return -1; } - uint32_t pass_id_ = 0; std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index d6f316cf145b9a..b390cf17f492c5 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1388,9 +1388,9 @@ void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { #endif } -void FleetWrapper::PrintTableStat(const uint64_t table_id, uint32_t pass_id, size_t threshold) { +void FleetWrapper::PrintTableStat(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id, pass_id, threshold); + auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id); ret.wait(); int32_t err_code = ret.get(); if (err_code == -1) { diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 2e17eae9196a4e..deb2b90c933532 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -265,7 +265,7 @@ class FleetWrapper { std::vector table_var_list, bool load_combine); - void PrintTableStat(const uint64_t table_id, uint32_t pass_id, uint64_t threshold); + void PrintTableStat(const uint64_t table_id); void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load 
delta feature, which means load diff diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index c6f34987e65732..061ce0ff3909e9 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -39,76 +39,227 @@ namespace framework { class HeterContext { public: - //保存去重后的待查table的key, 第一层对应table-shard, 第二层对应不同维度,第三层就是key集合 - std::vector>>feature_keys_; - //保存查到的value数据,维度同feature_keys_ + virtual ~HeterContext() { + if (!multi_mf_dim_) { + for (size_t i = 0; i < mutex_.size(); ++i) { + delete mutex_[i]; + } + mutex_.clear(); + } else { + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + for (size_t j = 0; j < dim_mutex_[i].size(); j++) { + delete dim_mutex_[i][j]; + } + dim_mutex_[i].clear(); + } + } + } + Scope* scope_{nullptr}; + std::vector> feature_keys_; + std::vector>> feature_dim_keys_; + std::vector>> device_task_keys_; + #ifdef PADDLE_WITH_PSLIB + std::vector> value_ptr_; + std::vector>> + device_task_ptr_; + std::vector>> + value_dim_ptr_; std::vector>> - value_ptr_; + device_dim_ptr_; #endif #ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; std::vector>> - value_ptr_; + value_dim_ptr_; + std::vector>> + device_task_ptr_; + std::vector>> + device_dim_ptr_; #endif - //经过去重后的gpu-table中的key数据, 第一层设备,第二层维度,第三层具体的key - std::vector>> device_keys_; + std::vector> device_values_; + std::vector> device_keys_; + std::vector>> device_dim_keys_; + std::vector>> device_dim_values_; + std::vector mutex_; + std::vector> dim_mutex_; + int multi_mf_dim_ = 0; + + uint32_t shard_num_ = 37; + uint64_t size() { + uint64_t total_size = 0; + for (auto& keys : feature_keys_) { + total_size += keys.size(); + } + return total_size; + } + void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } + uint32_t ShardNum() { return shard_num_; } + void init(int shard_num, int device_num) { + shard_num_ = shard_num; + feature_keys_.resize(shard_num_); + value_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } + + device_values_.resize(device_num); + device_keys_.resize(device_num); + mutex_.resize(device_num); + for (size_t i = 0; i < mutex_.size(); ++i) { + mutex_[i] = new std::mutex(); + } + } - //初始化 void init(int shard_num, int device_num, int dim_num) { - feature_keys_.resize(shard_num); - for (auto& iter : feature_keys_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); - } + shard_num_ = shard_num; + feature_keys_.resize(shard_num_); + feature_dim_keys_.resize(shard_num_); + value_ptr_.resize(shard_num_); + value_dim_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); } - value_ptr_.resize(shard_num); - for (auto& iter : value_ptr_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); - } + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + feature_dim_keys_[i].resize(dim_num); + value_dim_ptr_[i].resize(dim_num); } + device_values_.resize(device_num); + device_dim_values_.resize(device_num); device_keys_.resize(device_num); - for (auto& iter : device_keys_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); + + device_dim_keys_.resize(device_num); + 
device_dim_ptr_.resize(device_num); + mutex_.resize(device_num); + dim_mutex_.resize(device_num); + for (size_t i = 0; i < mutex_.size(); ++i) { + mutex_[i] = new std::mutex(); + } + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + dim_mutex_[i].resize(dim_num); + for (int j = 0; j < dim_num; j++) { + dim_mutex_[i][j] = new std::mutex(); } } + multi_mf_dim_ = dim_num; + } + void Reset() { + if (!multi_mf_dim_) { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + for (size_t i = 0; i < device_task_ptr_.size(); ++i) { + for (size_t j = 0; j < device_task_ptr_[i].size(); ++j) { + device_task_ptr_[i][j].clear(); + device_task_keys_[i][j].clear(); + } + } + } else { + VLOG(3) << "Reset gpu task with dynamic mf dimention"; + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + for (size_t j = 0; j < feature_dim_keys_[i].size(); j++) { + feature_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < value_dim_ptr_.size(); i++) { + for (size_t j = 0; j < value_dim_ptr_[i].size(); j++) { + value_dim_ptr_[i][j].clear(); + } + } + + for (size_t i = 0; i < device_dim_keys_.size(); i++) { + for (size_t j = 0; j < device_dim_keys_[i].size(); j++) { + device_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < device_dim_ptr_.size(); i++) { + for (size_t j = 0; j < device_dim_ptr_[i].size(); j++) { + device_dim_ptr_[i][j].clear(); + } + } + } + } + void batch_add_keys( + const std::vector>& thread_keys) { + assert(thread_keys.size() == feature_keys_.size()); + + for (uint32_t i = 0; i < shard_num_; i++) { + int idx = 0; + idx = feature_keys_[i].size(); + feature_keys_[i].resize(feature_keys_[i].size() + thread_keys[i].size()); + std::copy(thread_keys[i].begin(), thread_keys[i].end(), + feature_keys_[i].begin() + idx); + } + } + + void batch_add_keys(int shard_num, + const robin_hood::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); } - //将粗去重的key加入进来,后面再做精细化去重 + void batch_add_keys(int shard_num, int dim_id, const robin_hood::unordered_set& shard_keys) { - int idx = feature_keys_[shard_num][dim_id].size(); - feature_keys_[shard_num][dim_id].resize( - feature_keys_[shard_num][dim_id].size() + shard_keys.size()); + int idx = feature_dim_keys_[shard_num][dim_id].size(); + feature_dim_keys_[shard_num][dim_id].resize( + feature_dim_keys_[shard_num][dim_id].size() + shard_keys.size()); std::copy(shard_keys.begin(), shard_keys.end(), - feature_keys_[shard_num][dim_id].begin() + idx); + feature_dim_keys_[shard_num][dim_id].begin() + idx); } - void unique_keys() { + + void UniqueKeys() { std::vector threads; - auto unique_func = [this](int i, int j) { - auto& cur_keys = feature_keys_[i][j]; + auto unique_func = [this](int i) { + auto& cur_keys = feature_keys_[i]; + std::sort(cur_keys.begin(), cur_keys.end()); + std::vector::iterator it; + it = std::unique(cur_keys.begin(), cur_keys.end()); + cur_keys.resize(std::distance(cur_keys.begin(), it)); + }; + auto unique_dynamic_mf_func = [this](int i, int j) { + auto& cur_keys = feature_dim_keys_[i][j]; std::sort(cur_keys.begin(), cur_keys.end()); 
std::vector::iterator it; it = std::unique(cur_keys.begin(), cur_keys.end()); cur_keys.resize(std::distance(cur_keys.begin(), it)); }; - for (size_t i = 0; i < feature_keys_.size(); i++) { - for (size_t j = 0; j < feature_keys_[i].size(); j++) { - threads.push_back(std::thread(unique_func, i, j)); + if (!multi_mf_dim_) { + for (uint32_t i = 0; i < shard_num_; i++) { + threads.push_back(std::thread(unique_func, i)); + } + } else { + for (uint32_t i = 0; i < shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(unique_dynamic_mf_func, i, j)); + } } + VLOG(3) << "heter_context unique keys with dynamic mf dimention"; } for (std::thread& t : threads) { t.join(); } } - uint16_t pass_id_; }; - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index f4fd845de43e6e..983208c0608ae7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,9 +7,9 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - nv_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm) + nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) @@ -20,7 +20,7 @@ IF(WITH_GPU) endif() ENDIF() IF(WITH_ROCM) - hip_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - hip_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm) + hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h deleted file mode 100644 index ac95c5f3b6f4ba..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" - -namespace paddle { -namespace framework { - -struct DyGpuValue { - float delta_score; - float show; - float clk; - int slot; - float lr; - float lr_g2sum; - int mf_size; - int mf_dim; - uint64_t cpu_ptr; - float mf[0]; - __host__ __device__ __forceinline__ DyGpuValue() { - delta_score = 0; - show = 0; - clk = 0; - slot = -1; - lr = 0; - lr_g2sum = 0; - mf_size = 0; - mf_dim = 0; - cpu_ptr = 0; - } - __device__ __forceinline__ void operator=(const DyGpuValue& in) { - delta_score = in.delta_score; - show = in.show; - clk = in.clk; - slot = in.slot; - lr = in.lr; - lr_g2sum = in.lr_g2sum; - mf_size = in.mf_size; - mf_dim = in.mf_dim; - cpu_ptr = in.cpu_ptr; - for (int i = 0; i < mf_dim + 1; i++) { - mf[i] = in.mf[i]; - } - } - __device__ __forceinline__ void to_cvm(float* des, int dim) { - des[0] = show; - des[1] = clk; - des[2] = lr; - if (mf_size == 0) { - for (int i = 0; i < dim; i++) { - des[3 + i] = 0; - } - } else { - for (int i = 0; i < dim; i++) { - des[3 + i] = mf[1 + i]; - } - } - } -}; - -struct DyGpuPushValue { - float show; - float clk; - int slot; - float lr_g; - int mf_dim; - float mf_g[0]; - __device__ __forceinline__ void from_grad(const float* grad, int dim, int slot_id, int batch_size) { - this->slot = slot_id; - this->mf_dim = dim; - this->show = grad[0]; - this->clk = grad[1]; - this->lr_g = grad[2] * -1. * batch_size; - for (int j = 0; j < dim; j++) { - this->mf_g[j] = grad[3 + j] * -1. * batch_size; - } - } - __device__ __forceinline__ DyGpuPushValue& operator+=(const DyGpuPushValue& input) { - show += input.show; - clk += input.clk; - lr_g += input.lr_g; - for (int i = 0; i < input.mf_dim; i++) { - mf_g[i] += input.mf_g[i]; - } - return *this; - } - __device__ __forceinline__ void operator=(const DyGpuPushValue& input) { - show = input.show; - clk = input.clk; - slot = input.slot; - lr_g = input.lr_g; - mf_dim = input.mf_dim; - for (int i = 0; i < mf_dim; i++) { - mf_g[i] = input.mf_g[i]; - } - } -}; - -template <> -class Optimizer { - public: - Optimizer() {} - - ~Optimizer() {} - - void initialize() {} - - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); - double scaled_grad = g / scale; - w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; - add_g2sum += scaled_grad * scaled_grad; - g2sum += add_g2sum; - } - - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); - for (int i = 0; i < n; ++i) { - double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; - } - - __device__ void update_value(DyGpuValue* ptr, const DyGpuPushValue& grad, curandState& state) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - 
ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; - - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - - if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; - ptr->mf[0] = 0; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; - } - } - } else { - update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, - grad.show); // for local test - } - } -}; - - -class T_DyGpuValue_DownpourCtrDymfAccessor : public ValueTransforImp { -public: - virtual int get_gpu_value_size(int dim_size) { - int ret = sizeof(DyGpuValue) + (dim_size + 1) * sizeof(float); - return TYPE_ALIGN(8, ret); - } - virtual int get_gpu_push_value_size(int dim_size) { - int ret = sizeof(DyGpuPushValue) + (dim_size) * sizeof(float); - return TYPE_ALIGN(8, ret); - } - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) { -#ifdef PADDLE_WITH_PSLIB - paddle::ps::DownpourFixedFeatureValue* cpu_value = (paddle::ps::DownpourFixedFeatureValue*)cpu; - DyGpuValue* gpu_value = (DyGpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - uint64_t tmp_aa = (uint64_t)(cpu); - gpu_value->delta_score = ptr_cpu_data[1]; - gpu_value->show = ptr_cpu_data[2]; - gpu_value->clk = ptr_cpu_data[3]; - gpu_value->slot = int(ptr_cpu_data[6]); - gpu_value->lr = ptr_cpu_data[4]; - gpu_value->lr_g2sum = ptr_cpu_data[5]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - gpu_value->mf_dim = dim_size; - if (dim > 8) { - gpu_value->mf_size = dim_size + 1; - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 8]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - gpu_value->mf[x] = 0 ; - } - } -#endif - } - virtual void value_gpu_to_cpu(void* gpu) { -#ifdef PADDLE_WITH_PSLIB - DyGpuValue* gpu_value = (DyGpuValue*)gpu; - paddle::ps::DownpourFixedFeatureValue& cpu_fix = *((paddle::ps::DownpourFixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_fix.resize(8 + 1 + gpu_value->mf_dim); - } - float* cpu_value = cpu_fix.data(); - cpu_value[1] = gpu_value->delta_score; - cpu_value[2] = gpu_value->show; - cpu_value[3] = gpu_value->clk; - cpu_value[4] = gpu_value->lr; - cpu_value[5] = gpu_value->lr_g2sum; - cpu_value[6] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - cpu_value[x + 8] = gpu_value->mf[x]; - } - } -#endif - } - virtual void value_to_cvm(float** gpu_cvm, - const void* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream - ) { - value_to_cvm_impl(gpu_cvm, (DyGpuValue*)gpu_value, gpu_keys, slot_num, key_len, - slot_dim, total_length, hidden_size, value_size, stream); - } - virtual void grad_to_push(void* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - grad_to_push_impl((DyGpuPushValue*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, 
value_size, batch_size, slot_vector, stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu deleted file mode 100644 index 755f9a9cd97b1e..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h" - -namespace paddle { -namespace framework { - -void GlobalValueTransfor::init(std::string accessor_type, std::string gpu_value_type) { - if (transobj_ != nullptr) { - return; - } - if (accessor_type == "DownpourCtrDymfAccessor" && gpu_value_type == "DyFeatureValue") { - transobj_ = (ValueTransfor*)(new T_DyGpuValue_DownpourCtrDymfAccessor()); - } else if (accessor_type == "DownpourCtrAccessor" && gpu_value_type == "FeatureValue") { - transobj_ = (ValueTransfor*)(new T_GpuValue_DownpourCtrAccessor()); - } - return; -} - -ValueTransfor* GlobalValueTransfor::get_value_transfor() { - return transobj_; -} - - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 239451cd00f1e6..c454a0b0ba0be9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -17,88 +17,128 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/wrapper/fleet.h" -#endif -#ifdef PADDLE_WITH_PSLIB -#include -#endif namespace paddle { namespace framework { - -#define TYPE_ALIGN(ALIGNVAL, LEN) (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +#define MF_DIM 8 typedef uint64_t FeatureKey; -class ValueTransfor { -public: - virtual int get_gpu_value_size(int dim_size) = 0; - virtual int get_gpu_push_value_size(int dim_size) = 0; - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) = 0; - virtual void value_gpu_to_cpu(void* gpu) = 0; - virtual void value_to_cvm(float** gpu_cvm, //写入的结果,cvm二维数组 - const void* gpu_value, //查表出来的sparse数据 - FeatureKey** gpu_keys, //对应的key的二维数组(内部需要用来判断是否为0) - const int slot_num, //一共有多少个slot - const int64_t* key_len, //每个slot下面有多少个key - const int* slot_dim, //每个slot的维度数据(可能为空,只有动态维度模式才会有值) - int64_t total_length, //总共有多少个key - int hidden_size, //非动态维度的情况下,cvm维度数 - int value_size, //动态维度下,value的字节大小 - cudaStream_t stream //流 - ) = 0; - virtual void grad_to_push(void* push_value, //写入的结果,连续的pushvalue类型值 - float** grad_value, //梯度信息 - const int slot_num, //一共有多少个slot - const int64_t* grad_len, //每个slot下面有多少个梯度 - const int* slot_dim, //每个slot的维度数据(可能为空,只有动态维度模式才会有值) - int64_t total_length, //总共有多少个梯度 - int hidden_size, //非动态维度的情况下,梯度维度数 - int value_size, //动态维度下,value的字节大小 - int batch_size, //mini-batch - const int* slot_vector, //slot的编号信息 - cudaStream_t stream //流 - ) = 0; +/* +struct FeatureValue { + float delta_score; + float show; + float clk; + int slot; + float lr; + float lr_g2sum; + int mf_size; + float mf[MF_DIM + 1]; + uint64_t cpu_ptr; + + friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) { + out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot + << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_size; ++i) { + out << " " << val.mf[i]; + } + return out; + } }; -class GlobalValueTransfor { -public: - static GlobalValueTransfor& get_instance() { - static GlobalValueTransfor ins; - return ins; +struct FeaturePushValue { + float show; + float clk; + int slot; + float lr_g; + float mf_g[MF_DIM]; + + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; } - void init(std::string accessor_type, std::string gpu_value_type); - ValueTransfor* get_value_transfor(); -private: - ValueTransfor* transobj_ = nullptr; }; -#define g_transfor GlobalValueTransfor::get_instance().get_value_transfor() - -class PinnedVector { -public: - template - PinnedVector(const Type* buf, const size_t len, gpuStream_t& stream, const paddle::platform::Place& place) { - mem_cpu_ = memory::Alloc(phi::GPUPinnedPlace(), len); - memcpy(reinterpret_cast(mem_cpu_->ptr()), buf, len); - mem_gpu_ = memory::Alloc(place, len); - cudaMemcpyAsync(reinterpret_cast(mem_gpu_->ptr()), reinterpret_cast(mem_cpu_->ptr()), - len, cudaMemcpyHostToDevice, stream); +*/ + +struct FeatureValue { + float delta_score; + float show; + float clk; + int slot; + float lr; + float lr_g2sum; + int mf_size; + int mf_dim; + uint64_t cpu_ptr; + float mf[0]; + + friend 
std::ostream& operator<<(std::ostream& out, FeatureValue& val) { + out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot + << " lr: " << val.lr << " mf_dim: " << val.mf_dim << "cpuptr: " << val.cpu_ptr + << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_dim + 1; ++i) { + out << " " << val.mf[i]; + } + return out; } - template - Type* get_gpu_ptr() { - return reinterpret_cast(mem_gpu_->ptr()); + __device__ __forceinline__ void operator=(const FeatureValue& in) { + delta_score = in.delta_score; + show = in.show; + clk = in.clk; + slot = in.slot; + lr = in.lr; + lr_g2sum = in.lr_g2sum; + mf_size = in.mf_size; + mf_dim = in.mf_dim; + cpu_ptr = in.cpu_ptr; + for (int i = 0; i < mf_dim + 1; i++) { + mf[i] = in.mf[i]; + } } -private: - memory::allocation::AllocationPtr mem_cpu_; - memory::allocation::AllocationPtr mem_gpu_; }; -} // end namespace framework -} // end namespace paddle +struct FeaturePushValue { + float show; + float clk; + int slot; + float lr_g; + int mf_dim; + float mf_g[0]; + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } + __device__ __forceinline__ void operator=(const FeaturePushValue& in) { + show = in.show; + clk = in.clk; + slot = in.slot; + lr_g = in.lr_g; + mf_dim = in.mf_dim; + for (int i = 0; i < mf_dim; i++) { + mf_g[i] = in.mf_g[i]; + } + } +}; +} // end namespace framework +} // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h deleted file mode 100644 index ee19c52ade95d1..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" - -namespace paddle { -namespace framework { - -template -__global__ void kernel_value_to_cvm(float** dest, ValueType* src, FeatureKey** keys, const int slot_num, - const int64_t* len, const int* slot_dim, int64_t total_len, int hidden_size, int value_size) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - int cur_dim =hidden_size - 3; - //动态维度 - if (slot_dim != nullptr) { - cur_dim = slot_dim[x] - 3; - } - char* p_src = (char*)(src); - ValueType* value_ptr = (ValueType*)(p_src + uint64_t(i) * uint64_t(value_size)); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (cur_dim + 3)) = 0; - *(dest[x] + y * (cur_dim + 3) + 1) = 0; - *(dest[x] + y * (cur_dim + 3) + 2) = 0; - for (int j = 0; j < cur_dim; j++) { - *(dest[x] + y * (cur_dim + 3) + 3 + j) = 0; - } - } else { - value_ptr->to_cvm(dest[x] + y * (cur_dim + 3), cur_dim); - } - } -} - -template -__global__ void kernel_grad_to_push(PushValueType* des, float** src, const int slot_num, const int64_t* len, - const int* slot_dim, int64_t total_len, int hidden_size, int value_size, - int batch_size, const int* slot_vector) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - char* d_src = (char*)(des); - PushValueType* value_ptr = (PushValueType*)(d_src + i * value_size); - int mf_dim = hidden_size - 3; - if (slot_dim != nullptr) { - mf_dim = slot_dim[x]; - } - int slot_id = slot_vector[x]; - value_ptr->from_grad(src[x] + y * (mf_dim + 3), mf_dim, slot_id, batch_size); - } -} - -class ValueTransforImp : public ValueTransfor { -protected: - template - void value_to_cvm_impl( float** gpu_cvm, - ValueType* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream) { - kernel_value_to_cvm<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_cvm, gpu_value, gpu_keys, slot_num, key_len, slot_dim, total_length, hidden_size, value_size); - cudaStreamSynchronize(stream); - } - template - void grad_to_push_impl(PushValueType* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - kernel_grad_to_push<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - (PushValueType*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, value_size, batch_size, slot_vector); - cudaStreamSynchronize(stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h deleted file mode 100644 index 8c8afc31ddcfa3..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" - -namespace paddle { -namespace framework { - -struct GpuValue { - float delta_score; - float show; - float clk; - int slot; - float lr; - float lr_g2sum; - int mf_size; - float mf[8 + 1]; - uint64_t cpu_ptr; - __host__ __device__ __forceinline__ GpuValue() { - delta_score = 0; - show = 0; - clk = 0; - slot = -1; - lr = 0; - lr_g2sum = 0; - mf_size = 0; - cpu_ptr = 0; - } - __device__ __forceinline__ void operator=(const GpuValue& in) { - delta_score = in.delta_score; - show = in.show; - clk = in.clk; - slot = in.slot; - lr = in.lr; - lr_g2sum = in.lr_g2sum; - mf_size = in.mf_size; - cpu_ptr = in.cpu_ptr; - for (int i = 0; i < 8 + 1; i++) { - mf[i] = in.mf[i]; - } - } - __device__ __forceinline__ void to_cvm(float* des, int dim) { - des[0] = show; - des[1] = clk; - des[2] = lr; - if (mf_size == 0) { - for (int i = 0; i < dim; i++) { - des[3 + i] = 0; - } - } else { - for (int i = 0; i < dim; i++) { - des[3 + i] = mf[1 + i]; - } - } - } -}; - -struct GpuPushValue { - float show; - float clk; - int slot; - float lr_g; - float mf_g[8]; - __device__ __forceinline__ void from_grad(const float* grad, int dim, int slot_id, int batch_size) { - this->slot = slot_id; - this->show = grad[0]; - this->clk = grad[1]; - this->lr_g = grad[2] * -1. * batch_size; - for (int j = 0; j < dim; j++) { - this->mf_g[j] = grad[3 + j] * -1. * batch_size; - } - } - __device__ __forceinline__ GpuPushValue& operator+=(const GpuPushValue& input) { - show += input.show; - clk += input.clk; - lr_g += input.lr_g; - for (int i = 0; i < 8; i++) { - mf_g[i] += input.mf_g[i]; - } - return *this; - } - __device__ __forceinline__ void operator=(const GpuPushValue& input) { - show = input.show; - clk = input.clk; - slot = input.slot; - lr_g = input.lr_g; - for (int i = 0; i < 8; i++) { - mf_g[i] = input.mf_g[i]; - } - } -}; - -template <> -class Optimizer { - public: - Optimizer() {} - ~Optimizer() {} - void initialize() {} - - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); - double scaled_grad = g / scale; - w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; - add_g2sum += scaled_grad * scaled_grad; - g2sum += add_g2sum; - } - - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); - for (int i = 0; i < n; ++i) { - double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; - } - - __device__ void update_value(GpuValue* ptr, const GpuPushValue& grad, curandState& state) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; - - update_lr(ptr->lr, 
ptr->lr_g2sum, grad.lr_g, grad.show); - - if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { - ptr->mf_size = 9; - ptr->mf[0] = 0; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = 0; i < 8; ++i) { - ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; - } - } - } else { - update_mf(8, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, - grad.show); // for local test - } - } -}; - -class T_GpuValue_DownpourCtrAccessor : public ValueTransforImp { -public: - virtual int get_gpu_value_size(int dim_size) { - int ret = sizeof(GpuValue); - return TYPE_ALIGN(8, ret); - } - virtual int get_gpu_push_value_size(int dim_size) { - int ret = sizeof(GpuPushValue); - return TYPE_ALIGN(8, ret); - } - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) { -#ifdef PADDLE_WITH_PSLIB - paddle::ps::DownpourFixedFeatureValue* cpu_value = (paddle::ps::DownpourFixedFeatureValue*)cpu; - GpuValue* gpu_value = (GpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - gpu_value->delta_score = ptr_cpu_data[1]; - gpu_value->show = ptr_cpu_data[2]; - gpu_value->clk = ptr_cpu_data[3]; - gpu_value->slot = ptr_cpu_data[6]; - gpu_value->lr = ptr_cpu_data[4]; - gpu_value->lr_g2sum = ptr_cpu_data[5]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - if (dim > 7) { - gpu_value->mf_size = 9; - for (int x = 0; x < gpu_value->mf_size; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 7]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < 9; x++) { - gpu_value->mf[x] = 0; - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - const paddle::distributed::FixedFeatureValue* cpu_value = (const paddle::distributed::FixedFeatureValue*)cpu; - GpuValue* gpu_value = (GpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - gpu_value->delta_score = ptr_cpu_data[2]; - gpu_value->show = ptr_cpu_data[3]; - gpu_value->clk = ptr_cpu_data[4]; - gpu_value->slot = ptr_cpu_data[0]; - gpu_value->lr = ptr_cpu_data[5]; - gpu_value->lr_g2sum = ptr_cpu_data[6]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - if (dim > 7) { - gpu_value->mf_size = 9; - for (int x = 0; x < gpu_value->mf_size; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 7]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < 9; x++) { - gpu_value->mf[x] = 0; - } - } -#endif - } - virtual void value_gpu_to_cpu(void* gpu) { -#ifdef PADDLE_WITH_PSLIB - GpuValue* gpu_value = (GpuValue*)gpu; - paddle::ps::DownpourFixedFeatureValue& cpu_fix = *((paddle::ps::DownpourFixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_fix.resize(7 + gpu_value->mf_size); - } - float* cpu_value = cpu_fix.data(); - cpu_value[1] = gpu_value->delta_score; - cpu_value[2] = gpu_value->show; - cpu_value[3] = gpu_value->clk; - cpu_value[4] = gpu_value->lr; - cpu_value[5] = gpu_value->lr_g2sum; - cpu_value[6] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_size; x++) { - cpu_value[x + 7] = gpu_value->mf[x]; - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - GpuValue* gpu_value = (GpuValue*)gpu; - paddle::distributed::FixedFeatureValue& cpu_value = *((paddle::distributed::FixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_value.resize(7 + gpu_value->mf_size); - } - cpu_value[2] = gpu_value->delta_score; - cpu_value[3] = gpu_value->show; - cpu_value[4] = gpu_value->clk; - cpu_value[5] = 
gpu_value->lr; - cpu_value[6] = gpu_value->lr_g2sum; - cpu_value[0] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_size; x++) { - cpu_value[x + 7] = gpu_value->mf[x]; - } - } -#endif -} - virtual void value_to_cvm(float** gpu_cvm, - const void* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream - ) { - value_to_cvm_impl(gpu_cvm, (GpuValue*)gpu_value, gpu_keys, slot_num, key_len, - slot_dim, total_length, hidden_size, value_size, stream); - } - virtual void grad_to_push(void* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - grad_to_push_impl((GpuPushValue*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, value_size, batch_size, slot_vector, stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 83d71ac912a31d..035478e7d7cfda 100755 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -54,12 +54,22 @@ class HashTable { virtual ~HashTable(); HashTable(const HashTable&) = delete; HashTable& operator=(const HashTable&) = delete; + void insert(const KeyType* d_keys, const ValType* d_vals, size_t len, + gpuStream_t stream); void insert(const KeyType* d_keys, size_t len, char* pool, size_t feature_value_size, size_t start_index, gpuStream_t stream); - void get(const KeyType* d_keys, ValType d_vals, size_t len, gpuStream_t stream); + void get(const KeyType* d_keys, ValType* d_vals, size_t len, + gpuStream_t stream); + void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream); void show(); + void dump_to_cpu(int devid, cudaStream_t stream); + template - void update(const KeyType* d_keys, const GradType* d_grads, size_t len, Sgd sgd, + void update(const KeyType* d_keys, const GradType* d_grads, size_t len, + Sgd sgd, gpuStream_t stream); + + template + void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, gpuStream_t stream); int size() { return container_->size(); } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 6e2543e4b38b0a..b4be4c1f8c35ca 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -26,6 +26,23 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + const typename Table::mapped_type* const vals, + size_t len) { + ReplaceOp op; + thrust::pair kv; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + kv.first = keys[i]; + kv.second = vals[i]; + auto it = table->insert(kv, op); + assert(it != table->end() && "error: insert fails: table is full"); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -45,40 +62,46 @@ __global__ void insert_kernel(Table* table, } } -template +template __global__ void search_kernel(Table* table, - const typename Table::key_type* const keys, - ValType* vals, size_t len, - size_t pull_feature_value_size) { + const typename 
Table::key_type* const keys, + typename Table::mapped_type* const vals, + size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); - char* d_value = (char*)(vals); if (it != table->end()) { - uint64_t offset = i * pull_feature_value_size; - ValType* cur = (ValType*)(d_value + offset); - ValType& input = *(ValType*)(it->second); - *cur = input; + vals[i] = it->second; } else { - if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); - ValType* cur = (ValType*)(d_value + i * pull_feature_value_size); - *cur = ValType(); + printf("pull miss key: %llu", keys[i]); } } } -template -__global__ void update_kernel(Table* table, +template +__global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, - const GradType* grads, curandState* p_state, size_t len, - Sgd sgd, size_t grad_value_size) { + char* vals, size_t len, + size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - char* grads_tmp = (char*)(grads); - GradType* cur = (GradType*)(grads_tmp + i * grad_value_size); - sgd.update_value((it.getter())->second, *cur, p_state[i]); + uint64_t offset = i * pull_feature_value_size; + FeatureValue* cur = (FeatureValue*)(vals + offset); + FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } } else { if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); } @@ -147,6 +170,72 @@ class CuRandState { curandState* states_ = nullptr; }; +template +__global__ void update_kernel(Table* table, + const typename Table::key_type* const keys, + const GradType* const grads, curandState* p_state, + size_t len, Sgd sgd) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + sgd.update_value((it.getter())->second, grads[i], p_state[i]); + } else { + printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void update_kernel(Table* table, + const typename Table::key_type* const keys, + const GradType* const grads, size_t len, + Sgd sgd) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + sgd.update_value((it.getter())->second, grads[i]); + } else { + printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void dy_mf_update_kernel(Table* table, + const typename Table::key_type* const keys, + const char* const grads, size_t len, + Sgd sgd, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); + sgd.dy_mf_update_value((it.getter())->second, *cur); + } else { + if (keys[i] != 0) printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void dy_mf_update_kernel(Table* table, + const typename Table::key_type* const keys, + const char* const grads, curandState* p_state, size_t len, + Sgd sgd, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i 
< len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); + sgd.dy_mf_update_value((it.getter())->second, *cur, p_state[i]); + } else { + if(keys[i] != 0) printf("push miss key: %llu", keys[i]); + } + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -164,15 +253,39 @@ void HashTable::show() { } template -void HashTable::get(const KeyType* d_keys, ValType d_vals, size_t len, gpuStream_t stream) { +void HashTable::get(const KeyType* d_keys, ValType* d_vals, + size_t len, gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + search_kernel<<>>(container_, d_keys, + d_vals, len); +} + +template +void HashTable::get(const KeyType* d_keys, char* d_vals, + size_t len, gpuStream_t stream) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - search_kernel<<>>( + dy_mf_search_kernel<<>>( container_, d_keys, d_vals, len, pull_feature_value_size_); } +template +void HashTable::insert(const KeyType* d_keys, + const ValType* d_vals, size_t len, + gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>(container_, d_keys, + d_vals, len); +} + template void HashTable::insert(const KeyType* d_keys, size_t len, char* pool, size_t feature_value_size, @@ -189,6 +302,87 @@ void HashTable::insert(const KeyType* d_keys, size_t len, container_, d_keys, len, pool, feature_value_size, start_index); } +template +void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { + container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; + size_t num = container_->size(); + KeyType unuse_key = std::numeric_limits::max(); + thrust::pair* kv = container_->data(); + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + // useless + if (cpu_val[6] <= 0) { + cpu_val[6] = gpu_val.slot * -1; + } else { + cpu_val[6] = gpu_val.slot; + } + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } + } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[2] = gpu_val.delta_score; + cpu_val[3] = gpu_val.show; + cpu_val[4] = gpu_val.clk; + cpu_val[5] = gpu_val.lr; + cpu_val[6] = gpu_val.lr_g2sum; + cpu_val[0] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = 
gpu_val.mf[x]; + } + } +#endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + + // container_->prefetch(devid, stream); +} + template template void HashTable::update(const KeyType* d_keys, @@ -197,10 +391,26 @@ void HashTable::update(const KeyType* d_keys, if (len == 0) { return; } - const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; auto state = CuRandState::get(); auto d_state = state->get(len, stream); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; update_kernel<<>>( + container_, d_keys, d_grads, d_state, len, sgd); + CuRandState::push(state, stream); +} + +template +template +void HashTable::update(const KeyType* d_keys, + const char* d_grads, size_t len, + Sgd sgd, gpuStream_t stream) { + if (len == 0) { + return; + } + auto state = CuRandState::get(); + auto d_state = state->get(len, stream); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + dy_mf_update_kernel<<>>( container_, d_keys, d_grads, d_state, len, sgd, push_grad_value_size_); CuRandState::push(state, stream); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 2c5e8cb90d4e02..e0ce6f55ba66e4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -37,6 +37,43 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +struct CustomGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + return out; + } + + template + __device__ __forceinline__ + void copy_basic_field(T& output, const T& input) { + output.slot = input.slot; + output.show = input.show; + output.clk = input.clk; + output.mf_dim = input.mf_dim; + output.lr_g = input.lr_g; + for(int i = 0; i < output.mf_dim ; ++i) { + output.mf_g[i] = input.mf_g[i]; + } + } + template + __device__ __forceinline__ + void add_basic_field(T& output, const T& input) { + output.show += input.show; + output.clk += input.clk; + output.lr_g += input.lr_g; + for(int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; + } + } +}; + template class HeterComm { public: @@ -44,40 +81,66 @@ class HeterComm { virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete; - int get_index_by_devid(int devid); - void set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size); - void set_multi_mf_dim(int max_mf_dim); - void show_one_table(int gpu_num); + + void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, + int* left, int* right, int gpu_num); + void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len); // NOLINT + void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, float* mf, + size_t len, int& uniq_len); + void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, + size_t chunk_size, int stream_num); void build_ps(int num, KeyType* h_keys, char* pool, size_t len, size_t feature_value_size, size_t 
chunk_size, int stream_num); - void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void dump(); + void show_one_table(int gpu_num); + int get_index_by_devid(int devid); + template - void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); + void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd); // NOLINT + template void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads, - size_t len, Sgd& sgd); -private: - void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, - int* left, int* right, int gpu_num); - void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); - void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, - KeyType* src_key, char* src_val, size_t val_size); - void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, - char* src_val, size_t val_size); - void destroy_storage(int start_index, int end_index); - void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len); - int log2i(int x); - int gather_one_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); - int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); + size_t len, Sgd& sgd); // NOLINT + template - void update_one_table(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); + void update_one_table(int num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd); // NOLINT + + int gather_one_node_grad(int num, KeyType* d_keys, GradType* d_grads, + int len); + + int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, + int len); + + int log2i(int x); + + void set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { + nccl_inner_comms_ = inner_comms; + nccl_inter_comms_ = inter_comms; + node_size_ = comm_size; + } + + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + VLOG(3) << "heter comm set multi multi_mf_dim_: " << multi_mf_dim_ << " max_mf_dim_: " << max_mf_dim_; + } + bool need_transfer(int send_id, int receive_id) { return ((send_id / 4 != receive_id / 4) && (send_id + 4) % 8 != receive_id); } + + // void dump_to_cpu(int index); + + void end_pass(); + int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } - void init_path(); struct Node { cudaStream_t in_stream; @@ -89,14 +152,17 @@ class HeterComm { size_t val_bytes_len; int gpu_num; }; + struct Path { std::vector nodes_; }; + struct CopyTask { Path* path; int step; CopyTask(Path* path_, int step_) : path(path_), step(step_) {} }; + struct LocalStorage { LocalStorage() {} void init(int size, int dev_id) { @@ -135,13 +201,23 @@ class HeterComm { GradType* local_grads; }; - - - + void init_path(); + + void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); + void destroy_storage(int start_index, int end_index); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, GradType* src_val); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, char* src_val, size_t val_size); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + ValType* src_val); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + char* src_val, size_t val_size); protected: using Table = HashTable; using PtrTable = HashTable; + std::vector 
tables_; std::vector ptr_tables_; std::shared_ptr resource_; std::vector> path_; @@ -150,6 +226,7 @@ class HeterComm { private: std::vector storage_; + CustomGradMerger merger_; int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; @@ -157,6 +234,7 @@ class HeterComm { std::vector nccl_inter_comms_; int node_size_; std::vector> allocators_; + int multi_mf_dim_{8}; int max_mf_dim_ = 8; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index f54d33fb1c945b..156035da41a1cf 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace framework { -//填充索引 template __global__ void fill_idx(T* idx, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -28,17 +27,18 @@ __global__ void fill_idx(T* idx, size_t len) { } } -//计算key归属在哪个gpu里面 -template -__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, - int total_gpu) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - shard_index[i] = d_keys[i] % total_gpu; +template +void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) { + T tmp[len]; // NOLINT + cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + std::cout << name; + for (int i = 0; i < len; ++i) { + std::cout << ":" << tmp[i]; } + std::cout << std::endl; } -//计算每个卡归属key的起始地址 template __global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -56,7 +56,15 @@ __global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { } } -//按照索引进行key的重排 +template +__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, + int total_gpu) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + shard_index[i] = d_keys[i] % total_gpu; + } +} + template __global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, size_t len) { @@ -66,21 +74,19 @@ __global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, } } -//填充结果 -template -__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, - size_t len, size_t val_size) { +template +__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, + T* idx, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { - uint64_t new_offset = uint64_t(idx[i]) * val_size; - *(ValType*)((char*)d_vals + new_offset) = - *(ValType*)((char*)d_shard_vals + i * val_size); + d_shard_keys[i] = d_keys[idx[i]]; + d_shard_grads[i] = d_grads[idx[i]]; } } -//填充结果 template -__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, +__global__ void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, GradType* d_grads, T* idx, size_t len, size_t grad_value_size) { @@ -92,30 +98,51 @@ __global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, } } -/* 相同key的梯度聚合到一起*/ -template __global__ void merge_gradient_kernel(const uint32_t* offset, const uint32_t* fea_num, - const uint32_t* index, GradType* input, - GradType* output, int n, - size_t grad_value_size) { + const uint32_t* index, const char* input, + char* output, int n, + size_t grad_value_size, + CustomGradMerger& merger_) { 
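  // Sketch of what this kernel computes, assuming offset / fea_num / index were
  // produced by the radix-sort + DeviceRunLengthEncode::Encode +
  // DeviceScan::ExclusiveSum sequence in merge_grad below: thread i owns the
  // i-th de-duplicated key, copies the first gradient belonging to that key and
  // accumulates the remaining ones. Values below are hypothetical, and
  // grad_at / accumulate / write_at are placeholder helpers standing in for the
  // strided char* accesses used in the body.
  //
  //   raw keys             : {7, 3, 7, 3}   sorted keys : {3, 3, 7, 7}
  //   index  (sorted->raw) : {1, 3, 0, 2}
  //   fea_num (run length) : {2, 2}
  //   offset  (excl. scan) : {0, 2}
  //
  //   // host-side equivalent, one iteration per unique key:
  //   for (int i = 0; i < n; ++i) {
  //     FeaturePushValue merged = grad_at(input, index[offset[i]]);   // copy_basic_field
  //     for (uint32_t j = 1; j < fea_num[i]; ++j) {
  //       accumulate(merged, grad_at(input, index[offset[i] + j]));   // add_basic_field
  //     }
  //     write_at(output, i, merged);  // both buffers are strided by grad_value_size
  //   }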
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - char* tmp_in = (char*)(input); - char* tmp_out = (char*)(output); - GradType& out_value = *((GradType*)(tmp_out + size_t(i) * grad_value_size)); - GradType& in_value = *((GradType*)(tmp_in + size_t(ori_index) * grad_value_size)); - out_value = in_value; + + FeaturePushValue& lhs = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& in = + *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.copy_basic_field(lhs, in); + for (int j = 1; j < num; ++j) { ori_index = index[start + j]; - GradType& in_value_tmp = *((GradType*)(tmp_in + size_t(ori_index) * grad_value_size)); - out_value += in_value_tmp; + FeaturePushValue& rhs = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.add_basic_field(lhs, rhs); } - } + } + +} + +template +__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_vals[idx[i]] = d_shard_vals[i]; + } +} + +template +__global__ void dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + size_t len, size_t val_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint64_t new_offset = uint64_t(idx[i]) * val_size; + *(ValType*)((char*)d_vals + new_offset) = + *(ValType*)((char*)d_shard_vals + i * val_size); + } } template @@ -124,16 +151,22 @@ HeterComm::HeterComm( VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_gpu()); + multi_mf_dim_ = resource->multi_mf(); for (int i = 0; i < resource_->total_gpu(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); // allocators_.push_back(std::make_shared( // 2, 1, 20, (size_t)-1, false, false)); // NOLINT allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); - { + if (!multi_mf_dim_) { + auto table = new Table(capacity / load_factor_); + tables_.push_back(table); + } else { max_mf_dim_ = resource->max_mf_dim(); - size_t val_type_size = g_transfor->get_gpu_value_size(max_mf_dim_); - size_t grad_type_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); + size_t val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + size_t grad_type_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); auto ptr_table = new PtrTable(capacity / load_factor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); @@ -147,445 +180,148 @@ HeterComm::HeterComm( } template -HeterComm::~HeterComm() { - for (auto& table : ptr_tables_) { - delete table; - table = nullptr; - } -} - -/* - 通过gpu设备id,获得索引id,内部的一个映射关系 -*/ -template -int HeterComm::get_index_by_devid(int devid) { - return resource_->get_index_by_devid(devid); -} +void HeterComm::init_path() { + int total_gpu = resource_->total_gpu(); + path_.resize(total_gpu); -/* - 参数设置 -*/ -template -void HeterComm::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { - nccl_inner_comms_ = inner_comms; - nccl_inter_comms_ = inter_comms; - node_size_ = comm_size; + if (!topo_aware_) { + VLOG(0) << "init path without topo aware"; + for (int i = 0; i < total_gpu; ++i) { + path_[i].resize(total_gpu); + for (int j = 0; j < total_gpu; ++j) { + auto& nodes = path_[i][j].nodes_; + 
nodes.resize(1); + nodes[0].in_stream = resource_->comm_stream(i, j); + nodes[0].out_stream = resource_->comm_stream(i, j); + nodes[0].key_storage = NULL; + nodes[0].val_storage = NULL; + nodes[0].sync = 0; + nodes[0].gpu_num = j; + } + } + } else { + VLOG(0) << "init path with topo aware"; + for (int i = 0; i < total_gpu; ++i) { + path_[i].resize(total_gpu); + for (int j = 0; j < total_gpu; ++j) { + auto& nodes = path_[i][j].nodes_; + int from = resource_->dev_id(i); + int to = resource_->dev_id(j); + int transfer_id = i; + if (need_transfer(from, to)) { + transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); + nodes.push_back(Node()); + Node& node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 1; + node.gpu_num = transfer_id; + } + nodes.push_back(Node()); + Node& node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 0; + node.gpu_num = j; + } + } + } } -/* - 参数设置 -*/ template -void HeterComm::set_multi_mf_dim(int max_mf_dim) { - max_mf_dim_ = max_mf_dim; -} +void HeterComm::create_storage(int start_index, + int end_index, + size_t keylen, + size_t vallen) { + auto& allocator = allocators_[start_index]; + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( + resource_->dev_id(nodes[i].gpu_num), + (void**)&(nodes[i].key_storage), // NOLINT + keylen, resource_->remote_stream(nodes[i].gpu_num, start_index))); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( + resource_->dev_id(nodes[i].gpu_num), + (void**)&(nodes[i].val_storage), // NOLINT + vallen, resource_->remote_stream(nodes[i].gpu_num, start_index))); -template -void HeterComm::show_one_table(int gpu_num) { - //todo impl + nodes[i].key_bytes_len = keylen; + nodes[i].val_bytes_len = vallen; + } } -/* 建表 - num: 是哪一个gpu - h_keys: 具体的key(cpu内存) - pool: 具体的value(gpu内存) - len: key的数量 - feature_value_size: value的大小(value可能是变长的) - chunk_size: 内部需要,多少个key做一次处理(多流加速处理) - stream_num: 流数量 -*/ template -void HeterComm::build_ps(int num, KeyType* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { - if (len <= 0) { - return; - } - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - std::vector d_key_bufs; - std::vector h_pinned_key_bufs; - gpuStream_t streams[stream_num]; - for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); - auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); - d_key_bufs.push_back(std::move(d_k_buf)); - auto h_k_buf = memory::Alloc(phi::GPUPinnedPlace(), chunk_size * sizeof(KeyType)); - h_pinned_key_bufs.push_back(std::move(h_k_buf)); - } - - int cur_len = 0; - int cur_stream = 0; - while (cur_len < len) { - cur_stream = cur_stream % stream_num; - cudaStreamSynchronize(streams[cur_stream]); - int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; - memcpy(h_pinned_key_bufs[cur_stream]->ptr(), h_keys + cur_len, sizeof(KeyType) * tmp_len); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_pinned_key_bufs[cur_stream]->ptr(), - sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - ptr_tables_[num]->insert( - reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, - pool, feature_value_size, cur_len, streams[cur_stream]); - cur_stream += 1; - cur_len += tmp_len; - } +void HeterComm::destroy_storage(int start_index, + int end_index) { + auto& allocator = allocators_[start_index]; + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - for (int i = 0; i < stream_num; ++i) { - cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + nodes[i].key_storage)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + nodes[i].val_storage)); } } -/* 查表操作 - num: 是哪个gpu - d_keys: 要查的key - d_vals: 查出来的结果存入这个地方(table里面的value是指针,这里需要构造出value本体出来放在这里) - len: 有多少个key -*/ template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - ValType* d_vals, - size_t len) { - if (len == 0) { - return; +void HeterComm::walk_to_dest( + int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, + GradType* src_val) { + int need_copy_val = 0; + if (src_val) { + need_copy_val = 1; } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(num, 0); - - int grid_size = (len - 1) / block_size_ + 1; - - auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - int* h_left = reinterpret_cast(h_left_alloc->ptr()); - int* h_right = reinterpret_cast(h_right_alloc->ptr()); - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - size_t val_type_size = g_transfor->get_gpu_value_size(max_mf_dim_); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); - KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * val_type_size); - ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - - //将key切分到不同的卡里面去(这里计算出来的只是中间结果) - split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - //将零散的key进行重排,保证归属在相同gpu卡里面的key连续存储,并对应上d_left_ptr&d_right_ptr - fill_shard_key<<>>(d_shard_keys_ptr, - d_keys, d_idx_ptr, len); - - cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - //创建分卡显存资源 - for (int i = 0; i < total_gpu; ++i) { - int 
shard_len = h_right[i] - h_left[i] + 1; + std::queue que; + for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); - } - - //将数据转移到目的gpu卡中 - walk_to_dest(num, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL, 0); - - std::vector time_lines; - time_lines.resize(total_gpu); - - //开始查表操作了 - for (int i = 0; i < total_gpu; ++i) { - time_lines[i].Start(); - if (h_left[i] == -1) { - continue; + int size = path_[start_index][i].nodes_.size(); + auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); + que.push(t); + cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + if (need_copy_val) { + cudaMemcpyAsync(node.val_storage, + reinterpret_cast(src_val + h_left[i]), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream); } - auto& node = path_[num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - - ptr_tables_[i]->rwlock_->RDLock(); - ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, num)); - if (h_left[i] == -1) { - continue; - } - ptr_tables_[i]->rwlock_->UNLock(); - time_lines[i].Pause(); - } - - //将各卡查到的结果转移到当前卡中来 - walk_to_src(num, total_gpu, h_left, h_right, reinterpret_cast(d_shard_vals_ptr), val_type_size); - - for (int i = 0; i < total_gpu; ++i) { - auto& node = path_[num][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); - } - - //填充结果 - fill_dvals<<>>( - d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size); - - //资源销毁 - cudaStreamSynchronize(stream); - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - destroy_storage(num, i); - } -} - -/* - * 梯度更新操作 - * gpu_num 是哪个gpu卡 - * d_keys: key指针 - * d_grads: 梯度数据 - * len: key长度 - * sgd: 优化器 -*/ -template -template -void HeterComm::push_sparse(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); - - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); - - // int h_left[total_gpu]; // NOLINT - // int h_right[total_gpu]; // NOLINT - auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - int* h_left = reinterpret_cast(h_left_alloc->ptr()); - int* h_right = reinterpret_cast(h_right_alloc->ptr()); - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); - KeyType* 
d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - - auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); - - int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); - - int grid_size = (uniq_len - 1) / block_size_ + 1; - - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - gpu_num); - - fill_shard_grads<<>>( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, grad_value_size); - - - cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_right[i] - h_left[i] + 1; - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - { - create_storage(gpu_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); - } - } - - { - walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); - } - - std::vector time_lines; - time_lines.resize(total_gpu); - - for (int i = 0; i < total_gpu; ++i) { - time_lines[i].Start(); - if (h_left[i] == -1 || h_right[i] == -1) { - continue; + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + if (cur_task.path->nodes_[cur_task.step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); } - auto& node = path_[gpu_num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - ptr_tables_[i]->rwlock_->WRLock(); - ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, gpu_num)); - } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); - if (h_left[i] != -1) { - { - ptr_tables_[i]->rwlock_->UNLock(); + if (cur_task.step != cur_task.path->nodes_.size() - 1) { + int cur_step = cur_task.step; + CopyTask c(cur_task.path, cur_step + 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + if (need_copy_val) { + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); } - time_lines[i].Pause(); - } - } - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; } - destroy_storage(gpu_num, i); - } -} - -/* 暂时没用到,看性能是否需要启用, 并优化 - 相比push_sparse,是用nccl做卡间的通信 -*/ -template -template -void HeterComm::push_sparse_multi_node( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - - int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); - - uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); - - uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len); - - update_one_table(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len, sgd); -} - - - - -/* 
将key拆分,看归属在哪个gpu里面 - d_keys: key列表 - d_idx_ptr: 结果, 按照gpu卡分块好的对应原始d_keys的下标 - left: 每个卡的首地址 - right: 每个卡的结束地址 - d_idx_ptr/left/right : 这三个值就知道了归属到每个卡的key列表了 -*/ -template -void HeterComm::split_input_to_shard( - KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, - int gpu_num) { - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); - - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); - int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); - int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); - int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); - - int grid_size = (len - 1) / block_size_ + 1; - fill_idx<<>>(d_idx_tmp_ptr, len); - calc_shard_index<<>>( - d_keys, len, d_shard_index_tmp_ptr, total_gpu); - - size_t temp_storage_bytes; - const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - calc_shard_offset<<>>(d_shard_index_ptr, - left, right, len); - cudaStreamSynchronize(stream); -} - -/* 资源创建,因为要分卡操作,所以需要创建不同卡的资源 - start_index: 当前op计算的卡编号 - end_index: 要创建的哪个卡资源的卡编号 - keylen: key相关的显存大小 - vallen: value相关的显存大小 -*/ -template -void HeterComm::create_storage(int start_index, - int end_index, - size_t keylen, - size_t vallen) { - auto& allocator = allocators_[start_index]; - auto& nodes = path_[start_index][end_index].nodes_; - for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), - (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].gpu_num, start_index))); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), - (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].gpu_num, start_index))); - - nodes[i].key_bytes_len = keylen; - nodes[i].val_bytes_len = vallen; } } -/* - 将数据转入到目的gpu卡中 -*/ template void HeterComm::walk_to_dest( int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, @@ -638,12 +374,9 @@ void HeterComm::walk_to_dest( } } -/* - *将不同卡中的结果转移到当前卡中 -*/ template void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size) { + int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val) { std::queue que; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -652,7 +385,7 @@ void HeterComm::walk_to_src( int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { - cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), node.val_storage, node.val_bytes_len, 
cudaMemcpyDefault, node.out_stream); } else { @@ -682,7 +415,7 @@ void HeterComm::walk_to_src( cur_task.path->nodes_[cur_step - 1].out_stream); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().gpu_num; - cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, @@ -691,146 +424,689 @@ void HeterComm::walk_to_src( } } -/* - * 资源销毁 -*/ template -void HeterComm::destroy_storage(int start_index, - int end_index) { - auto& allocator = allocators_[start_index]; - auto& nodes = path_[start_index][end_index].nodes_; - for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), - nodes[i].key_storage)); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), - nodes[i].val_storage)); +void HeterComm::walk_to_src( + int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size) { + std::queue que; + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + int cur_step = path_[start_index][i].nodes_.size() - 1; + auto& node = path_[start_index][i].nodes_[cur_step]; + if (cur_step == 0) { + cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } else { + CopyTask t(&path_[start_index][i], cur_step - 1); + que.push(t); + cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + int cur_step = cur_task.step; + if (cur_task.path->nodes_[cur_step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); + } + if (cur_step > 0) { + CopyTask c(cur_task.path, cur_step - 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); + } else if (cur_step == 0) { + int end_index = cur_task.path->nodes_.back().gpu_num; + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); + } + } +} + +template +HeterComm::~HeterComm() { + if (!multi_mf_dim_) { + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } else { + for (auto& table : ptr_tables_) { + delete table; + table = nullptr; + } + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } +} + +template +void HeterComm::show_one_table(int gpu_num) { + if (!multi_mf_dim_) { + tables_[gpu_num]->show(); + } else { + // ptr_tables_[gpu_num]->show(); + } +} + +template +int HeterComm::log2i(int x) { + unsigned res = 0; + while (x >>= 1) { + ++res; + } + return res; +} + +template +int HeterComm::get_index_by_devid(int devid) { + return resource_->get_index_by_devid(devid); +} + +template +void HeterComm::build_ps(int num, KeyType* h_keys, + ValType* h_vals, + size_t 
len, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + + std::vector d_key_bufs; + std::vector d_val_bufs; + + gpuStream_t streams[stream_num]; // NOLINT + for (int i = 0; i < stream_num; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + auto d_v_buf = memory::Alloc(place, chunk_size * sizeof(ValType)); + d_key_bufs.push_back(std::move(d_k_buf)); + d_val_bufs.push_back(std::move(d_v_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, + sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, + sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, + streams[cur_stream]); + cur_stream += 1; + cur_len += tmp_len; + } + + for (int i = 0; i < stream_num; ++i) { + cudaStreamSynchronize(streams[i]); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + } +} + +template +void HeterComm::build_ps(int num, KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + + // use hbm pool + std::vector d_key_bufs; + + gpuStream_t streams[stream_num]; + for (int i = 0; i < stream_num; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + d_key_bufs.push_back(std::move(d_k_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, + sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + ptr_tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, + pool, feature_value_size, cur_len, streams[cur_stream]); + cur_stream += 1; + cur_len += tmp_len; + } + + for (int i = 0; i < stream_num; ++i) { + cudaStreamSynchronize(streams[i]); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + } +} + +template +void HeterComm::merge_grad( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len) { // NOLINT + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, + d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + temp_storage_bytes = 0; + + auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); + int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, + d_grads, d_num_runs_out, merger_, len, stream, false)); + + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); + + cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + +template +void HeterComm::merge_grad(int gpu_num, + KeyType* d_keys, + GradType* d_grads, + float* mf, size_t len, + int& uniq_len) { + platform::Timer timeline; + timeline.Start(); + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + //VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + auto d_fea_num_info = + memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); + uint32_t* d_fea_num_info_ptr = + 
reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; + uint32_t* d_idx = (uint32_t*)&d_index[len]; + int* d_merged_size = (int*)&d_idx[len]; + int grid_size = (len - 1) / block_size_ + 1; + fill_idx<<>>(d_idx, len); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, + 0, 8 * sizeof(KeyType), stream)); + + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, + d_merged_size, len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); + + cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + + assert(d_merged_size > 0); + uint32_t* d_offset = (uint32_t*)&d_index[len]; + + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, + stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + grid_size = (uniq_len - 1) / block_size_ + 1; + merge_gradient_kernel<<>>( + d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); +} + +template +void HeterComm::split_input_to_shard( + KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, + int gpu_num) { + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); + + auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); + + auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); + + int grid_size = (len - 1) / 
block_size_ + 1; + fill_idx<<>>(d_idx_tmp_ptr, len); + calc_shard_index<<>>( + d_keys, len, d_shard_index_tmp_ptr, total_gpu); + + size_t temp_storage_bytes; + const int num_bits = 1 + log2i(total_gpu); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, + d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, + d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + calc_shard_offset<<>>(d_shard_index_ptr, + left, right, len); + cudaStreamSynchronize(stream); +} + +template +void HeterComm::pull_sparse(int num, + KeyType* d_keys, + ValType* d_vals, + size_t len) { + if (len == 0) { + return; + } + + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(num, 0); + + int grid_size = (len - 1) / block_size_ + 1; + + auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + int* h_left = reinterpret_cast(h_left_alloc->ptr()); + int* h_right = reinterpret_cast(h_right_alloc->ptr()); + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + size_t val_type_size = 0; + if (!multi_mf_dim_) { + val_type_size = sizeof(ValType); + } else { + val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + } + + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, len * val_type_size); + ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + + split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); + + fill_shard_key<<>>(d_shard_keys_ptr, + d_keys, d_idx_ptr, len); + + cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + create_storage(num, i, shard_len * sizeof(KeyType), + shard_len * val_type_size); + } + + walk_to_dest(num, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + std::vector time_lines; + time_lines.resize(total_gpu); + + for (int i = 0; i < total_gpu; ++i) { + time_lines[i].Start(); + if (h_left[i] == -1) { + continue; + } + auto& node = path_[num][i].nodes_.back(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + 
reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); + } else { + ptr_tables_[i]->rwlock_->RDLock(); + ptr_tables_[i]->get(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); + } + } + for (int i = 0; i < total_gpu; ++i) { + cudaStreamSynchronize(resource_->remote_stream(i, num)); + if (h_left[i] == -1) { + continue; + } + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } + time_lines[i].Pause(); + } + + if (!multi_mf_dim_) { + walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + } else { + walk_to_src(num, total_gpu, h_left, h_right, reinterpret_cast(d_shard_vals_ptr), val_type_size); + } + + for (int i = 0; i < total_gpu; ++i) { + auto& node = path_[num][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + + if (!multi_mf_dim_) { + fill_dvals<<>>(d_shard_vals_ptr, d_vals, + d_idx_ptr, len); + } else { + dy_mf_fill_dvals<<>>( + d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size); + } + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + destroy_storage(num, i); + } +} + +template +template +void HeterComm::push_sparse(int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; } -} -/* 相同key的梯度做聚合,聚合到一起 - * gpu_num 归属哪个卡 - * d_keys 要处理的key(input/output) - * d_grads 要处理的value(input/output) - * len key的长度 - * uniq_len 去重后key的长度 -*/ -template -void HeterComm::merge_grad(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len) { - platform::Timer timeline; - timeline.Start(); + int total_gpu = resource_->total_gpu(); int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); - size_t temp_storage_bytes; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); - //VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); + // int h_left[total_gpu]; // NOLINT + // int h_right[total_gpu]; // NOLINT + auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + int* h_left = reinterpret_cast(h_left_alloc->ptr()); + int* h_right = reinterpret_cast(h_right_alloc->ptr()); - auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); - //排序后的key - KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); - auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_fea_num_info = - memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); - uint32_t* d_fea_num_info_ptr = - 
reinterpret_cast(d_fea_num_info->ptr()); - //经过排序后的key的索引下标 - uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; - //对应原始key的索引下标 - uint32_t* d_idx = (uint32_t*)&d_index[len]; - int* d_merged_size = (int*)&d_idx[len]; - int grid_size = (len - 1) / block_size_ + 1; - //填充原始key的索引下标 - fill_idx<<>>(d_idx, len); - //对key做排序, d_merge_keys_ptr就是排序后好的key, d_index这个就是key的索引,可以用来反解出来排序后好的key - //对应原始的那个位置的key - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - void* d_buff = NULL; - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - temp_storage_bytes = 0; - //对排序好后的key做一个去重,并统计数量 - //d_keys,去重后的key,d_fea_num_info_ptr对应的key的数量, d_merged_size去重后的key数量 - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); - if (d_temp_storage->size() < temp_storage_bytes) { - d_temp_storage = NULL; - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + + int uniq_len = len; + merge_grad(gpu_num, d_keys, d_grads, NULL, len, uniq_len); + + int grid_size = (uniq_len - 1) / block_size_ + 1; + + split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, + gpu_num); + + if (!multi_mf_dim_) { + fill_shard_grads<<>>( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len); + } else { + dy_mf_fill_shard_grads<<>>( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - assert(d_merged_size > 0); - uint32_t* d_offset = (uint32_t*)&d_index[len]; + cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); - //做一个前缀求和 - temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); - if (d_temp_storage->size() < temp_storage_bytes) { - d_temp_storage = NULL; - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + if (!multi_mf_dim_) { + create_storage(gpu_num, i, shard_len * sizeof(KeyType), + shard_len * sizeof(GradType)); + } else { + create_storage(gpu_num, i, shard_len * sizeof(KeyType), + shard_len * grad_value_size); + } } 
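  // Worked sizing example for the per-destination buffers created above (numbers
  // are illustrative, not taken from the patch): in the dynamic-mf branch each
  // gradient slot occupies
  //   grad_value_size = TYPEALIGN(8, sizeof(FeaturePushValue) + max_mf_dim_ * sizeof(float));
  // if, say, sizeof(FeaturePushValue) were 40 bytes and max_mf_dim_ == 8, that is
  //   TYPEALIGN(8, 40 + 32) = 72 bytes per key,
  // so a shard of 1000 keys reserves 1000 * sizeof(KeyType) bytes of key storage
  // and 1000 * 72 bytes of gradient storage on the destination GPU, whereas the
  // fixed-dim branch only needs sizeof(GradType) per key.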
- PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - grid_size = (uniq_len - 1) / block_size_ + 1; - merge_gradient_kernel<<>>( - d_offset, d_fea_num_info_ptr, d_index, (GradType*)d_grads, - (GradType*)d_merge_grads_ptr, uniq_len, grad_value_size); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + if (!multi_mf_dim_) { + walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + } else { + walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), grad_value_size); + } - timeline.Pause(); - timeline.Start(); + std::vector time_lines; + time_lines.resize(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); + for (int i = 0; i < total_gpu; ++i) { + time_lines[i].Start(); + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[gpu_num][i].nodes_.back(); + cudaStreamSynchronize(node.in_stream); + + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, gpu_num)); + } else { + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, gpu_num)); + } + } + for (int i = 0; i < total_gpu; ++i) { + cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); + if (h_left[i] != -1) { + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } + time_lines[i].Pause(); + } + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + destroy_storage(gpu_num, i); + } } template -int HeterComm::log2i(int x) { - unsigned res = 0; - while (x >>= 1) { - ++res; +template +void HeterComm::update_one_table( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; } - return res; + + int dev_id = resource_->dev_id(gpu_num); + platform::CUDADeviceGuard guard(dev_id); + tables_[gpu_num]->rwlock_->WRLock(); + tables_[gpu_num]->update(d_keys, d_grads, len, sgd, + resource_->remote_stream(gpu_num, gpu_num)); + tables_[gpu_num]->rwlock_->UNLock(); + cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); +} + +template +template +void HeterComm::push_sparse_multi_node( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; + } + + int uniq_len = len; + merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); + + uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); + + uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, uniq_len); + + update_one_table(gpu_num, storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, uniq_len, sgd); } template @@ -910,8 +1186,7 @@ int HeterComm::gather_one_node_grad( fill_shard_grads<<>>( storage.local_keys + merge_num, storage.all_keys + index, 
storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, - g_transfor->get_gpu_push_value_size(max_mf_dim_)); + d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -981,76 +1256,35 @@ int HeterComm::gather_multi_node_grad( } template -template -void HeterComm::update_one_table( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - - int dev_id = resource_->dev_id(gpu_num); - platform::CUDADeviceGuard guard(dev_id); - ptr_tables_[gpu_num]->rwlock_->WRLock(); - ptr_tables_[gpu_num]->update(d_keys, d_grads, len, sgd, - resource_->remote_stream(gpu_num, gpu_num)); - ptr_tables_[gpu_num]->rwlock_->UNLock(); - cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); -} - -template -void HeterComm::init_path() { +void HeterComm::end_pass() { int total_gpu = resource_->total_gpu(); - path_.resize(total_gpu); + std::vector threads; - if (!topo_aware_) { - VLOG(0) << "init path without topo aware"; + auto dump_to_cpu_func = [this](int index) { + auto stream = resource_->local_stream(index, 0); + int dev_id = resource_->dev_id(index); + platform::CUDADeviceGuard guard(dev_id); + tables_[index]->dump_to_cpu(dev_id, stream); + }; + + if (!multi_mf_dim_) { for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { - auto& nodes = path_[i][j].nodes_; - nodes.resize(1); - nodes[0].in_stream = resource_->comm_stream(i, j); - nodes[0].out_stream = resource_->comm_stream(i, j); - nodes[0].key_storage = NULL; - nodes[0].val_storage = NULL; - nodes[0].sync = 0; - nodes[0].gpu_num = j; - } + threads.push_back(std::thread(dump_to_cpu_func, i)); } - } else { - VLOG(0) << "init path with topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { - auto& nodes = path_[i][j].nodes_; - int from = resource_->dev_id(i); - int to = resource_->dev_id(j); - int transfer_id = i; - if (need_transfer(from, to)) { - transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); - nodes.push_back(Node()); - Node& node = nodes.back(); - node.in_stream = resource_->comm_stream(i, transfer_id); - node.out_stream = resource_->comm_stream(transfer_id, i); - node.key_storage = NULL; - node.val_storage = NULL; - node.sync = 1; - node.gpu_num = transfer_id; - } - nodes.push_back(Node()); - Node& node = nodes.back(); - node.in_stream = resource_->comm_stream(i, transfer_id); - node.out_stream = resource_->comm_stream(transfer_id, i); - node.key_storage = NULL; - node.val_storage = NULL; - node.sync = 0; - node.gpu_num = j; - } + for (auto& t : threads) { + t.join(); } } } +// template +// void HeterComm::dump_to_cpu(int index) { +// auto stream = resource_->local_stream(index, 0); +// int dev_id = resource_->dev_id(index); +// platform::CUDADeviceGuard guard(dev_id); +// tables_[index]->dump_to_cpu(dev_id, stream); +//} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 2251d28be7ed82..2fa4c08d20d0b2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -14,9 +14,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h" #ifdef PADDLE_WITH_HETERPS @@ -24,61 +21,57 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource, - std::string accessor_type, std::string gpu_value_type) { - if (accessor_type == "DownpourCtrDymfAccessor" && gpu_value_type == "DyFeatureValue") { - return new HeterPs(capacity, resource); - } else if (accessor_type == "DownpourCtrAccessor" && gpu_value_type == "FeatureValue") { - return new HeterPs(capacity, resource); - } - return nullptr; + size_t capacity, std::shared_ptr resource) { + return new HeterPs(capacity, resource); } -template -HeterPs::HeterPs() { +HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { + comm_ = + std::make_shared>( + capacity, resource); + opt_ = Optimizer(); } -template -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = std::make_shared>(capacity, resource); - opt_ = Optimizer(); +HeterPs::~HeterPs() {} + +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, + size_t len) { + comm_->pull_sparse(num, d_keys, d_vals, len); } -template -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, void* d_vals, - size_t len) { - comm_->pull_sparse(num, d_keys, (ValType*)d_vals, len); +void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) { + comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); } -template -void HeterPs::build_ps(int num, KeyType* h_keys, char* pool, - size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) { +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, + size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) { comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -template -void HeterPs::set_nccl_comm_and_size( - const std::vector& inner_comms, - const std::vector& inter_comms, int comm_size) { - comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); +int HeterPs::get_index_by_devid(int devid) { + return comm_->get_index_by_devid(devid); } -template -void HeterPs::set_multi_mf_dim(int max_mf_dim) { - comm_->set_multi_mf_dim(max_mf_dim); -} +void HeterPs::end_pass() { comm_->end_pass(); } -template -int HeterPs::get_index_by_devid(int devid) { - return comm_->get_index_by_devid(devid); -} +void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -template -void HeterPs::push_sparse(int num, FeatureKey* d_keys, void* d_grads, size_t len) { - comm_->push_sparse(num, d_keys, (GradType*)d_grads, len, opt_); +void HeterPs::push_sparse(int num, FeatureKey* d_keys, + FeaturePushValue* d_grads, size_t len) { + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } +void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { + comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); +} +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); +} } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 0a41d2ee2ea51e..217f55e90bcae5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -23,38 +23,33 @@ limitations under the License. */ namespace paddle { namespace framework { -template class HeterPs : public HeterPsBase { -public: - HeterPs(); + public: + HeterPs() {} HeterPs(size_t capacity, std::shared_ptr resource); - virtual ~HeterPs() {} + virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; - virtual void pull_sparse(int num, FeatureKey* d_keys, void* d_vals, + virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, size_t len) override; - - virtual void build_ps(int num, KeyType* h_keys, char* pool, + virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) override; + virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) override; - virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) override; - - virtual void set_multi_mf_dim(int max_mf_dim) override; - + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; + virtual void end_pass() override; virtual int get_index_by_devid(int devid) override; - - virtual void show_one_table(int gpu_num) { - comm_->show_one_table(gpu_num); - }; - + virtual void show_one_table(int gpu_num) override; virtual void push_sparse(int num, FeatureKey* d_keys, - void* d_grads, size_t len) override; -private: - std::shared_ptr> comm_; - Optimizer opt_; + FeaturePushValue* d_grads, size_t len) override; + + private: + std::shared_ptr> comm_; + Optimizer opt_; }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 0024e2017a2dec..713c1123969624 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -30,22 +30,24 @@ class HeterPsBase { virtual ~HeterPsBase(){}; HeterPsBase(const HeterPsBase&) = delete; HeterPsBase& operator=(const HeterPsBase&) = delete; - virtual void pull_sparse(int num, FeatureKey* d_keys, void* d_vals, + + virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, size_t len) = 0; + virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) = 0; virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; - virtual void set_multi_mf_dim(int max_mf_dim) = 0; + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; + virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, - void* d_grads, size_t len) = 0; + FeaturePushValue* d_grads, size_t len) = 0; static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource, - std::string accessor_type, - std::string gpu_value_type); + std::shared_ptr resource); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h 
b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 5ca8a6abc42fd7..9189902c28ffb4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -29,11 +29,11 @@ class MemoryPool { : capacity_(capacity), block_size_(block_size) { VLOG(3) << "mem_pool init with block_size: " << block_size << " capacity: " << capacity; - mem_alloc_ = memory::Alloc(phi::GPUPinnedPlace(), block_size * capacity_); - mem_ = reinterpret_cast(mem_alloc_->ptr()); + mem_ = (char*)malloc(block_size * capacity_); } ~MemoryPool() { VLOG(3) << "mem pool delete"; + free(mem_); } size_t block_size() { return block_size_; } char* mem() { return mem_; } @@ -45,7 +45,6 @@ class MemoryPool { } private: - memory::allocation::AllocationPtr mem_alloc_; char* mem_ = NULL; size_t capacity_; size_t block_size_; @@ -61,13 +60,8 @@ class HBMMemoryPool : public managed { VLOG(3) << "hbm memory pool with capacity" << capacity_ << " bs: " << block_size_; cudaMalloc(&mem_, block_size_ * capacity_); - gpuStream_t streams; - cudaStreamCreate(&streams); - cudaMemcpyAsync(mem_, mem_pool->mem(), mem_pool->byte_size(), - cudaMemcpyHostToDevice, streams); - cudaStreamSynchronize(streams); - cudaStreamDestroy(streams); - + cudaMemcpy(mem_, mem_pool->mem(), mem_pool->byte_size(), + cudaMemcpyHostToDevice); } ~HBMMemoryPool() { @@ -77,6 +71,30 @@ class HBMMemoryPool : public managed { size_t block_size() { return block_size_; } + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity) { + cudaFree(mem_); + mem_ = NULL; + capacity_ = capacity; + cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8); + cudaMemset(mem_, 0, block_size_ * capacity); + } + + friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { + for (size_t k = 0; k < 5; k++) { + auto x = (FeatureValue*)(p.mem() + k * p.capacity()); + out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot + << " lr: " << x->lr << " mf_dim: " << x->mf_size + << " mf_size: " << x->mf_size << " mf:"; + for (int i = 0; i < x->mf_size + 1; ++i) { + out << " " << x->mf[i]; + } + out << "\n"; + } + return out; + } + char* mem() { return mem_; } size_t capacity() { return capacity_; } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 28a49e2d4c3f7b..a0ad727b6386fd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -27,10 +27,163 @@ template class Optimizer { public: Optimizer() {} + ~Optimizer() {} + void initialize() {} - __device__ void update_value(ValType* ptr, const GradType& grad, curandState& state) { + + __device__ void update_lr(float& w, float& g2sum, float g, float scale) { + double add_g2sum = 0; + double ratio = optimizer_config::learning_rate * + sqrt(optimizer_config::initial_g2sum / + (optimizer_config::initial_g2sum + g2sum)); + double scaled_grad = g / scale; + + w += scaled_grad * ratio; + + if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; + if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + + add_g2sum += scaled_grad * scaled_grad; + + g2sum += add_g2sum; + } + + __device__ void update_mf(int n, float* w, float& g2sum, const float* g, + float scale) { + double add_g2sum = 0; + double ratio = optimizer_config::mf_learning_rate * + sqrt(optimizer_config::mf_initial_g2sum / + (optimizer_config::mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + 
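        // Note: this is the same decayed-AdaGrad rule as update_lr above, applied
        // once per embedding dimension. The effective step is
        //   ratio = mf_learning_rate * sqrt(mf_initial_g2sum / (mf_initial_g2sum + g2sum)),
        // so it shrinks as the accumulated squared gradient g2sum grows; each weight
        // moves by (g[i] / scale) * ratio and is clamped to [mf_min_bound, mf_max_bound].
        // After the loop, g2sum grows by the mean of the squared, scale-normalized
        // gradients (add_g2sum / n).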
double scaled_grad = g[i] / scale; + + w[i] += scaled_grad * ratio; + + if (w[i] < optimizer_config::mf_min_bound) + w[i] = optimizer_config::mf_min_bound; + if (w[i] > optimizer_config::mf_max_bound) + w[i] = optimizer_config::mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / n; + } + __device__ void update_value(ValType& val, const GradType& grad) { + val.slot = grad.slot; + val.show += grad.show; + val.clk += grad.clk; + val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + + if (val.mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (val.show - val.clk) + + optimizer_config::clk_coeff * val.clk) { + val.mf_size = MF_DIM + 1; + val.mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < MF_DIM; ++i) { + val.mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + } } + + __device__ void update_value(ValType& val, const GradType& grad, curandState& state) { + val.slot = grad.slot; + val.show += grad.show; + val.clk += grad.clk; + val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + + if (val.mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (val.show - val.clk) + + optimizer_config::clk_coeff * val.clk) { + val.mf_size = MF_DIM + 1; + val.mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = 0; i < MF_DIM; ++i) { + val.mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + } + } + + __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + ptr->slot = grad.slot; + ptr->show += grad.show; + ptr->clk += grad.clk; + ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + // ptr->mf_dim = grad.mf_dim; + + if (ptr->mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config::clk_coeff * ptr->clk) { + ptr->mf_size = ptr->mf_dim + 1; + ptr->mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < ptr->mf_dim; ++i) { + ptr->mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + grad.show); // for local test + } + } + + __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad, curandState& state) { + ptr->slot = grad.slot; + ptr->show += grad.show; + ptr->clk += grad.clk; + ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + // ptr->mf_dim = grad.mf_dim; + + if (ptr->mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config::clk_coeff * ptr->clk) 
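      // Lazy creation of the factor part: once
      // nonclk_coeff * (show - clk) + clk_coeff * clk reaches mf_create_thresholds,
      // mf_size becomes mf_dim + 1, mf[0] starts the factors' g2sum at 0, and
      // mf[1..mf_dim] are drawn with curand_uniform scaled by mf_initial_range,
      // reusing the caller-provided curand state rather than seeding a new one.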
{ + ptr->mf_size = ptr->mf_dim + 1; + ptr->mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + //curandState state; + //curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < ptr->mf_dim; ++i) { + ptr->mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + grad.show); // for local test + } + } + + }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu index 212f4d4291c953..3a6ed50ad8e702 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu @@ -23,7 +23,6 @@ limitations under the License. */ using namespace paddle::framework; TEST(TEST_FLEET, heter_comm) { - /* int gpu_count = 3; std::vector dev_ids; dev_ids.push_back(0); @@ -34,14 +33,14 @@ TEST(TEST_FLEET, heter_comm) { resource->enable_p2p(); std::vector count; std::vector> keys; - std::vector> vals; + std::vector> vals; count.resize(dev_ids.size(), 0); keys.resize(dev_ids.size()); vals.resize(dev_ids.size()); for (int i = 0; i < 10; i++) { FeatureKey key; - LxchDyFeatureValue val; + FeatureValue val; int gpu_num = i % gpu_count; key = i; val.lr = i; @@ -57,7 +56,7 @@ TEST(TEST_FLEET, heter_comm) { } auto heter_comm = - std::make_shared>( + std::make_shared>( size, resource); for (int i = 0; i < gpu_count; ++i) { std::cout << "building table: " << i << std::endl; @@ -68,9 +67,9 @@ TEST(TEST_FLEET, heter_comm) { std::cout << "testing pull sparse:" << std::endl; paddle::platform::CUDADeviceGuard guard(0); FeatureKey* pull_keys; - LxchDyFeatureValue* pull_vals; + FeatureValue* pull_vals; cudaMallocManaged(&pull_keys, 5 * sizeof(FeatureKey)); - cudaMallocManaged(&pull_vals, 5 * sizeof(LxchDyFeatureValue)); + cudaMallocManaged(&pull_vals, 5 * sizeof(FeatureValue)); pull_keys[0] = 2; pull_keys[1] = 3; @@ -86,11 +85,11 @@ TEST(TEST_FLEET, heter_comm) { cudaFree(pull_vals); std::cout << "testing push sparse:" << std::endl; - Optimizer opt; + Optimizer opt; FeatureKey* push_keys; - LxchDyFeaturePushValue* push_vals; + FeaturePushValue* push_vals; cudaMallocManaged(&push_keys, 5 * sizeof(FeatureKey)); - cudaMallocManaged(&push_vals, 5 * sizeof(LxchDyFeaturePushValue)); + cudaMallocManaged(&push_vals, 5 * sizeof(FeaturePushValue)); push_keys[0] = 2; push_keys[1] = 3; push_keys[2] = 9; @@ -110,5 +109,4 @@ TEST(TEST_FLEET, heter_comm) { cudaFree(push_keys); cudaFree(push_vals); - */ } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index aaa61fbbfd0fc0..41b11e8ed90ed2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -106,135 +106,240 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - - //step1 将读取到的ins,key先做一个粗去重(主要是多线程粗去重) - thread_keys_.resize(thread_keys_thread_num_); - for (auto& iter : thread_keys_) { - iter.resize(thread_keys_thread_num_); - for (auto& iter1 : iter) { - iter1.resize(multi_mf_dim_); - for (auto& iter2 : iter1) { - iter2.clear(); - } - } - } - dataset_mutex_.lock(); - Dataset* cur_dataset = dataset_pipe_.front(); - dataset_pipe_.pop(); - dataset_mutex_.unlock(); - - std::function first_uniq_func; - void* record_vec = nullptr; + if (!multi_mf_dim_) { + 
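    // Two init paths for the HeterContext: without multi_mf_dim_ only the
    // shard x device layout is needed (init(shard_num, device_num)); when
    // per-slot embedding dims are configured, init(shard_num, device_num, dim_num)
    // also sizes the *_dim_* containers so every shard and device fans out one
    // bucket per mf dimension.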
gpu_task->init(thread_keys_shard_num_, device_num); + } else { + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + } + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + std::vector threads; + + // data should be in input channel + if (!multi_mf_dim_) { + thread_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_keys_[i].resize(thread_keys_shard_num_); + } + } else { + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + } + size_t total_len = 0; - std::string data_set_name = std::string(typeid(*cur_dataset).name()); + size_t len_per_thread = 0; + int remain = 0; + size_t begin = 0; + + std::string data_set_name = std::string(typeid(*dataset_).name()); + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - SlotRecordDataset* dataset = dynamic_cast(cur_dataset); + SlotRecordDataset* dataset = dynamic_cast(dataset_); auto input_channel = dataset->GetInputChannel(); - const std::deque* vec_data = &(input_channel->GetData()); - record_vec = (void*)(vec_data); - total_len = vec_data->size(); - - first_uniq_func = [this](void* ptr, int begin_index, int end_index, int i) -> void { - const std::deque& total_data = *((const std::deque*)ptr); - for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { + VLOG(3) << "buildtask::inputslotchannle size: " << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / (thread_keys_thread_num_); + remain = total_len % (thread_keys_thread_num_); + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } + } + }; + auto gen_dynamic_mf_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { const auto& ins = *iter; const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; - for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); slot_idx++) { - for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); + slot_idx++) { + for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { int shard_id = feasign_v[j] % thread_keys_shard_num_; int dim_id = slot_index_vec_[slot_idx]; if (feasign_v[j] != 0) { - this->thread_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); } } } } + /* + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = 
ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + if (slot_idx >= slot_index_vec_.size()) { + VLOG(0) << "WRONG::slot_idx: " << slot_idx << " slot_index_vec_size: " << + slot_index_vec_.size(); + } + int dim_id = slot_index_vec_[slot_idx]; + if (feasign_v[j] != 0) { + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + } + } + } + */ }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } else { + VLOG(3) << "psgpu wrapper genfunc with dynamic mf"; + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); - MultiSlotDataset* dataset = dynamic_cast(cur_dataset); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); auto input_channel = dataset->GetInputChannel(); - const std::deque* vec_data = &(input_channel->GetData()); - record_vec = (void*)(vec_data); - total_len = vec_data->size(); - first_uniq_func = [this](void* ptr, int begin_index, int end_index, int i) -> void { - const std::deque& total_data = *((const std::deque*)ptr); - for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { const auto& ins = *iter; const auto& feasign_v = ins.uint64_feasigns_; - //暂不支持多维度,打平老逻辑(有需要的时候加上) for (const auto feasign : feasign_v) { uint64_t cur_key = feasign.sign().uint64_feasign_; int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id][0].insert(cur_key); + this->thread_keys_[i][shard_id].insert(cur_key); } } }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - std::vector threads; - size_t len_per_thread = total_len / thread_keys_thread_num_; - size_t remain = total_len % thread_keys_thread_num_; - size_t begin = 0; - for (size_t i = 0; i < (size_t)thread_keys_thread_num_; i++) { - threads.push_back( - std::thread(first_uniq_func, record_vec, begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (auto& t : threads) { - t.join(); - } - threads.clear(); - timeline.Pause(); - auto step_1 = timeline.ElapsedSec(); - //step2 insert into together timeline.Start(); - auto merge_ins_func = [this, gpu_task](int shard_num, int dim_id) -> void { + + threads.clear(); + // merge thread_keys to shard_keys + auto merge_ins_func = [this, gpu_task](int shard_num) { for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, dim_id, thread_keys_[i][shard_num][dim_id]); - thread_keys_[i][shard_num][dim_id].clear(); + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } }; + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, dim_id, + thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_func, i, j)); + if (!multi_mf_dim_) { + threads.push_back(std::thread(merge_ins_func, i)); + } else { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } } } for (auto& t : threads) { t.join(); } - threads.clear(); timeline.Pause(); - auto step_2 = timeline.ElapsedSec(); - //step3 精细化去重 + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; timeline.Start(); - gpu_task->unique_keys(); + gpu_task->UniqueKeys(); timeline.Pause(); - auto step_3 = timeline.ElapsedSec(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " PreBuildTask cost-detail " - << "rough-dedup: " << step_1 - << "s add-batch: " << step_2 - << "s unique_keys:" << step_3 << "s"; + + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; + + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); + local_ptr[i].resize(local_keys[i].size()); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } + } } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { - -#if (defined PADDLE_WITH_PSLIB) || (defined PADDLE_WITH_PSCORE) platform::Timer timeline; - timeline.Start(); + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; + + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + std::vector threads(thread_keys_shard_num_); #ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); -#else +#endif +#ifdef PADDLE_WITH_PSCORE auto fleet_ptr = paddle::distributed::FleetWrapper::GetInstance(); #endif 
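For reference, a minimal standalone sketch of the two-stage key dedup that PreBuildTask performs above: each reader thread first buckets its feasigns into thread-local per-shard sets (shard id = key % shard_num), and only afterwards are the per-thread sets of one shard merged, so neither stage needs locking. The thread and shard counts, the sample feasigns, and the plain std::set container are illustrative stand-ins, not the wrapper's actual members.

    // Sketch: two-stage dedup of feasign keys, sharded by key % shard_num.
    #include <cstddef>
    #include <cstdint>
    #include <set>
    #include <thread>
    #include <vector>

    int main() {
      const int thread_num = 4, shard_num = 8;
      std::vector<uint64_t> feasigns = {12, 99, 12, 40, 7, 99, 1024, 7};

      // Stage 1: every thread dedups its slice into per-shard sets, lock-free.
      std::vector<std::vector<std::set<uint64_t>>> thread_keys(
          thread_num, std::vector<std::set<uint64_t>>(shard_num));
      std::vector<std::thread> workers;
      size_t per = feasigns.size() / thread_num;
      for (int t = 0; t < thread_num; ++t) {
        workers.emplace_back([&, t]() {
          size_t begin = t * per;
          size_t end = (t + 1 == thread_num) ? feasigns.size() : begin + per;
          for (size_t i = begin; i < end; ++i) {
            thread_keys[t][feasigns[i] % shard_num].insert(feasigns[i]);
          }
        });
      }
      for (auto& w : workers) w.join();

      // Stage 2: merge the per-thread sets of each shard; the set dedups across threads.
      std::vector<std::set<uint64_t>> shard_keys(shard_num);
      for (int s = 0; s < shard_num; ++s)
        for (int t = 0; t < thread_num; ++t)
          shard_keys[s].insert(thread_keys[t][s].begin(), thread_keys[t][s].end());
      return 0;
    }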
#if (defined PADDLE_WITH_PSLIB) && (defined PADDLE_WITH_HETERPS) - //设置日期,ps内部pull的时候需要根据day_id做decay + // get day_id: day nums from 1970 struct std::tm b; b.tm_year = year_ - 1900; b.tm_mon = month_ - 1; @@ -245,58 +350,163 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { fleet_ptr->pslib_ptr_->_worker_ptr->set_day_id(table_id_, day_id); #endif - fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(0); - //获取sparse-value的指针 - auto& pull_keys = gpu_task->feature_keys_; - auto& pull_value = gpu_task->value_ptr_; - auto pull_value_func = [this, &pull_keys, &pull_value, &fleet_ptr, &gpu_task](int i, int j) -> void { - size_t key_size = pull_keys[i][j].size(); - pull_value[i][j].resize(key_size); + timeline.Start(); + auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { + size_t key_size = local_keys[i].size(); + int32_t status = -1; +#ifdef PADDLE_WITH_PSLIB + // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + // reinterpret_cast(local_ptr[i].data()), this->table_id_, + // local_keys[i].data(), key_size); int32_t cnt = 0; while (true) { -#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(pull_value[i][j].data()), - this->table_id_, pull_keys[i][j].data(), key_size, gpu_task->pass_id_); -#else - auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(pull_value[i][j].data()), this->table_id_, - pull_keys[i][j].data(), key_size); + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } #endif +#ifdef PADDLE_WITH_PSCORE + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + tt.wait(); - int32_t status = -1; + try { status = tt.get(); } catch (const std::future_error& e) { VLOG(0) << "Caught a future_error with code" << e.code() << ", Message:" << e.what(); } - if (status == 0) { + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { break; - } else { + } + } +#endif + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " + << local_keys[i].size(); + } + }; + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + size_t key_size = local_dim_keys[i][j].size(); + int32_t status = -1; + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << 
"Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; sleep(sleep_seconds_before_fail_exit_); + flag = false; cnt++; } if (cnt > 3) { VLOG(0) << "fleet pull sparse failed, retry 3 times"; exit(-1); } + + if (flag) { + break; + } } + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " + << local_dim_keys[i][j].size(); + } +#endif }; - std::vector> task_futures; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(pull_thread_pool_[i]->enqueue(pull_value_func, i, j)); + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(ptl_func, i); + } + } else { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + + std::vector> task_futures; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + // threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, + // j); + task_futures.emplace_back( + pull_thread_pool_[i]->enqueue(ptl_dynamic_mf_func, i, j)); + } + } + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); } - for (auto& f : task_futures) { - f.wait(); + if (!multi_mf_dim_) { + for (std::thread& t : threads) { + t.join(); + } } + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timeline.Pause(); - fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(0); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build-pull-detail cost: " << timeline.ElapsedSec() << "s"; -#endif + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); if (!gloo_wrapper->IsInitialized()) { @@ -307,223 +517,499 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } -//里面用到了一些静态变量,同一时刻只允许有一个线程运行这个函数 -void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { -#define FOUR_VECTOR_INIT(vec, first_size, second_size, thrid_size) \ - vec.resize(first_size); \ - for (auto& iter1 : vec) { \ - iter1.resize(second_size); \ - for (auto& iter2 : iter1) { \ - iter2.resize(thrid_size); \ - for (auto& iter3 : iter2) { \ - iter3.clear(); \ - } \ - } \ - } -#define THIRD_VECTOR_INIT(vec, first_size, second_size, thrid_size) \ - vec.resize(first_size); \ - for (auto& iter1 : vec) { \ - iter1.resize(second_size); \ - for (auto& iter2 : iter1) { \ - iter2.resize(thrid_size); \ - } \ - } - +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { platform::Timer timeline; - timeline.Start(); - std::vector> task_futures; int device_num = heter_devices_.size(); - //step1 input: 分片->维度->key; => output: 分片->设备->维度->key - auto& pull_keys = gpu_task->feature_keys_; - auto& pull_values = gpu_task->value_ptr_; - static std::vector>>> s_first_keys; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = 
gpu_task->dim_mutex_; + std::vector> task_futures; + std::vector threads(thread_keys_shard_num_); + + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; #ifdef PADDLE_WITH_PSLIB - static std::vector>>> s_first_value; -#elif PADDLE_WITH_PSCORE - static std::vector>>> s_first_value; + auto fleet_ptr = FleetWrapper::GetInstance(); + uint16_t pass_id = 0; + if (multi_node_) { + record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( + table_id_, pass_id, pass_values); + } #endif - FOUR_VECTOR_INIT(s_first_keys, thread_keys_shard_num_, device_num, multi_mf_dim_) - FOUR_VECTOR_INIT(s_first_value, thread_keys_shard_num_, device_num, multi_mf_dim_) - auto& l_first_keys = s_first_keys; - auto& l_first_value = s_first_value; - auto func_first = [this, &pull_keys, &pull_values, &l_first_keys, &l_first_value, &device_num](int shard_id, int dim_id) -> void { - auto& l_keys = pull_keys[shard_id][dim_id]; - auto& l_values = pull_values[shard_id][dim_id]; - for (size_t i = 0; i < l_keys.size(); i++) { - int dev_id = l_keys[i] % device_num; - l_first_keys[shard_id][dev_id][dim_id].push_back(l_keys[i]); - l_first_value[shard_id][dev_id][dim_id].push_back(l_values[i]); + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + std::vector> task_keys(device_num); + std::vector> task_ptrs( + device_num); + for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } - }; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_first, i, j)); + // allocate local keys to devices + for (int dev = 0; dev < device_num; dev++) { + device_dim_mutex[dev][j]->lock(); + + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; + } + device_dim_mutex[dev][j]->unlock(); } - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); - timeline.Pause(); - auto step_1 = timeline.ElapsedSec(); - timeline.Start(); +// for (int dev = 0; dev < device_num; dev++) { +// for (int dim = 0; dim < multi_mf_dim_; dim++) { +// device_dim_mutex[dev][dim]->lock(); + +// int len = task_keys[dev].size(); +// int cur = device_dim_keys[dev][dim].size(); +// device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + +// len); +// device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); +// for (int k = 0; k < len; ++k) { +// device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; +// device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; +// } +// device_dim_mutex[dev][dim]->unlock(); +// } +// } +#endif + }; + auto build_func = [device_num, record_status, &pass_values, 
&local_keys, + &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[i]; +#endif - //step2 通过前缀计算,获得结果大小 prefix_sum: 设备->维度->分片数量 - // l_second_key/s_second_value: 设备->维度->具体的值 - static std::vector>> prefix_sum; +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[i]; +#endif + + for (size_t j = 0; j < local_keys[i].size(); j++) { + int shard = local_keys[i][j] % device_num; + task_keys[shard].push_back(local_keys[i][j]); + task_ptrs[shard].push_back(local_ptr[i][j]); + } #ifdef PADDLE_WITH_PSLIB - static std::vector>> s_second_value; -#elif PADDLE_WITH_PSCORE - static std::vector>> s_second_value; + if (record_status) { + size_t local_keys_size = local_keys.size(); + size_t pass_values_size = pass_values.size(); + for (size_t j = 0; j < pass_values_size; j += local_keys_size) { + auto& shard_values = pass_values[j]; + for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); + pair_idx++) { + auto& cur_pair = shard_values[pair_idx]; + int shard = cur_pair.first % device_num; + task_keys[shard].push_back(cur_pair.first); + task_ptrs[shard].push_back( + (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); + } + } + } #endif - THIRD_VECTOR_INIT(prefix_sum, device_num, multi_mf_dim_, thread_keys_shard_num_ + 1) - THIRD_VECTOR_INIT(s_second_value, device_num, multi_mf_dim_, 0) - auto& l_second_key = gpu_task->device_keys_; - auto& l_second_value = s_second_value; - auto& l_prefix_sum = prefix_sum; - auto func_second = [this, &l_prefix_sum, &l_first_keys, &l_second_key, &l_second_value] - (int device_id, int dim_id) -> void { - l_prefix_sum[device_id][dim_id][0] = 0; - for (int j = 0; j < this->thread_keys_shard_num_; j++) { - l_prefix_sum[device_id][dim_id][j+1] = l_prefix_sum[device_id][dim_id][j] + l_first_keys[j][device_id][dim_id].size(); - } - l_second_key[device_id][dim_id].resize(l_prefix_sum[device_id][dim_id][this->thread_keys_shard_num_]); - l_second_value[device_id][dim_id].resize(l_prefix_sum[device_id][dim_id][this->thread_keys_shard_num_]); }; - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_second, i, j)); + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; } - for (auto& f : task_futures) { - f.wait(); + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; } - task_futures.clear(); - timeline.Pause(); - auto step_2 = timeline.ElapsedSec(); - timeline.Start(); + auto calc_prefix_func = [this, &prefix_sum, &device_keys, &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + 
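calc_prefix_func above is an exclusive prefix sum over the per-shard bucket sizes of one device: it fixes, for every shard, the offset at which that shard's keys land in device_keys / device_vals, so the later per-shard copy threads write disjoint ranges with no locking. A small self-contained sketch with made-up bucket sizes:

    // Sketch: exclusive prefix sum laying out per-shard buckets in one flat array.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
      // bucket_size[j] = how many keys shard j routed to this device.
      std::vector<size_t> bucket_size = {3, 0, 5, 2};
      std::vector<size_t> prefix(bucket_size.size() + 1, 0);
      for (size_t j = 0; j < bucket_size.size(); ++j)
        prefix[j + 1] = prefix[j] + bucket_size[j];          // {0, 3, 3, 8, 10}

      std::vector<uint64_t> device_keys(prefix.back());      // one contiguous array
      // Shard j writes its keys into [prefix[j], prefix[j+1]); the ranges are
      // disjoint, so the per-shard workers never contend.
      return 0;
    }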
VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, &prefix_sum, &device_keys, + &device_vals, &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { + auto& task_keys = device_task_keys[shard_id]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[dev]; +#endif - //step3, 具体的key/value => 转入到l_second_key/l_second_value - // 这就是要最终转入到gpu中key-value分设备后的cpu数据了 - auto func_third = [this, &l_second_key, &l_second_value, &l_first_keys, &l_first_value, &l_prefix_sum] - (int shard_id, int device_id, int dim_id) -> void { - auto& input_key = l_first_keys[shard_id][device_id][dim_id]; - auto& input_value = l_first_value[shard_id][device_id][dim_id]; - auto& output_key = l_second_key[device_id][dim_id]; - auto& output_value = l_second_value[device_id][dim_id]; - int start_index = prefix_sum[device_id][dim_id][shard_id]; - for (size_t i = 0; i < input_key.size(); i++) { - output_key[i + start_index] = input_key[i]; - output_value[i + start_index] = input_value[i]; + int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + int cur = prefix_sum[dev][shard_id]; +#ifdef PADDLE_WITH_PSLIB + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + // dec ref + if (ptr_val[6] <= 0) { + ptr_val[6] *= -1; + } else if (ptr_val[0] <= 0) { + ptr_val[0] *= -1; + } + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + val.delta_score = ptr_val[2]; + val.show = ptr_val[3]; + val.clk = ptr_val[4]; + val.slot = ptr_val[0]; + val.lr = ptr_val[5]; + val.lr_g2sum = ptr_val[6]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } } +#endif + VLOG(3) << "GpuPs build hbmps done"; + }; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < device_num; j++) { - for (int k = 0; k < multi_mf_dim_; k++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_third, i, j, k)); + + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + if (multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); } } + for (std::thread& t : threads) { + t.join(); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); + } + } + for 
(auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timeline.Pause(); - auto step_3 = timeline.ElapsedSec(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} + +void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { + int device_num = heter_devices_.size(); + platform::Timer timeline; timeline.Start(); - //step4 初始化gpu-table相关数据了 + std::vector feature_keys_count(device_num); size_t size_max = 0; - for (int i = 0; i < device_num; i++) { - size_t tmp_size = 0; - for (int j = 0; j < multi_mf_dim_; j++) { - tmp_size += l_second_key[i][j].size(); + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + feature_keys_count[i] = gpu_task->device_keys_[i].size(); + VLOG(0) << i << " card contains feasign nums: " << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); + } + } else { + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + VLOG(1) << i << " card with dynamic mf dim: " << index_dim_vec_[j] + << " dim index: " << j << " contains feasign nums: " + << gpu_task->device_dim_ptr_[i][j].size(); + } + VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); } - size_max = std::max(size_max, tmp_size); - } - if (size_max <= 0) { - VLOG(0) << "Skip build gpu ps cause feasign nums = " << size_max; - return; } if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; } - HeterPs_ = HeterPsBase::get_instance(size_max, resource_, ps_accessor_type_, gpu_value_type_); - CHECK(HeterPs_ != nullptr); + if (size_max <= 0) { + VLOG(0) << "Skip build gpu ps cause feasign nums = " << size_max; + return; + } + std::vector threads(device_num); + HeterPs_ = HeterPsBase::get_instance(size_max, resource_); HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); - HeterPs_->set_multi_mf_dim(max_mf_dim_); - - //step5, cpu数据转化到gpu数据,并构造gpu-table了 - auto transfor_value_obj = g_transfor; - auto build_table_func = [this, &l_second_key, &l_second_value, &transfor_value_obj] - (int device_id, int dim_id) -> void { - int cur_dim_size = this->index_dim_vec_[dim_id]; - size_t gpu_value_size = transfor_value_obj->get_gpu_value_size(cur_dim_size); - auto& cpu_keys = l_second_key[device_id][dim_id]; - auto& cpu_values = l_second_value[device_id][dim_id]; - size_t keys_len = cpu_keys.size(); - this->mem_pools_[device_id * this->multi_mf_dim_ + dim_id] = new MemoryPool(keys_len, gpu_value_size); - auto& mem_pool = this->mem_pools_[device_id * this->multi_mf_dim_ + dim_id]; - for (size_t k = 0; k < keys_len; k++) { - void* to_value_ptr = mem_pool->mem_address(k); - transfor_value_obj->value_cpu_to_gpu(cpu_values[k], to_value_ptr, cur_dim_size); - } - auto device_index = resource_->dev_id(device_id); - platform::CUDADeviceGuard guard(device_index); - - this->hbm_pools_[device_id * this->multi_mf_dim_ + dim_id] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[device_id * this->multi_mf_dim_ + dim_id]; - this->HeterPs_->build_ps(device_id, cpu_keys.data(), cur_pool->mem(), keys_len, - gpu_value_size, 500000, 2); + auto build_func = [this, &gpu_task, &feature_keys_count](int i) { + VLOG(3) << "building table: " << i; + this->HeterPs_->build_ps(i, 
gpu_task->device_keys_[i].data(), + gpu_task->device_values_[i].data(), + feature_keys_count[i], 500000, 2); + if (feature_keys_count[i] > 0) { + HeterPs_->show_one_table(i); + } + }; + + // multi-thread process + + auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = new MemoryPool(len, feature_value_size); + }; + + auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { + + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + + this->HeterPs_->build_ps(i, device_dim_keys.data(), + cur_pool->mem(), len, feature_value_size, + 500000, 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i << " table kv size: " << device_dim_keys.size() << "dim: " << mf_dim << " len: " << len; + HeterPs_->show_one_table(i); + } delete mem_pool; }; - - std::vector threads; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_table_func, i, j); + + int thread_num = 16; + auto build_dynamic_mf_func = [this, &gpu_task, thread_num](int i, int j, int z) { + int mf_dim = this->index_dim_vec_[j]; + VLOG(3) << "building table: " << i << "with mf dim: " << mf_dim; + + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + + // ============ add for multi-thread ================ + int len_per_thread = len / thread_num; + int remain = len % thread_num; + int left = -1, right = -1; + + int real_len = len_per_thread; + if (z < remain) real_len++; + + if (z < remain) { + left = z * (len_per_thread + 1); + right = left + real_len; + } else { + left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; + right = left + real_len; } - } - - for (std::thread& t : threads) { - t.join(); + // ============ add for multi-thread ================ + + for (int k = left; k < right; k++) { + + FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); + // dec ref + if (ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] <= 0) { + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] *= -1; + } else if (ptr_val + [paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::unseen_days_index()] <= + 0) { + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + 
DownpourCtrDymfFeatureValue::unseen_days_index()] *= -1; + } + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } + } + }; + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(build_func, i); + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + } else { + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + // multi-thread process + threads.resize(device_num * multi_mf_dim_ * thread_num); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + for (int k = 0; k < thread_num; k++) { + threads[(i + j * device_num) * thread_num + k] = std::thread(build_dynamic_mf_func, i, j, k); + } + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); } timeline.Pause(); - auto step_4 = timeline.ElapsedSec(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build_gpu_table_detail " - << " device_picec_a:" << step_1 - << "s device_picec_b:" << step_2 - << "s device_picec_c:" << step_3 - << "s trans_to_gpu:" << step_4 << "s"; - -#undef FOUR_VECTOR_INIT -#undef THIRD_VECTOR_INIT + VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() + << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { platform::Timer timer; + VLOG(3) << "Begin LoadIntoMemory(), dataset[" << dataset_ << "]"; timer.Start(); dataset_->LoadIntoMemory(); timer.Pause(); - auto load_s = timer.ElapsedSec(); - timer.Start(); + VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; + + // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - timer.Pause(); - auto shuffle_s = timer.ElapsedSec(); - InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " LoadIntoMemory cost: " << load_s << "s Shuffle cost:" << shuffle_s << "s"; + gpu_task->Reset(); - 
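build_dynamic_mf_func above (and dump_pool_to_cpu_func later in EndPass) split one (device, mf-dim) key range across a fixed pool of worker threads with the same len_per_thread / remain arithmetic: the first remain threads take one extra element, and every thread gets a contiguous half-open range [left, right). A minimal sketch of that index math, with illustrative len and thread_num values:

    // Sketch: partition len items into thread_num contiguous, disjoint ranges.
    #include <cassert>

    int main() {
      const int len = 10, thread_num = 4;
      const int len_per_thread = len / thread_num;  // 2
      const int remain = len % thread_num;          // 2
      int covered = 0;
      for (int z = 0; z < thread_num; ++z) {
        int real_len = len_per_thread + (z < remain ? 1 : 0);
        int left = (z < remain)
                       ? z * (len_per_thread + 1)
                       : remain * (len_per_thread + 1) + (z - remain) * len_per_thread;
        int right = left + real_len;
        assert(left == covered);  // ranges are contiguous and disjoint
        covered = right;
      }
      assert(covered == len);     // every element is assigned exactly once
      return 0;
    }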
dataset_mutex_.lock(); - dataset_pipe_.push(dataset_); - dataset_mutex_.unlock(); data_ready_channel_->Put(gpu_task); + + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } void PSGPUWrapper::start_build_thread() { @@ -540,11 +1026,14 @@ void PSGPUWrapper::pre_build_thread() { if (!data_ready_channel_->Get(gpu_task)) { continue; } + VLOG(3) << "thread PreBuildTask start."; platform::Timer timer; timer.Start(); + // build cpu ps data process PreBuildTask(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " PreBuildTask cost: " << timer.ElapsedSec() << "s"; + VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + << "s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; @@ -556,17 +1045,20 @@ void PSGPUWrapper::build_pull_thread() { if (!buildcpu_ready_channel_->Get(gpu_task)) { continue; } + VLOG(3) << "thread build pull start."; platform::Timer timer; timer.Start(); + // build cpu ps data process BuildPull(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " BuildPull cost: " << timer.ElapsedSec() << "s"; + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; buildpull_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; } void PSGPUWrapper::build_task() { + // build_task: build_pull + build_gputask std::shared_ptr gpu_task = nullptr; // train end, gpu free if (!gpu_free_channel_->Get(gpu_task)) { @@ -576,11 +1068,16 @@ void PSGPUWrapper::build_task() { if (!buildpull_ready_channel_->Get(gpu_task)) { return; } + + VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); + PrepareGPUTask(gpu_task); BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build_gpu_table cost: " << timer.ElapsedSec() << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; + current_task_ = gpu_task; } @@ -591,16 +1088,17 @@ void PSGPUWrapper::BeginPass() { PADDLE_THROW( platform::errors::Fatal("[BeginPass] current task is not ended.")); } + build_task(); timer.Pause(); + if (current_task_ == nullptr) { PADDLE_THROW(platform::errors::Fatal( "[BeginPass] after build_task, current task is not null.")); } - VLOG(0) << "pass_id:" << current_task_->pass_id_ << " begin_pass cost: " << timer.ElapsedSec() << "s"; -} - + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; +} void PSGPUWrapper::EndPass() { if (!current_task_) { @@ -608,9 +1106,25 @@ void PSGPUWrapper::EndPass() { platform::errors::Fatal("[EndPass] current task has been ended.")); } auto fleet_ptr = FleetWrapper::GetInstance(); + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); platform::Timer timer; timer.Start(); - + size_t keysize_max = 0; + + // in case of feasign_num = 0, skip dump_to_cpu + if (!multi_mf_dim_) { + for (size_t i = 0; i < heter_devices_.size(); i++) { + keysize_max = + std::max(keysize_max, current_task_->device_keys_[i].size()); + } + } else { + for (size_t i = 0; i < heter_devices_.size(); i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + keysize_max = + std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); + } + } + } int thread_num = 8; auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { @@ -618,7 +1132,7 @@ void PSGPUWrapper::EndPass() { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = 
this->current_task_->device_keys_[i][j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; size_t len = device_keys.size(); // ============ add for multi-thread ================ int len_per_thread = len / thread_num; @@ -637,16 +1151,13 @@ void PSGPUWrapper::EndPass() { } // ============ add for multi-thread ================ int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = g_transfor->get_gpu_value_size(mf_dim); - auto cpu_value = memory::Alloc(phi::GPUPinnedPlace(), feature_value_size * real_len); - char* cpu_value_ptr = reinterpret_cast(cpu_value->ptr()); + VLOG(3) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + char* test_build_values = (char*)malloc(feature_value_size * real_len); uint64_t offset = left * feature_value_size; - gpuStream_t streams; - cudaStreamCreate(&streams); - cudaMemcpyAsync(cpu_value_ptr, hbm_pool->mem() + offset, - feature_value_size * real_len, cudaMemcpyDeviceToHost, streams); - cudaStreamSynchronize(streams); - cudaStreamDestroy(streams); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, + feature_value_size * real_len, cudaMemcpyDeviceToHost); CHECK(len == hbm_pool->capacity()); uint64_t unuse_key = std::numeric_limits::max(); for (int i = left; i < right; ++i) { @@ -654,103 +1165,231 @@ void PSGPUWrapper::EndPass() { continue; } size_t local_offset = (i - left) * feature_value_size; - void* gpu_val = (void*)(cpu_value_ptr + local_offset); - g_transfor->value_gpu_to_cpu(gpu_val); + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + delta_score_index()] = gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + show_index()] = gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + click_index()] = gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_w_index()] = gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_g2sum_index()] = gpu_val->lr_g2sum; + // useless + if (cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] < 0) { + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] = + -1 * gpu_val->slot; + } else { + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] = gpu_val->slot; + } + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + if (x + 8 >= int(downpour_value->size())) { + VLOG(0) << "x: " << x << " size: "<< downpour_value_size; + } + cpu_val[x + 8] = gpu_val->mf[x]; + } + } } + free(test_build_values); }; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + if 
(multi_mf_dim_) { + VLOG(0) << "dynamic mf dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_ * thread_num); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + for (int k = 0; k < thread_num; k++) { + threads[(i + j * device_num) * thread_num + k] = + std::thread(dump_pool_to_cpu_func, i, j, k); + } } } + for (std::thread& t : threads) { + t.join(); + } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } for (size_t i = 0; i < hbm_pools_.size(); i++) { delete hbm_pools_[i]; } + gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timer.Pause(); VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } -//拉取gpu-table数据,并做cvm填充(对应pull_box_sparse_op算子, 非动态维度) void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - - const std::vector slot_dim; - PullSparse(place, table_id, keys, values, slot_lengths, slot_dim, hidden_size); - return; + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + static_cast(total_length)); + pull_gpups_timer.Pause(); + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support 
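Note: PullSparse turns the per-slot lengths into an inclusive prefix sum (slot_lengths_lod) before copying it to the device, and the kernels later in this diff (CopyKeysKernel, PullCopy, PushCopy) invert that prefix sum with a binary search to recover (slot x, offset y) from a flattened key index i. A host-side sketch of both directions, using std::upper_bound in place of the hand-written search:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Inclusive prefix sum over per-slot lengths, as built in PullSparse.
std::vector<int64_t> BuildLod(std::vector<int64_t> slot_lengths) {
  for (size_t i = 1; i < slot_lengths.size(); ++i) {
    slot_lengths[i] += slot_lengths[i - 1];
  }
  return slot_lengths;  // lod[x] = number of keys in slots 0..x
}

// Map a flattened key index i back to (slot x, offset y); the device kernels
// do the same lookup with a manual binary search over len[].
std::pair<int, int64_t> Locate(const std::vector<int64_t>& lod, int64_t i) {
  int x = static_cast<int>(
      std::upper_bound(lod.begin(), lod.end(), i) - lod.begin());
  int64_t y = i - (x ? lod[x - 1] : 0);
  return {x, y};
}

int main() {
  auto lod = BuildLod({3, 0, 2});  // slots of length 3, 0, 2 -> lod {3, 3, 5}
  assert(Locate(lod, 0) == std::make_pair(0, int64_t{0}));
  assert(Locate(lod, 4) == std::make_pair(2, int64_t{1}));
  return 0;
}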
CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; } -//拉取gpu-table数据,并做cvm填充(对应pull_gpups_sparse_op算子, 非动态维度) void PSGPUWrapper::PullSparse( const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, const std::vector& slot_lengths, const std::vector& slot_dim, // dimension for each slot const int hidden_size) { - - if (!platform::is_gpu_place(place)) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + + size_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + + size_t feature_value_size = 0; + if (!multi_mf_dim_) { + feature_value_size = sizeof(FeatureValue); + } else { + feature_value_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); } - //step1 将散乱的key重整到一块连续的空间上面来 - size_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - size_t feature_value_size = g_transfor->get_gpu_value_size(max_mf_dim_); auto buf = memory::Alloc(place, total_length * feature_value_size); - void* total_values_gpu = reinterpret_cast(buf->ptr()); - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& total_keys_tensor = keys_tensor_[devid_2_index]; - uint64_t* total_keys = reinterpret_cast(total_keys_tensor.mutable_data( - {int64_t(total_length), 1}, place)); - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PinnedVector pinned_buf_key(keys.data(), keys.size() * sizeof(uint64_t*), stream, place); - uint64_t** gpu_keys = pinned_buf_key.get_gpu_ptr(); - PinnedVector pinned_buf_length(slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), stream, place); - int64_t* gpu_len = pinned_buf_length.get_gpu_ptr(); - - this->CopyKeys(place, gpu_keys, total_keys, gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - - //step2 查表获得gpu-value数据 - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, total_length); - - //step3 做cvm转换处理 - PinnedVector pinned_buf_value(values.data(), values.size() * sizeof(float*), stream, place); - float** gpu_values = pinned_buf_value.get_gpu_ptr(); - if (slot_dim.size() != 0) { //动态mf模式 - PinnedVector pinned_dim(slot_dim.data(), slot_dim.size() * sizeof(int), stream, place); - int* gpu_dim = pinned_dim.get_gpu_ptr(); - g_transfor->value_to_cvm(gpu_values, total_values_gpu, gpu_keys, slot_lengths.size(), gpu_len, - gpu_dim, total_length, 0, feature_value_size, stream); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + 
reinterpret_cast(total_keys_tensor.mutable_data( + {int64_t(total_length), 1}, place)); + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); + int* gpu_dim = reinterpret_cast(buf_dim->ptr()); + cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice); + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + total_length); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + if (!multi_mf_dim_) { + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); + } else { + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length, gpu_dim); + } + pull_gpups_timer.Pause(); } else { - g_transfor->value_to_cvm(gpu_values, total_values_gpu, gpu_keys, slot_lengths.size(), gpu_len, - nullptr, total_length, hidden_size, feature_value_size, stream); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support CUDAPlace Now.")); } + all_timer.Pause(); + time_1 += all_timer.ElapsedSec(); + time_2 += pull_gpups_timer.ElapsedSec(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which pullsparse costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; } void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, @@ -759,52 +1398,53 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& grad_values, const std::vector& slot_lengths, const int hidden_size, const int batch_size) { - if (!platform::is_gpu_place(place)) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); - } - //step1 将零散的梯度重整到一块连续的空间上面来 - int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& cached_total_keys_tensor = keys_tensor_[devid_2_index]; - uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PinnedVector pinned_gpu_values(grad_values.data(), grad_values.size() * sizeof(float*), stream, place); - float** gpu_values = pinned_gpu_values.get_gpu_ptr(); - - PinnedVector pinned_slot_vector(slot_vector_.data(), slot_vector_.size() * sizeof(int), 
stream, place); - int* d_slot_vector = pinned_slot_vector.get_gpu_ptr(); - - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - PinnedVector pinned_buf_length(slot_lengths_lod.data(), slot_lengths_lod.size() * sizeof(int64_t), stream, place); - int64_t* gpu_len = pinned_buf_length.get_gpu_ptr(); - + VLOG(3) << "Begin GPUPS PushSparseGrad"; + platform::Timer all_timer; + platform::Timer push_gpups_timer; + all_timer.Start(); + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); auto buf = memory::Alloc(place, total_length * grad_value_size); - void* total_grad_values_gpu = reinterpret_cast(buf->ptr()); - - if (hidden_size == 0) { //动态mf - PinnedVector pinned_mf_dim(slot_mf_dim_vector_.data(), slot_mf_dim_vector_.size() * sizeof(int), stream, place); - int* d_mf_dim_vector = pinned_mf_dim.get_gpu_ptr(); - g_transfor->grad_to_push( - total_grad_values_gpu, gpu_values, slot_lengths.size(), gpu_len, d_mf_dim_vector, - total_length, 0, grad_value_size, batch_size, d_slot_vector, stream); - } else { - g_transfor->grad_to_push( - total_grad_values_gpu, gpu_values, slot_lengths.size(), gpu_len, 0, - total_length, hidden_size, grad_value_size, batch_size, d_slot_vector, stream); - } + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; + FeaturePushValue* total_grad_values_gpu = + reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GPUPS now.")); + } else if (platform::is_gpu_place(place)) { + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(cached_total_keys_tensor.data()); + VLOG(3) << "Begin copy grad tensor to gpups struct"; + if (!multi_mf_dim_) { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + } else { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size); + } - //step2,梯度更新了 - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + push_gpups_timer.Start(); + HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); + push_gpups_timer.Pause(); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); + } + all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); + VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() + << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PushSparseGrad"; } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 9ac41e970e7c4f..4a25d5e3783a0b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -25,6 +25,91 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int hidden, int slot_num, + int total_len, uint64_t** keys) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + if (*(keys[x] + y) == 0) { + *(dest[x] + y * hidden) = 0; + *(dest[x] + y * hidden + 1) = 0; + *(dest[x] + y * hidden + 2) = 0; + } else { + *(dest[x] + y * hidden) = (src + i)->show; + *(dest[x] + y * hidden + 1) = (src + i)->clk; + *(dest[x] + y * hidden + 2) = (src + i)->lr; + } + if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < hidden - 3; j++) { + *(dest[x] + y * hidden + 3 + j) = 0; + } + } else { + for (int j = 0; j < hidden - 3; j++) { + *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; + } + } + } +} + +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + int cur_dim = gpu_dim[x] - 3; + FeatureValue* feature_value_ptr = + (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + + int mf_dim = feature_value_ptr->mf_dim; + mf_dim = gpu_dim[x] - 3; + if (*(keys[x] + y) == 0) { + *(dest[x] + y * (cur_dim + 3)) = 0; + *(dest[x] + y * (cur_dim + 3) + 1) = 0; + *(dest[x] + y * (cur_dim + 3) + 2) = 0; + } else { + *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; + *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; + *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; + } + if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0 ){ + if (*(keys[x] + y) == 0) { + for (int j = 0; j < cur_dim; j++) { + *(dest[x] + y * (cur_dim + 3) + 3 + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; + } + } + + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; + } + } + } +} + __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -44,6 +129,139 @@ __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, } } +//__global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, +// const int64_t* len, int slot_num, +// int total_len, int* gpu_dim) { +// CUDA_KERNEL_LOOP(i, total_len) { +// int low = 0; +// int high = slot_num - 1; +// while (low < high) { +// int mid = (low + high) / 2; +// if (i < len[mid]) +// high = mid; +// else +// low = mid + 1; +// } +// int x = low; +// int y = i - (x ? 
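Note: both PullCopy kernels write each key's output row as [show, clk, lr, mf...] (mf width = hidden - 3 in the fixed path, gpu_dim[x] - 3 in the dynamic path) and zero the row for the padding key 0 or when the value has no mf yet. A host-side mirror of that per-row fill; on the device this body runs inside CUDA_KERNEL_LOOP, and the embedding is read starting at mf[1] as in the kernels above.

#include <cstdint>
#include <vector>

// One pulled value, reduced to the fields PullCopy actually reads.
struct PulledValue {
  float show, clk, lr;
  int mf_size;            // 0 until the embedding has been created
  std::vector<float> mf;  // kernels read the embedding from mf[1 + j]
};

// Fill one output row dest_row[0 .. mf_dim + 3), mirroring PullCopy.
void FillRow(float* dest_row, uint64_t key, const PulledValue& v, int mf_dim) {
  if (key == 0) {  // padding key: emit an all-zero row
    for (int j = 0; j < mf_dim + 3; ++j) dest_row[j] = 0.f;
    return;
  }
  dest_row[0] = v.show;
  dest_row[1] = v.clk;
  dest_row[2] = v.lr;
  for (int j = 0; j < mf_dim; ++j) {
    dest_row[j + 3] = (v.mf_size == 0) ? 0.f : v.mf[1 + j];
  }
}

int main() {
  PulledValue v{1.f, 0.f, 0.1f, 9, std::vector<float>(9, 0.5f)};
  std::vector<float> row(8 + 3);
  FillRow(row.data(), 42, v, 8);
  return 0;
}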
len[x - 1] : 0); +// dest_total_keys[i] = src_keys[x][y]; +// //if (src_keys[x][y] == 0 && gpu_dim[x] > 30) { +// // dest_total_keys[i] = 1; +// //}; +// } +//} + +__global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, + int hidden, int slot_num, int total_len, int bs, + int* slot_vector) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue val; + float* src_ptr = src[x]; + int hidden_off = y * hidden; + val.slot = slot_vector[x]; + val.show = *(src_ptr + hidden_off); + val.clk = *(src_ptr + hidden_off + 1); + val.lr_g = *(src_ptr + hidden_off + 2) * -1. * bs; + for (int j = 0; j < hidden - 3; j++) { + val.mf_g[j] = *(src_ptr + hidden_off + 3 + j) * -1. * bs; + } + *(dest + i) = val; + } +} + +__global__ void PushCopyWithPool(FeaturePushValue* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue* cur = + (FeaturePushValue*)((char*)dest + i * grad_value_size); + + cur->slot = slot_vector[x]; + int mf_dim = mf_dim_vector[x]; // slot_vector holds both slot and + // slot:mf_dim information + cur->mf_dim = mf_dim; + + cur->show = *(src[x] + y * (mf_dim + 3)); + cur->clk = *(src[x] + y * (mf_dim + 3) + 1); + cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < cur->mf_dim; j++) { + cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
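Note: PushCopy / PushCopyWithPool pack each slot's gradient row back into a FeaturePushValue, multiplying the lr and mf gradients by -1 * batch_size as in the kernels above. A host-side sketch of that packing with a stand-in struct:

#include <vector>

// Stand-in for FeaturePushValue: slot id, show/clk, scaled lr gradient and
// mf gradients. The real struct also carries mf_dim in the dynamic path.
struct FakePushValue {
  int slot;
  float show, clk, lr_g;
  std::vector<float> mf_g;
};

// Pack one gradient row [show, clk, g_lr, g_mf0, ...] the way PushCopy does:
// gradients are multiplied by -1 * batch_size before being pushed.
FakePushValue PackGrad(const float* src_row, int mf_dim, int slot, int bs) {
  FakePushValue out;
  out.slot = slot;
  out.show = src_row[0];
  out.clk = src_row[1];
  out.lr_g = src_row[2] * -1.f * bs;
  out.mf_g.resize(mf_dim);
  for (int j = 0; j < mf_dim; ++j) {
    out.mf_g[j] = src_row[3 + j] * -1.f * bs;
  }
  return out;
}

int main() {
  std::vector<float> row = {1.f, 0.f, 0.25f, 0.5f, -0.5f};
  FakePushValue v = PackGrad(row.data(), /*mf_dim=*/2, /*slot=*/101, /*bs=*/32);
  return v.mf_g.size() == 2 ? 0 : 1;
}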
* bs; + } + } +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, + total_length, gpu_keys); + cudaStreamSynchronize(stream); +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, int* gpu_dim) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + + if (!multi_mf_dim_) { + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, + total_length, gpu_keys); + } else { + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + val_type_size_, gpu_dim); + } + + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -56,6 +274,99 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +//void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, +// uint64_t** origin_keys, uint64_t* total_keys, +// const int64_t* gpu_len, int slot_num, +// int total_len, int* gpu_dim) { +// auto stream = dynamic_cast( +// platform::DeviceContextPool::Instance().Get(place)) +// ->stream(); +// CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( +// origin_keys, total_keys, gpu_len, slot_num, total_len, gpu_dim); +// cudaStreamSynchronize(stream); +//} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, + const int64_t total_length, + const int batch_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + + 
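Note: every host wrapper in this file launches its kernel with 1024 threads per block and (total_length + 1024 - 1) / 1024 blocks on the device context's stream, then calls cudaStreamSynchronize; the ceil-division is the only arithmetic involved, and CUDA_KERNEL_LOOP guards the tail threads.

#include <cstdint>

// Number of blocks for a 1-D launch with `threads_per_block` threads, i.e.
// the (total_length + 1024 - 1) / 1024 expression used by the wrappers above.
inline int64_t GridBlocks(int64_t total_length, int threads_per_block = 1024) {
  return (total_length + threads_per_block - 1) / threads_per_block;
}

int main() {
  // 5000 keys -> 5 blocks of 1024 threads.
  return GridBlocks(5000) == 5 ? 0 : 1;
}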
cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + + PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, hidden_size, + slot_lengths.size(), total_length, batch_size, d_slot_vector); + cudaStreamSynchronize(stream); +} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size); + + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ad0321b5466783..e54ff6a3bd58b8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -90,8 +90,14 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; - ps_accessor_type_ = "DownpourCtrAccessor"; - gpu_value_type_ = "FeatureValue"; + hbm_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + hbm_thread_pool_[i].reset(new ::ThreadPool(1)); + } + pull_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < pull_thread_pool_.size(); i++) { + pull_thread_pool_[i].reset(new ::ThreadPool(1)); + } } void PullSparse(const paddle::platform::Place& place, const int table_id, @@ -115,10 +121,33 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const 
int64_t* gpu_len, int slot_num, int total_len, int* gpu_dim); + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length); + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, const int64_t total_length, + const int batch_size); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); + void PrepareGPUTask(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); @@ -151,7 +180,7 @@ class PSGPUWrapper { is_initialized_ = true; resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); - keys_tensor_.resize(resource_->total_gpu()); + keys_tensor.resize(resource_->total_gpu()); #ifdef PADDLE_WITH_GLOO auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Size() > 1) { @@ -279,9 +308,7 @@ class PSGPUWrapper { day_ = day; } - void SetDataset(Dataset* dataset) { - dataset_ = dataset; - } + void SetDataset(Dataset* dataset) { dataset_ = dataset; } // PSGPUWrapper singleton static std::shared_ptr GetInstance() { @@ -290,6 +317,10 @@ class PSGPUWrapper { } return s_instance_; } + std::vector>>& GetLocalTable( + int table_id) { + return local_tables_[table_id]; + } void SetSlotVector(const std::vector& slot_vector) { slot_vector_ = slot_vector; } @@ -303,41 +334,48 @@ class PSGPUWrapper { assert(slot_mf_dim_vector_.size() == slot_vector_.size()); } - void SetSlotDimFixed(const int dim) { - assert(slot_vector_.size() != 0); - for (size_t ii = 0; ii < slot_vector_.size(); ii++) { - slot_mf_dim_vector_.push_back(dim); - } - } - - void SetTableShardNum(const int shard_num) { - thread_keys_shard_num_ = shard_num; - } - - void SetAccessorAndValueType(std::string accessor_type, std::string value_type) { - ps_accessor_type_ = accessor_type; - gpu_value_type_ = value_type; - } - void InitSlotInfo() { if (slot_info_initialized_) { return; } SlotRecordDataset* dataset = dynamic_cast(dataset_); auto slots_vec = dataset->GetSlots(); + auto multi_slot_desc = dataset_->GetDataFeedDesc().multi_slot_desc(); + std::vector slots_vec_test; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + // VLOG(0) << "yxfslotname: " << slot.name(); + if (slot.type() == "uint64" || slot.type() == "uint32") { + slots_vec_test.push_back(slot.name()); + } + } + std::cout << "wrapper use slots: "; + for (auto s : slots_vec_test) { + std::cout << s << " | "; + } + std::cout << " end wrapper " << std::endl; + VLOG(0) << "get slot desc"; slot_offset_vector_.clear(); for (auto& slot : slot_vector_) { for (size_t i = 0; i < slots_vec.size(); ++i) { if (std::to_string(slot) == slots_vec[i]) { + // VLOG(0) << "yxf slot: " << slot; 
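Note: InitSlotInfo maps each configured integer slot id in slot_vector_ to its position in the dataset's slot-name list by comparing std::to_string(slot) against slots_vec[i]. The sketch below builds the same slot_offset_vector_ with a name-to-offset map instead of the nested scan; the data in main() is made up.

#include <string>
#include <unordered_map>
#include <vector>

// Build slot -> offset-in-dataset mapping, equivalent to the nested loop in
// InitSlotInfo that compares std::to_string(slot) against slots_vec[i].
std::vector<int> BuildSlotOffsets(const std::vector<int>& slot_vector,
                                  const std::vector<std::string>& slots_vec) {
  std::unordered_map<std::string, int> name_to_offset;
  for (size_t i = 0; i < slots_vec.size(); ++i) {
    name_to_offset[slots_vec[i]] = static_cast<int>(i);
  }
  std::vector<int> offsets;
  for (int slot : slot_vector) {
    auto it = name_to_offset.find(std::to_string(slot));
    if (it != name_to_offset.end()) offsets.push_back(it->second);
  }
  return offsets;
}

int main() {
  auto offsets = BuildSlotOffsets({6003, 6001}, {"6001", "6002", "6003"});
  // offsets == {2, 0}
  return (offsets.size() == 2 && offsets[0] == 2 && offsets[1] == 0) ? 0 : 1;
}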
slot_offset_vector_.push_back(i); break; } } } + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + for (size_t i = 0; i < slot_mf_dim_vector_.size(); i++) { + slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; + } std::unordered_set dims_set; - for (auto& it : slot_mf_dim_vector_) { - dims_set.insert(it); + for (auto& it : slot_dim_map_) { + dims_set.insert(it.second); } size_t num_of_dim = dims_set.size(); index_dim_vec_.resize(num_of_dim); @@ -356,17 +394,10 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - //初始化一些线程池 - CHECK(thread_keys_shard_num_ != 0); - hbm_thread_pool_.resize(thread_keys_shard_num_); - for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { - hbm_thread_pool_[i].reset(new ::ThreadPool(1)); - } - pull_thread_pool_.resize(thread_keys_shard_num_); - for (size_t i = 0; i < pull_thread_pool_.size(); i++) { - pull_thread_pool_[i].reset(new ::ThreadPool(1)); - } - GlobalValueTransfor::get_instance().init(ps_accessor_type_, gpu_value_type_); + val_type_size_ = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + grad_type_size_ = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); slot_info_initialized_ = true; } @@ -386,26 +417,33 @@ class PSGPUWrapper { private: static std::shared_ptr s_instance_; - //主要用于load数据的时候用的 Dataset* dataset_; - //当load数据完成后,会将其筛入到如下队列,后续异步pull会用到这个队列的数据 - //因为load 和 异步build是两个线程,所以才需要下面的队列来解耦这个dataset对象 - std::queue dataset_pipe_; - std::mutex dataset_mutex_; #ifdef PADDLE_WITH_PSLIB paddle::ps::AfsApiWrapper afs_handler_; #endif + std::unordered_map< + uint64_t, std::vector>>> + local_tables_; HeterPsBase* HeterPs_; - std::vector keys_tensor_; // Cache for pull_sparse + std::vector keys_tensor; // Cache for pull_sparse std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; std::vector slot_offset_vector_; std::vector slot_mf_dim_vector_; + std::unordered_map slot_dim_map_; std::vector slot_index_vec_; std::vector index_dim_vec_; int multi_mf_dim_{0}; int max_mf_dim_{0}; + size_t val_type_size_{0}; + size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; @@ -413,18 +451,20 @@ class PSGPUWrapper { std::vector inter_comms_; std::vector inter_ncclids_; std::vector heter_devices_; + std::unordered_set gpu_ps_config_keys_; HeterObjectPool gpu_task_pool_; - std::vector>>> thread_keys_; + std::vector>> thread_keys_; + std::vector>>> + thread_dim_keys_; int thread_keys_thread_num_ = 37 * 4; - int thread_keys_shard_num_ = 0; + int thread_keys_shard_num_ = 64; uint64_t max_fea_num_per_pass_ = 5000000000; int year_; int month_; int day_; bool slot_info_initialized_ = false; int use_afs_api_ = 0; - std::string ps_accessor_type_; - std::string gpu_value_type_; + std::vector mem_pools_; std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 47fc901372e03f..9a373be92b636c 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -242,6 +242,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); } + auto ps_gpu_wrapper = 
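Note: after slot_dim_map_ is filled, InitSlotInfo reduces the distinct mf dims to index_dim_vec_, builds dim_index_map, and records a per-slot dim index in slot_index_vec_; index_dim_vec_.back() is then used as the maximum dim when sizing val_type_size_ / grad_type_size_. The fill and sort of index_dim_vec_ are elided by the hunk, so the sketch below reconstructs that bookkeeping under the assumption that index_dim_vec_ ends up sorted ascending.

#include <set>
#include <unordered_map>
#include <vector>

struct DimInfo {
  std::vector<int> index_dim_vec;   // distinct mf dims, ascending
  std::vector<int> slot_index_vec;  // per-slot index into index_dim_vec
  int max_mf_dim = 0;
};

DimInfo BuildDimInfo(const std::vector<int>& slot_mf_dim_vector) {
  DimInfo info;
  std::set<int> dims(slot_mf_dim_vector.begin(), slot_mf_dim_vector.end());
  info.index_dim_vec.assign(dims.begin(), dims.end());  // already sorted
  std::unordered_map<int, int> dim_index_map;
  for (size_t i = 0; i < info.index_dim_vec.size(); ++i) {
    dim_index_map[info.index_dim_vec[i]] = static_cast<int>(i);
  }
  info.slot_index_vec.reserve(slot_mf_dim_vector.size());
  for (int d : slot_mf_dim_vector) {
    info.slot_index_vec.push_back(dim_index_map[d]);
  }
  info.max_mf_dim = info.index_dim_vec.back();
  return info;
}

int main() {
  DimInfo info = BuildDimInfo({8, 64, 8, 64, 64});
  // index_dim_vec == {8, 64}, slot_index_vec == {0, 1, 0, 1, 1}, max == 64
  return info.max_mf_dim == 64 ? 0 : 1;
}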
paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index dee112179be328..5e2274cb651385 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -313,8 +313,6 @@ void BindDataset(py::module *m) { &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, - py::call_guard()) - .def("set_pass_id", &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 48dccf85f6bb1c..42703fc17bde52 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -41,12 +41,6 @@ void BindPSGPUWrapper(py::module* m) { py::call_guard()) .def("set_slot_dim_vector", &framework::PSGPUWrapper::SetSlotDimVector, py::call_guard()) - .def("set_slot_dim_fixed", &framework::PSGPUWrapper::SetSlotDimFixed, - py::call_guard()) - .def("set_table_shard_num", &framework::PSGPUWrapper::SetTableShardNum, - py::call_guard()) - .def("set_accssor_and_gpuvalue_type", &framework::PSGPUWrapper::SetAccessorAndValueType, - py::call_guard()) .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index c8699acf5b7b57..84064669c0dc67 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -73,24 +73,10 @@ def __init__(self): self.proto_desc.pipe_command = "cat" self.dataset = core.Dataset("MultiSlotDataset") self.thread_num = 1 - self.pass_id = 0 self.filelist = [] self.use_ps_gpu = False self.psgpu = None - def set_pass_id(self, pass_id): - """ - set_pass_id - """ - self.pass_id = pass_id - self.dataset.set_pass_id(pass_id) - - def get_pass_id(self): - """ - get_pass_id - """ - return self.pass_id - def set_pipe_command(self, pipe_command): """ Set pipe command of current dataset diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 9bcf78b6c9f023..8d803c0d5bd7d9 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -313,7 +313,7 @@ def save_inference_model(self, """ self._fleet_ptr.save_model(dirname, 0) - def print_table_stat(self, table_id, pass_id, threshold): + def print_table_stat(self, table_id): """ print stat info of table_id, format: tableid, feasign size, mf size @@ -325,7 +325,7 @@ def print_table_stat(self, table_id, pass_id, threshold): """ self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): - self._fleet_ptr.print_table_stat(table_id, pass_id, threshold) + self._fleet_ptr.print_table_stat(table_id) self._role_maker._barrier_worker() def set_file_num_one_shard(self, table_id, file_num):