diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake new file mode 100644 index 00000000000000..efce686b20929a --- /dev/null +++ b/cmake/external/jemalloc.cmake @@ -0,0 +1,35 @@ +include(ExternalProject) + +set(JEMALLOC_PROJECT "extern_jemalloc") +set(JEMALLOC_URL + /~https://github.com/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2 +) +set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc) +set(JEMALLOC_SOURCE_DIR "${THIRD_PARTY_PATH}/jemalloc") +set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc) +set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include) +set(JEMALLOC_DOWNLOAD_DIR "${JEMALLOC_SOURCE_DIR}/src/${JEMALLOC_PROJECT}") + +set(JEMALLOC_STATIC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) + +ExternalProject_Add( + extern_jemalloc + PREFIX ${JEMALLOC_SOURCE_DIR} + URL ${JEMALLOC_URL} + INSTALL_DIR ${JEMALLOC_INSTALL} + DOWNLOAD_DIR "${JEMALLOC_DOWNLOAD_DIR}" + BUILD_COMMAND $(MAKE) + BUILD_IN_SOURCE 1 + INSTALL_COMMAND $(MAKE) install + CONFIGURE_COMMAND "${JEMALLOC_DOWNLOAD_DIR}/configure" + --prefix=${JEMALLOC_INSTALL} --disable-initial-exec-tls) + +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_property(TARGET jemalloc PROPERTY IMPORTED_LOCATION + ${JEMALLOC_STATIC_LIBRARIES}) + +include_directories(${JEMALLOC_INCLUDE_DIR}) +add_dependencies(jemalloc extern_jemalloc) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index 41a1916dc33083..0084247461b74b 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -14,6 +14,13 @@ include(ExternalProject) +# find_package(jemalloc REQUIRED) + +set(JEMALLOC_INCLUDE_DIR ${THIRD_PARTY_PATH}/install/jemalloc/include) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +message(STATUS "rocksdb jemalloc:" ${JEMALLOC_LIBRARIES}) + set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) set(ROCKSDB_INCLUDE_DIR @@ -22,22 +29,41 @@ set(ROCKSDB_INCLUDE_DIR set(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." 
FORCE) -set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(ROCKSDB_COMMON_FLAGS + "-g -pipe -O2 -W -Wall -Wno-unused-parameter -fPIC -fno-builtin-memcmp -fno-omit-frame-pointer" +) +set(ROCKSDB_FLAGS + "-DNDEBUG -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DOS_LINUX -DROCKSDB_FALLOCATE_PRESENT -DHAVE_SSE42 -DHAVE_PCLMUL -DZLIB -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_SUPPORT_THREAD_LOCAL -DROCKSDB_USE_RTTI -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_AUXV_GETAUXVAL_PRESENT" +) +set(ROCKSDB_CMAKE_CXX_FLAGS + "${ROCKSDB_COMMON_FLAGS} -DROCKSDB_LIBAIO_PRESENT -msse -msse4.2 -mpclmul ${ROCKSDB_FLAGS} -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) +set(ROCKSDB_CMAKE_C_FLAGS + "${ROCKSDB_COMMON_FLAGS} ${ROCKSDB_FLAGS} -DROCKSDB_LIBAIO_PRESENT -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) include_directories(${ROCKSDB_INCLUDE_DIR}) +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz") ExternalProject_Add( extern_rocksdb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${ROCKSDB_PREFIX_DIR} - GIT_REPOSITORY "/~https://github.com/facebook/rocksdb" - GIT_TAG v6.10.1 + GIT_REPOSITORY "/~https://github.com/Thunderbrook/rocksdb" + GIT_TAG 6.19.fb UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DWITH_BZ2=OFF -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_JEMALLOC=ON + -DWITH_BENCHMARK_TOOLS=OFF + -DJeMalloc_LIBRARIES=${JEMALLOC_LIBRARIES} + -DJeMalloc_INCLUDE_DIRS=${JEMALLOC_INCLUDE_DIR} -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS=${ROCKSDB_CMAKE_C_FLAGS} + -DCMAKE_CXX_LINK_EXECUTABLE=${CMAKE_CXX_LINK_EXECUTABLE} # BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 94fb1b4d838f9a..5455ddadfdea44 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -422,6 +422,9 @@ if(WITH_PSCORE) include(external/rocksdb) # download, build, install rocksdb list(APPEND third_party_deps extern_rocksdb) + + include(external/jemalloc) # download, build, install jemalloc + list(APPEND third_party_deps extern_jemalloc) endif() if(WITH_XBYAK) diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 5654669d76fdba..74f946b2253aac 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -148,10 +148,12 @@ class PSClient { return fut; } - virtual ::std::future PullSparsePtr(char **select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char **select_values, size_t table_id, const uint64_t *keys, - size_t num) { + size_t num, + uint16_t pass_id) { VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); @@ -160,6 +162,15 @@ class PSClient { } virtual std::future PrintTableStat(uint32_t table_id) = 0; + virtual std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } // 确保所有积攒中的请求都发起发送 virtual std::future Flush() = 0; diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index e8bf426710bc3b..5466e9cd95bd09 100644 --- 
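// For context only: a hypothetical caller of the extended PSClient interface
// above (PullSparsePtr with shard_id/pass_id, plus the new SaveCacheTable).
// The helper name, the ready `client` pointer, and the pre-allocated
// `value_ptrs` buffer are assumptions for illustration, not part of this patch.
#include <cstddef>
#include <cstdint>

#include "paddle/fluid/distributed/ps/service/ps_client.h"

void PullShardAndMaybeCache(paddle::distributed::PSClient* client,
                            uint32_t table_id,
                            int shard_id,
                            uint16_t pass_id,
                            const uint64_t* keys,
                            char** value_ptrs,
                            size_t num,
                            size_t cache_threshold) {
  // PullSparsePtr now carries which shard and which pass is being processed,
  // so the table can stamp each touched value with the current pass id.
  auto pull_status = client->PullSparsePtr(
      shard_id, value_ptrs, table_id, keys, num, pass_id);
  pull_status.wait();

  // After the pass, SaveCacheTable asks the table to spill to the SSD cache
  // once the in-memory feasign count exceeds `cache_threshold`.
  auto cache_status = client->SaveCacheTable(table_id, pass_id, cache_threshold);
  cache_status.wait();
}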
a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -260,10 +260,12 @@ ::std::future PsLocalClient::PushDense(const Region* regions, // return done(); //} -::std::future PsLocalClient::PullSparsePtr(char** select_values, +::std::future PsLocalClient::PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { // FIXME // auto timer = // std::make_shared("pslib_downpour_client_pull_sparse"); @@ -278,6 +280,8 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, table_context.pull_context.ptr_values = select_values; table_context.use_ptr = true; table_context.num = num; + table_context.shard_id = shard_id; + table_context.pass_id = pass_id; // table_ptr->PullSparsePtr(select_values, keys, num); table_ptr->Pull(table_context); @@ -285,6 +289,28 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, return done(); } +::std::future PsLocalClient::PrintTableStat(uint32_t table_id) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + return done(); +} + +::std::future PsLocalClient::SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + if (ret.first > threshold) { + VLOG(0) << "run cache table"; + table_ptr->CacheTable(pass_id); + } + return done(); +} + ::std::future PsLocalClient::PushSparseRawGradient( size_t table_id, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 593805547af849..583ea8052eb01d 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -76,18 +76,19 @@ class PsLocalClient : public PSClient { return fut; } - virtual ::std::future PullSparsePtr(char** select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num); + size_t num, + uint16_t pass_id); - virtual ::std::future PrintTableStat(uint32_t table_id) { - std::promise prom; - std::future fut = prom.get_future(); - prom.set_value(0); + virtual ::std::future PrintTableStat(uint32_t table_id); + + virtual ::std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold); - return fut; - } virtual ::std::future PushSparse(size_t table_id, const uint64_t* keys, const float** update_values, diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index b55c77bf52d848..9f6baf3189fb8b 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -162,6 +162,15 @@ class ValueAccessor { return 0; } + virtual bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + return true; + } + + virtual void UpdatePassId(float* value, uint16_t pass_id) {} + virtual float GetField(float* value, const std::string& name) { return 0.0; } #define DEFINE_GET_INDEX(class, field) \ virtual int get_##field##_index() override { return class ::field##_index(); } diff --git 
a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 06e1f1c9c7734c..88c2895ecb04c5 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -24,6 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/timer.h" @@ -32,6 +33,8 @@ DECLARE_bool(graph_load_in_parallel); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_uint64(gpugraph_slot_feasign_max_num); namespace paddle { namespace distributed { @@ -54,32 +57,38 @@ int32_t GraphTable::Load_to_ssd(const std::string &path, } paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( - std::vector &node_ids, int slot_num) { - std::vector> bags(task_pool_size_); - for (int i = 0; i < task_pool_size_; i++) { - auto predsize = node_ids.size() / task_pool_size_; + int gpu_id, std::vector &node_ids, int slot_num) { + size_t shard_num = 64; + std::vector> bags(shard_num); + std::vector feature_array[shard_num]; + std::vector slot_id_array[shard_num]; + std::vector node_id_array[shard_num]; + std::vector node_fea_info_array[shard_num]; + for (size_t i = 0; i < shard_num; i++) { + auto predsize = node_ids.size() / shard_num; bags[i].reserve(predsize * 1.2); + feature_array[i].reserve(predsize * 1.2 * slot_num); + slot_id_array[i].reserve(predsize * 1.2 * slot_num); + node_id_array[i].reserve(predsize * 1.2); + node_fea_info_array[i].reserve(predsize * 1.2); } for (auto x : node_ids) { - int location = x % shard_num % task_pool_size_; + int location = x % shard_num; bags[location].push_back(x); } std::vector> tasks; - std::vector feature_array[task_pool_size_]; - std::vector slot_id_array[task_pool_size_]; - std::vector node_id_array[task_pool_size_]; - std::vector - node_fea_info_array[task_pool_size_]; - slot_feature_num_map_.resize(slot_num); - for (int k = 0; k < slot_num; ++k) { - slot_feature_num_map_[k] = 0; + if (slot_feature_num_map_.size() == 0) { + slot_feature_num_map_.resize(slot_num); + for (int k = 0; k < slot_num; ++k) { + slot_feature_num_map_[k] = 0; + } } for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { - tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { + tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, this]() -> int { uint64_t node_id; paddle::framework::GpuPsFeaInfo x; std::vector feature_ids; @@ -96,19 +105,11 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( x.feature_offset = feature_array[i].size(); int total_feature_size = 0; for (int k = 0; k < slot_num; ++k) { - v->get_feature_ids(k, &feature_ids); - int feature_ids_size = feature_ids.size(); + auto feature_ids_size = v->get_feature_ids(k, feature_array[i], slot_id_array[i]); if (slot_feature_num_map_[k] < feature_ids_size) { slot_feature_num_map_[k] = feature_ids_size; } total_feature_size += feature_ids_size; - if (!feature_ids.empty()) { - feature_array[i].insert(feature_array[i].end(), - feature_ids.begin(), - feature_ids.end()); - slot_id_array[i].insert( - slot_id_array[i].end(), feature_ids_size, k); - } } x.feature_size = total_feature_size; node_fea_info_array[i].push_back(x); @@ -127,32 +128,40 @@ 
paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( } VLOG(0) << "slot_feature_num_map: " << ss.str(); + tasks.clear(); + paddle::framework::GpuPsCommGraphFea res; uint64_t tot_len = 0; - for (int i = 0; i < task_pool_size_; i++) { + for (size_t i = 0; i < shard_num; i++) { tot_len += feature_array[i].size(); } VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len << "] node_ids_size[" << node_ids.size() << "]"; res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num); unsigned int offset = 0, ind = 0; - for (int i = 0; i < task_pool_size_; i++) { - for (int j = 0; j < (int)node_id_array[i].size(); j++) { - res.node_list[ind] = node_id_array[i][j]; - res.fea_info_list[ind] = node_fea_info_array[i][j]; - res.fea_info_list[ind++].feature_offset += offset; - } - for (size_t j = 0; j < feature_array[i].size(); j++) { - res.feature_list[offset + j] = feature_array[i][j]; - res.slot_id_list[offset + j] = slot_id_array[i][j]; - } + for (size_t i = 0; i < shard_num; i++) { + tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, ind, offset, this]() -> int { + auto start = ind; + for (int j = 0; j < (int)node_id_array[i].size(); j++) { + res.node_list[start] = node_id_array[i][j]; + res.fea_info_list[start] = node_fea_info_array[i][j]; + res.fea_info_list[start++].feature_offset += offset; + } + for (size_t j = 0; j < feature_array[i].size(); j++) { + res.feature_list[offset + j] = feature_array[i][j]; + res.slot_id_list[offset + j] = slot_id_array[i][j]; + } + return 0; + })); offset += feature_array[i].size(); + ind += node_id_array[i].size(); } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); return res; } paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( - int idx, std::vector ids) { + int idx, const std::vector & ids) { std::vector> bags(task_pool_size_); for (int i = 0; i < task_pool_size_; i++) { auto predsize = ids.size() / task_pool_size_; @@ -327,7 +336,7 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, std::string str; if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < str.size(); j += sizeof(uint64_t)) { uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } @@ -397,7 +406,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < value.size(); j += sizeof(uint64_t)) { uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { @@ -488,6 +497,116 @@ void GraphTable::clear_graph(int idx) { edge_shards[idx].push_back(new GraphShard()); } } + +void GraphTable::release_graph() { + // Before releasing graph, prepare for sampling ids and embedding keys. 
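+  // Storage-mode summary for the branches below:
+  //   WHOLE_HBM                          -> also collect the full key list, then drop the whole graph
+  //   MEM_EMB_FEATURE_AND_GPU_GRAPH /
+  //   SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH  -> keep feature shards on the host, drop only edge shards
+  //   otherwise                          -> drop both edge and feature shards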
+ build_graph_type_keys(); + + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + // clear graph + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + clear_edge_shard(); + } + else { + clear_graph(); + } +} + +void GraphTable::release_graph_edge() { + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + clear_edge_shard(); +} + +void GraphTable::release_graph_node() { + build_graph_type_keys(); + if (FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + && FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + clear_feature_shard(); + } + else { + merge_feature_shard(); + feature_shrink_to_fit(); + } +} + +void GraphTable::clear_edge_shard() { + VLOG(0) << "begin clear edge shard"; + std::vector> tasks; + for (auto &type_shards : edge_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : edge_shards) shards.clear(); + edge_shards.clear(); + VLOG(0) << "finish clear edge shard"; +} + +void GraphTable::clear_feature_shard() { + VLOG(0) << "begin clear feature shard"; + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : feature_shards) shards.clear(); + feature_shards.clear(); + VLOG(0) << "finish clear feature shard"; +} + +void GraphTable::feature_shrink_to_fit() { + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + shard->shrink_to_fit(); + return 0; + })); + } + } + for(size_t i = 0; i < tasks.size(); i++) tasks[i].get(); +} + +void GraphTable::merge_feature_shard() { + VLOG(0) << "begin merge_feature_shard"; + std::vector> tasks; + for (size_t i = 0; i < feature_shards[0].size(); i++) { + tasks.push_back( + load_node_edge_task_pool->enqueue([i, this]() -> int { + for (size_t j = 1; j < feature_shards.size(); j++) { + feature_shards[0][i]->merge_shard(feature_shards[j][i]); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + feature_shards.resize(1); +} + +void GraphTable::clear_graph() { + VLOG(0) << "begin clear_graph"; + clear_edge_shard(); + clear_feature_shard(); + VLOG(0) << "finish clear_graph"; +} + int32_t GraphTable::load_next_partition(int idx) { if (next_partition >= (int)partitions[idx].size()) { VLOG(0) << "partition iteration is done"; @@ -554,7 +673,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { std::vector &v = shards[i]->get_bucket(); for (size_t j = 0; j < v.size(); j++) { std::vector s; - for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } cost += v[j]->get_neighbor_size() * sizeof(uint64_t); @@ -1053,21 +1172,7 @@ Node 
*GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? nullptr : bucket[iter->second]; } -GraphTable::~GraphTable() { - for (int i = 0; i < (int)edge_shards.size(); i++) { - for (auto p : edge_shards[i]) { - delete p; - } - edge_shards[i].clear(); - } - - for (int i = 0; i < (int)feature_shards.size(); i++) { - for (auto p : feature_shards[i]) { - delete p; - } - feature_shards[i].clear(); - } -} +GraphTable::~GraphTable() { clear_graph(); } int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); @@ -1095,16 +1200,19 @@ std::string GraphTable::get_inverse_etype(std::string &etype) { return res; } -int32_t GraphTable::parse_type_to_typepath(std::string &type2files, - std::string graph_data_local_path, - std::vector &res_type, - std::unordered_map &res_type2path) { - auto type2files_split = paddle::string::split_string(type2files, ","); +int32_t GraphTable::parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path) { + auto type2files_split = + paddle::string::split_string(type2files, ","); if (type2files_split.size() == 0) { return -1; } for (auto one_type2file : type2files_split) { - auto one_type2file_split = paddle::string::split_string(one_type2file, ":"); + auto one_type2file_split = + paddle::string::split_string(one_type2file, ":"); auto type = one_type2file_split[0]; auto type_dir = one_type2file_split[1]; res_type.push_back(type); @@ -1113,6 +1221,94 @@ int32_t GraphTable::parse_type_to_typepath(std::string &type2files, return 0; } +int32_t GraphTable::parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); + VLOG(0) << "whether reverse: " << reverse; + is_load_reverse_edge = reverse; + std::string delim = ";"; + size_t total_len = etypes.size(); + + std::vector> tasks; + for (size_t i = 0; i < total_len; i++) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::string etype_path = edge_to_edgedir[etypes[i]]; + auto etype_path_list = paddle::framework::localfs_list(etype_path); + std::string etype_path_str; + if (part_num > 0 && part_num < (int)etype_path_list.size()) { + std::vector sub_etype_path_list( + etype_path_list.begin(), etype_path_list.begin() + part_num); + etype_path_str = + paddle::string::join_strings(sub_etype_path_list, delim); + } else { + etype_path_str = + paddle::string::join_strings(etype_path_list, delim); + } + this->load_edges(etype_path_str, false, etypes[i]); + if (reverse) { + std::string r_etype = get_inverse_etype(etypes[i]); + this->load_edges(etype_path_str, true, r_etype); + } + return 0; + })); + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + std::vector ntypes; + std::unordered_map node_to_nodedir; + int res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + if (ntypes.size() == 0) { + VLOG(0) << "node_type not 
specified, nothing will be loaded "; + return 0; + } + + std::string delim = ";"; + std::vector type_npath_strs; + for (size_t i = 0; i 0 && part_num < (int)npath_list.size()) { + std::vector sub_npath_list( + npath_list.begin(), npath_list.begin() + part_num); + type_npath_str = paddle::string::join_strings(sub_npath_list, delim); + } else { + type_npath_str = paddle::string::join_strings(npath_list, delim); + } + type_npath_strs.push_back(type_npath_str); + } + std::string npath_str = paddle::string::join_strings(type_npath_strs, delim); + if (FLAGS_graph_load_in_parallel) { + this->load_nodes(npath_str, ""); + } else { + for (size_t j = 0; j < ntypes.size(); j++) { + this->load_nodes(npath_str, ntypes[j]); + } + } + return 0; +} + int32_t GraphTable::load_node_and_edge_file(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, @@ -1120,14 +1316,16 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files, bool reverse) { std::vector etypes; std::unordered_map edge_to_edgedir; - int res = parse_type_to_typepath(etype2files, graph_data_local_path, etypes, edge_to_edgedir); + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); if (res != 0) { VLOG(0) << "parse edge type and edgedir failed!"; return -1; } std::vector ntypes; std::unordered_map node_to_nodedir; - res = parse_type_to_typepath(ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); if (res != 0) { VLOG(0) << "parse node type and nodedir failed!"; return -1; @@ -1177,7 +1375,6 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files, VLOG(0) << "node_type not specified, nothing will be loaded "; return 0; } - if (FLAGS_graph_load_in_parallel) { this->load_nodes(npath_str, ""); } else { @@ -1461,7 +1658,6 @@ int32_t GraphTable::load_edges(const std::string &path, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) total_memory_cost = 0; - const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1483,7 +1679,7 @@ int32_t GraphTable::load_edges(const std::string &path, VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; if (FLAGS_graph_load_in_parallel) { std::vector>> tasks; - for (int i = 0; i < paths.size(); i++) { + for (size_t i = 0; i < paths.size(); i++) { tasks.push_back(load_node_edge_task_pool->enqueue( [&, i, idx, this]() -> std::pair { return parse_edge_file(paths[i], idx, reverse_edge); @@ -1866,8 +2062,7 @@ int GraphTable::parse_feature(int idx, thread_local std::vector fea_fields; fea_fields.clear(); c = feature_separator_.at(0); - paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields); - + paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields, FLAGS_gpugraph_slot_feasign_max_num); std::string name = fields[0].to_string(); auto it = feat_id_map[idx].find(name); if (it != feat_id_map[idx].end()) { @@ -1947,8 +2142,8 @@ int GraphTable::get_all_id(int type_id, MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1971,8 +2166,8 @@ int GraphTable::get_all_neighbor_id( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2024,7 +2219,7 @@ int GraphTable::get_all_neighbor_id( auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2050,7 +2245,7 @@ int GraphTable::get_all_feature_ids( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2229,7 +2424,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { feat_name.resize(node_types.size()); feat_shape.resize(node_types.size()); feat_dtype.resize(node_types.size()); - VLOG(0) << "got " << node_types.size() << "node types in total"; + VLOG(0) << "got " << node_types.size() << " node types in total"; for (int k = 0; k < node_types.size(); k++) { feature_to_id[node_types[k]] = k; auto node_type = node_types[k]; @@ -2289,5 +2484,50 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { return 0; } +void GraphTable::init_worker_poll(int gpu_num) { + _cpu_worker_pool.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + _cpu_worker_pool[i].reset(new ::ThreadPool(16)); + } +} + +void GraphTable::build_graph_total_keys() { + VLOG(0) << "begin insert edge to graph_total_keys"; + // build node embedding id + std::vector> keys; + this->get_node_embedding_ids(1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + + VLOG(0) << "finish insert edge to graph_total_keys"; +} + +void GraphTable::build_graph_type_keys() { + VLOG(0) << "begin build_graph_type_keys"; + graph_type_keys_.clear(); + graph_type_keys_.resize(this->feature_to_id.size()); + + int cnt = 0; + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_id(1, node_idx, 1, &keys); + type_to_index_[node_idx] = cnt; + graph_type_keys_[cnt++] = std::move(keys[0]); + } + VLOG(0) << "finish build_graph_type_keys"; + + VLOG(0) << "begin insert feature into graph_total_keys"; + // build feature 
embedding id + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_feature_ids(1, node_idx, 1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + } + VLOG(0) << "finish insert feature into graph_total_keys"; +} + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 9ca8b7f1655fe5..1940fd25f88407 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -130,7 +130,29 @@ class GraphShard { return node_location; } - private: + void shrink_to_fit() { + bucket.shrink_to_fit() ; + for (size_t i = 0; i < bucket.size(); i ++) { + bucket[i]->shrink_to_fit(); + } + } + + void merge_shard(GraphShard * &shard) { + bucket.reserve(bucket.size() + shard->bucket.size()); + for (size_t i = 0; i < shard->bucket.size(); i++) { + auto node_id = shard->bucket[i]->get_id(); + if (node_location.find(node_id) == node_location.end()) { + node_location[node_id] = bucket.size(); + bucket.push_back(shard->bucket[i]); + } + } + shard->node_location.clear(); + shard->bucket.clear(); + delete shard; + shard = NULL; + } + + public: std::unordered_map node_location; std::vector bucket; }; @@ -271,7 +293,6 @@ class RandomSampleLRU { remove(node_head); remove_count--; } - // std::cerr<<"after remove_count = "< *node) { @@ -535,25 +556,28 @@ class GraphTable : public Table { virtual int32_t Initialize(const TableParameter &config, const FsClientParameter &fs_config); virtual int32_t Initialize(const GraphParameter &config); + void init_worker_poll(int gpu_num); int32_t Load(const std::string &path, const std::string ¶m); - int32_t load_node_and_edge_file(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, int part_num, bool reverse); - + int32_t parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + int32_t parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num); std::string get_inverse_etype(std::string &etype); - int32_t parse_type_to_typepath(std::string &type2files, std::string graph_data_local_path, std::vector &res_type, std::unordered_map &res_type2path); - int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - int get_all_id(int type, int slice_num, std::vector> *output); @@ -635,7 +659,15 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } + void clear_graph(); void clear_graph(int idx); + void clear_edge_shard(); + void clear_feature_shard(); + void feature_shrink_to_fit(); + void merge_feature_shard(); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -672,9 +704,9 @@ class GraphTable : public Table { virtual int32_t add_node_to_ssd( int type_id, int idx, uint64_t src_id, char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( - int idx, std::vector ids); + int idx, const std::vector & ids); virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( - std::vector &node_ids, int slot_num); + int gpu_id, std::vector &node_ids, int slot_num); int32_t Load_to_ssd(const std::string &path, const std::string ¶m); int64_t 
load_graph_to_memory_from_ssd(int idx, std::vector &ids); int32_t make_complementary_graph(int idx, int64_t byte_size); @@ -700,9 +732,17 @@ class GraphTable : public Table { virtual int32_t build_sampler(int idx, std::string sample_type = "random"); void set_slot_feature_separator(const std::string &ch); void set_feature_separator(const std::string &ch); + + void build_graph_total_keys(); + void build_graph_type_keys(); + + std::vector graph_total_keys_; + std::vector> graph_type_keys_; + std::unordered_map type_to_index_; + std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - int task_pool_size_ = 24; + int task_pool_size_ = 64; int load_thread_num = 160; const int random_sample_nodes_ranges = 3; @@ -718,6 +758,7 @@ class GraphTable : public Table { std::string table_type; std::vector> _shards_task_pool; + std::vector> _cpu_worker_pool; std::vector> _shards_task_rng_pool; std::shared_ptr<::ThreadPool> load_node_edge_task_pool; std::shared_ptr> scaled_lru; diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 4feee70fed751a..3e4f4d68f49cac 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -292,7 +292,8 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); - os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4]; + os << common_feature_value.UnseenDays(const_cast(v)) << " " << v[1] + << " " << v[2] << " " << v[3] << " " << v[4]; // << v[5] << " " << v[6]; for (int i = common_feature_value.EmbedG2SumIndex(); i < common_feature_value.EmbedxG2SumIndex(); @@ -320,5 +321,18 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { return ret; } +bool CtrDymfAccessor::SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + return common_feature_value.Show(value) > global_cache_threshold || + common_feature_value.PassId(value) >= pass_id; +} + +void CtrDymfAccessor::UpdatePassId(float* value, uint16_t pass_id) { + common_feature_value.PassId(value) = pass_id; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index b820d617d06ae6..047bafedd9d7b6 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -30,7 +30,7 @@ namespace distributed { class CtrDymfAccessor : public ValueAccessor { public: struct CtrDymfFeatureValue { - /* + /*v1: old version float unseen_days; float delta_score; float show; @@ -44,6 +44,20 @@ class CtrDymfAccessor : public ValueAccessor { // float embedx_g2sum; std::vector embedx_w; */ + /* V2: support pass_id + uint16_t pass_id; + uint16_t unseen_days; + float show; + float click; + float embed_w; + // float embed_g2sum; + std::vector embed_g2sum; + float slot; + float mf_dim + std::float embedx_g2sum; + // float embedx_g2sum; + std::vector embedx_w; + */ int Dim() { return 7 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } @@ -73,7 +87,17 @@ class CtrDymfAccessor : public ValueAccessor { // 根据mf_dim计算的总byte数 int Size(int& mf_dim) { return (Dim(mf_dim)) * 
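// A minimal standalone sketch of the V2 layout above: pass_id and unseen_days
// share the 32-bit storage of the first float slot. Helper names and the toy
// buffer are illustrative; the index convention (0 = pass_id, 1 = unseen_days)
// mirrors the PassId()/UnseenDays() accessors added in this header.
#include <cassert>
#include <cstdint>

inline uint16_t& PassIdOf(float* slot) {
  return reinterpret_cast<uint16_t*>(slot)[0];
}
inline uint16_t& UnseenDaysOf(float* slot) {
  return reinterpret_cast<uint16_t*>(slot)[1];
}

int main() {
  float value[8] = {0.0f};   // stand-in for a CtrDymfFeatureValue blob
  PassIdOf(value) = 12;      // what UpdatePassId() writes during a pull
  UnseenDaysOf(value) = 3;   // decay/shrink logic keeps using this half
  assert(PassIdOf(value) == 12 && UnseenDaysOf(value) == 3);
  return 0;
}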
sizeof(float); } - float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } + uint16_t& PassId(float* val) { + uint16_t* int16_val = + reinterpret_cast(val + UnseenDaysIndex()); + return int16_val[0]; + } + + uint16_t& UnseenDays(float* val) { + uint16_t* int16_val = + reinterpret_cast(val + UnseenDaysIndex()); + return int16_val[1]; + } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } float& Show(float* val) { return val[ShowIndex()]; } float& Click(float* val) { return val[ClickIndex()]; } @@ -217,6 +241,14 @@ class CtrDymfAccessor : public ValueAccessor { return 0.0; } + //根据pass_id和show_threashold阈值来判断cache到ssd + bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id); + //更新pass_id + void UpdatePassId(float* value, uint16_t pass_id); + private: // float ShowClickScore(float show, float click); diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h index eb3ff2e254f567..5b5a0057f37078 100644 --- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h +++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h @@ -16,8 +16,12 @@ #include #include #include +#include #include +#include +#include #include +#include #include #include @@ -27,6 +31,55 @@ namespace paddle { namespace distributed { +class Uint64Comparator : public rocksdb::Comparator { + int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const { + uint64_t A = *((uint64_t*)const_cast(a.data())); + uint64_t B = *((uint64_t*)const_cast(b.data())); + if (A < B) { + return -1; + } + if (A > B) { + return 1; + } + return 0; + } + const char* Name() const { return "Uint64Comparator"; } + void FindShortestSeparator(std::string*, const rocksdb::Slice&) const {} + void FindShortSuccessor(std::string*) const {} +}; + +class RocksDBItem { + public: + RocksDBItem() {} + ~RocksDBItem() {} + void reset() { + batch_keys.clear(); + batch_index.clear(); + batch_values.clear(); + status.clear(); + } + std::vector batch_keys; + std::vector batch_index; + std::vector batch_values; + std::vector status; +}; + +class RocksDBCtx { + public: + RocksDBCtx() { + items[0].reset(); + items[1].reset(); + cur_index = 0; + } + ~RocksDBCtx() {} + RocksDBItem* switch_item() { + cur_index = (cur_index + 1) % 2; + return &items[cur_index]; + } + RocksDBItem items[2]; + int cur_index; +}; + class RocksDBHandler { public: RocksDBHandler() {} @@ -38,55 +91,69 @@ class RocksDBHandler { } int initialize(const std::string& db_path, const int colnum) { - VLOG(3) << "db path: " << db_path << " colnum: " << colnum; - rocksdb::Options options; - rocksdb::BlockBasedTableOptions bbto; - bbto.block_size = 4 * 1024; - bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.cache_index_and_filter_blocks = false; - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); - bbto.whole_key_filtering = true; - options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); - - options.keep_log_file_num = 100; - options.max_log_file_size = 50 * 1024 * 1024; // 50MB - options.create_if_missing = true; - options.use_direct_reads = true; - options.max_background_flushes = 5; - options.max_background_compactions = 5; - options.base_background_compactions = 10; - options.write_buffer_size = 256 * 1024 * 1024; // 256MB - options.max_write_buffer_number = 8; - options.max_bytes_for_level_base = - 
options.max_write_buffer_number * options.write_buffer_size; - options.min_write_buffer_number_to_merge = 1; - options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB - options.memtable_prefix_bloom_size_ratio = 0.02; - options.num_levels = 4; - options.max_open_files = -1; - - options.compression = rocksdb::kNoCompression; - options.level0_file_num_compaction_trigger = 8; - options.level0_slowdown_writes_trigger = - 1.8 * options.level0_file_num_compaction_trigger; - options.level0_stop_writes_trigger = - 3.6 * options.level0_file_num_compaction_trigger; - - if (!db_path.empty()) { - std::string rm_cmd = "rm -rf " + db_path; - system(rm_cmd.c_str()); - } - - rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); - assert(s.ok()); - _handles.resize(colnum); + VLOG(0) << "db path: " << db_path << " colnum: " << colnum; + _dbs.resize(colnum); for (int i = 0; i < colnum; i++) { - s = _db->CreateColumnFamily( - options, "shard_" + std::to_string(i), &_handles[i]); + rocksdb::Options options; + options.comparator = &_comparator; + rocksdb::BlockBasedTableOptions bbto; + // options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(65536)); + // options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(2)); + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + // bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + // options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + // options.db_log_dir = "./log/rocksdb"; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + // options.threads = 8; + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 37; + options.max_background_compactions = 64; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + // options.verify_checksums_in_compaction = false; + // options.disable_auto_compactions = true; + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + // options.compaction_options_fifo = rocksdb::CompactionOptionsFIFO(); + // options.compaction_style = + // rocksdb::CompactionStyle::kCompactionStyleFIFO; + options.level0_file_num_compaction_trigger = 5; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + std::string shard_path = db_path + "_" + std::to_string(i); + if (!shard_path.empty()) { + std::string rm_cmd = "rm -rf " + shard_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, shard_path, &_dbs[i]); assert(s.ok()); } - LOG(INFO) << "DB initialize success, colnum:" << colnum; + VLOG(0) << "DB initialize success, colnum:" 
<< colnum; return 0; } @@ -94,10 +161,9 @@ class RocksDBHandler { int id, const char* key, int key_len, const char* value, int value_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = _db->Put(options, - _handles[id], - rocksdb::Slice(key, key_len), - rocksdb::Slice(value, value_len)); + rocksdb::Status s = _dbs[id]->Put(options, + rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); assert(s.ok()); return 0; } @@ -110,20 +176,17 @@ class RocksDBHandler { options.disableWAL = true; rocksdb::WriteBatch batch(n * 128); for (int i = 0; i < n; i++) { - batch.Put(_handles[id], - rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + batch.Put(rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); } - rocksdb::Status s = _db->Write(options, &batch); + rocksdb::Status s = _dbs[id]->Write(options, &batch); assert(s.ok()); return 0; } int get(int id, const char* key, int key_len, std::string& value) { - rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), - _handles[id], - rocksdb::Slice(key, key_len), - &value); + rocksdb::Status s = _dbs[id]->Get( + rocksdb::ReadOptions(), rocksdb::Slice(key, key_len), &value); if (s.IsNotFound()) { return 1; } @@ -131,33 +194,58 @@ class RocksDBHandler { return 0; } + void multi_get(int id, + const size_t num_keys, + const rocksdb::Slice* keys, + rocksdb::PinnableSlice* values, + rocksdb::Status* status, + const bool sorted_input = true) { + rocksdb::ColumnFamilyHandle* handle = _dbs[id]->DefaultColumnFamily(); + auto read_opt = rocksdb::ReadOptions(); + read_opt.fill_cache = false; + _dbs[id]->MultiGet( + read_opt, handle, num_keys, keys, values, status, sorted_input); + } + int del_data(int id, const char* key, int key_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = - _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + rocksdb::Status s = _dbs[id]->Delete(options, rocksdb::Slice(key, key_len)); assert(s.ok()); return 0; } int flush(int id) { - rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + rocksdb::Status s = _dbs[id]->Flush(rocksdb::FlushOptions()); assert(s.ok()); return 0; } rocksdb::Iterator* get_iterator(int id) { - return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + return _dbs[id]->NewIterator(rocksdb::ReadOptions()); } int get_estimate_key_num(uint64_t& num_keys) { - _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + // _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + Uint64Comparator* get_comparator() { return &_comparator; } + + int ingest_externel_file(int id, + const std::vector& sst_filelist) { + rocksdb::IngestExternalFileOptions ifo; + ifo.move_files = true; + rocksdb::Status s = _dbs[id]->IngestExternalFile(sst_filelist, ifo); + assert(s.ok()); return 0; } private: std::vector _handles; - rocksdb::DB* _db; + // rocksdb::DB* _db; + std::vector _dbs; + Uint64Comparator _comparator; }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index 9c384a9744b8a3..59e6a5f634c8ad 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -56,8 +56,12 @@ class Node { virtual int get_feature_ids(int slot_idx, std::vector *res) const { return 0; } + virtual int get_feature_ids(int slot_idx, std::vector & 
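// Usage note for the multi_get wrapper added to RocksDBHandler above, as a
// hypothetical caller. Each shard DB is opened with Uint64Comparator and
// sorted_input defaults to true, so the uint64 keys are expected to arrive
// numerically sorted. The helper name and inputs are illustrative assumptions.
#include <cstddef>
#include <cstdint>
#include <vector>

#include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h"

void LookupSortedBatch(paddle::distributed::RocksDBHandler* db,
                       int shard_id,
                       const std::vector<uint64_t>& sorted_keys) {
  std::vector<rocksdb::Slice> keys;
  keys.reserve(sorted_keys.size());
  for (const uint64_t& k : sorted_keys) {
    keys.emplace_back(reinterpret_cast<const char*>(&k), sizeof(uint64_t));
  }
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> status(keys.size());
  db->multi_get(shard_id, keys.size(), keys.data(), values.data(), status.data());
  for (size_t i = 0; i < status.size(); ++i) {
    if (status[i].IsNotFound()) {
      // Missing on SSD: the caller creates a fresh value, as PullSparsePtr does.
    }
  }
}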
feature_id, std::vector & slot_id) const { + return 0; + } virtual void set_feature(int idx, const std::string &str) {} virtual void set_feature_size(int size) {} + virtual void shrink_to_fit() {} virtual int get_feature_size() { return 0; } virtual size_t get_neighbor_size() { return 0; } @@ -155,6 +159,28 @@ class FeatureNode : public Node { return 0; } + virtual int get_feature_ids(int slot_idx, std::vector & feature_id, std::vector & slot_id) const { + errno = 0; + size_t num = 0; + if (slot_idx < (int)this->feature.size()) { + const std::string &s = this->feature[slot_idx]; + const uint64_t *feas = (const uint64_t *)(s.c_str()); + num = s.length() / sizeof(uint64_t); + CHECK((s.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << s << "]"; + for (size_t i = 0; i < num; ++i) { + feature_id.push_back(feas[i]); + slot_id.push_back(slot_idx); + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return num; + } + virtual std::string *mutable_feature(int idx) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); @@ -170,6 +196,12 @@ class FeatureNode : public Node { } virtual void set_feature_size(int size) { this->feature.resize(size); } virtual int get_feature_size() { return this->feature.size(); } + virtual void shrink_to_fit() { + feature.shrink_to_fit(); + for (auto & slot : feature) { + slot.shrink_to_fit(); + } + } template static std::string parse_value_to_bytes(std::vector feat_str) { diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index a46244265ef206..9a69433c6104a1 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -172,7 +172,6 @@ int32_t MemorySparseTable::Load(const std::string& path, value.resize(feature_value_size); int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); - } read_channel->close(); if (err_no == -1) { @@ -725,7 +724,8 @@ int32_t MemorySparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -822,9 +822,11 @@ int32_t MemorySparseTable::PullSparse(float* pull_values, return 0; } -int32_t MemorySparseTable::PullSparsePtr(char** pull_values, +int32_t MemorySparseTable::PullSparsePtr(int shard_id, // fake num + char** pull_values, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { CostTimer timer("pscore_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 17018d5e5dfc3d..658446d770c713 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -90,7 +90,11 @@ class MemorySparseTable : public Table { std::pair PrintTableStat() override; int32_t PullSparse(float* values, const PullSparseValue& pull_value); - int32_t PullSparsePtr(char** pull_values, 
const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 05f7c5c5780ea6..b7af4172e3f9e2 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -24,8 +24,10 @@ DECLARE_bool(pserver_print_missed_key_num_every_push); DECLARE_bool(pserver_create_value_when_push); DECLARE_bool(pserver_enable_create_feasign_randomly); DEFINE_bool(pserver_open_strict_check, false, "pserver_open_strict_check"); -DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file"); DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); +PADDLE_DEFINE_EXPORTED_string(rocksdb_path, + "database", + "path of sparse table rocksdb file"); namespace paddle { namespace distributed { @@ -35,6 +37,8 @@ int32_t SSDSparseTable::Initialize() { _db = paddle::distributed::RocksDBHandler::GetInstance(); _db->initialize(FLAGS_rocksdb_path, _real_local_shard_num); VLOG(0) << "initalize SSDSparseTable succ"; + VLOG(0) << "SSD FLAGS_pserver_print_missed_key_num_every_push:" + << FLAGS_pserver_print_missed_key_num_every_push; return 0; } @@ -45,7 +49,8 @@ int32_t SSDSparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -172,90 +177,139 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } -int32_t SSDSparseTable::PullSparsePtr(char** pull_values, - const uint64_t* keys, - size_t num) { +int32_t SSDSparseTable::PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* pull_keys, + size_t num, + uint16_t pass_id) { CostTimer timer("pserver_ssd_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = _value_accesor->GetAccessorInfo().mf_size / sizeof(float); { // 从table取值 or create - std::vector> tasks(_real_local_shard_num); - std::vector>> task_keys( - _real_local_shard_num); - for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; - task_keys[shard_id].push_back({keys[i], i}); - } + RocksDBCtx context; + std::vector> tasks; + RocksDBItem* cur_ctx = context.switch_item(); + cur_ctx->reset(); + FixedFeatureValue* ret = NULL; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; + float* data_buffer_ptr = data_buffer; - std::atomic missed_keys{0}; - for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { - tasks[shard_id] = + for (int i = 0; i < num; ++i) { + uint64_t key = pull_keys[i]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + cur_ctx->batch_index.push_back(i); + cur_ctx->batch_keys.push_back( + rocksdb::Slice((char*)&(pull_keys[i]), sizeof(uint64_t))); + if (cur_ctx->batch_keys.size() == 1024) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = + 
_shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); + return 0; + }); + cur_ctx = context.switch_item(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = *((uint64_t*)const_cast( + cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = + cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float( + cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + } + cur_ctx->reset(); + tasks.clear(); + tasks.push_back(std::move(fut)); + } + } else { + ret = itr.value_ptr(); + // int pull_data_idx = keys[i].second; + _value_accesor->UpdatePassId(ret->data(), pass_id); + pull_values[i] = (char*)ret; + } + } + if (cur_ctx->batch_keys.size() != 0) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( - [this, - shard_id, - &task_keys, - value_size, - mf_value_size, - pull_values, - &missed_keys]() -> int { - auto& keys = task_keys[shard_id]; - auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; // NOLINT - float* data_buffer_ptr = data_buffer; - for (size_t i = 0; i < keys.size(); ++i) { - uint64_t key = keys[i].first; - auto itr = local_shard.find(key); - size_t data_size = value_size - mf_value_size; - FixedFeatureValue* ret = NULL; - if (itr == local_shard.end()) { - // pull rocksdb - std::string tmp_string(""); - if (_db->get(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t), - tmp_string) > 0) { - ++missed_keys; - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - float* data_ptr = - const_cast(feature_value.data()); - _value_accesor->Create(&data_buffer_ptr, 1); - memcpy( - data_ptr, data_buffer_ptr, data_size * sizeof(float)); - ret = &feature_value; - } else { - data_size = tmp_string.size() / sizeof(float); - memcpy(data_buffer_ptr, - paddle::string::str_to_float(tmp_string), - data_size * sizeof(float)); - // from rocksdb to mem - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - memcpy(const_cast(feature_value.data()), - data_buffer_ptr, - data_size * sizeof(float)); - _db->del_data(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t)); - ret = &feature_value; - } - } else { - ret = itr.value_ptr(); - } - int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = reinterpret_cast(ret); - } + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + 
cur_ctx->batch_values.data(), + cur_ctx->status.data()); return 0; }); + tasks.push_back(std::move(fut)); } - for (int i = 0; i < _real_local_shard_num; ++i) { - tasks[i].wait(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); } - if (FLAGS_pserver_print_missed_key_num_every_push) { - LOG(WARNING) << "total pull keys:" << num - << " missed_keys:" << missed_keys.load(); + for (size_t x = 0; x < 2; x++) { + cur_ctx = context.switch_item(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = + *((uint64_t*)const_cast(cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy( + const_cast(feature_value.data()), + paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + cur_ctx->reset(); } } return 0; @@ -527,6 +581,7 @@ int64_t SSDSparseTable::LocalSize() { int32_t SSDSparseTable::Save(const std::string& path, const std::string& param) { + std::lock_guard guard(_table_mutex); if (_real_local_shard_num == 0) { _local_show_threshold = -1; return 0; @@ -537,15 +592,16 @@ int32_t SSDSparseTable::Save(const std::string& path, // } // LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "enable_sparse_table_cache: " - << _config.enable_sparse_table_cache(); - LOG(INFO) << "LocalSize: " << LocalSize(); + VLOG(0) << "table cache rate is: " << _config.sparse_table_cache_rate(); + VLOG(0) << "enable_sparse_table_cache: " + << _config.enable_sparse_table_cache(); + VLOG(0) << "LocalSize: " << LocalSize(); if (_config.enable_sparse_table_cache()) { - LOG(INFO) << "Enable sparse table cache, top n:" << _cache_tk_size; + VLOG(0) << "Enable sparse table cache, top n:" << _cache_tk_size; } _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); TopkCalculator tk(_real_local_shard_num, _cache_tk_size); + VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); _afs_client.remove(paddle::string::format_string( @@ -560,138 +616,141 @@ int32_t SSDSparseTable::Save(const std::string& path, std::atomic feasign_size_all{0}; // feasign_size = 0; - omp_set_num_threads(thread_num); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector< + paddle::framework::Channel>>> + fs_channel; + for (int i = 0; i < _real_local_shard_num; i++) { + fs_channel.push_back( + paddle::framework::MakeChannel>>( + 10240)); + } + std::vector threads; + threads.resize(_real_local_shard_num); + + auto save_func = [this, + &save_param, + &table_path, + &file_start_idx, + &fs_channel](int file_num) { + int err_no = 0; FsChannelConfig channel_config; if 
(_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string("%s/part-%03d-%05d.gz", table_path.c_str(), _shard_idx, - file_start_idx + i); + file_start_idx + file_num); } else { - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = + paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = _value_accesor->Converter(save_param).deconverter; - int err_no = 0; - int retry_num = 0; - bool is_write_failed = false; + auto write_channel = + _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + paddle::framework::ChannelReader>> + reader(fs_channel[file_num].get()); + std::pair> out_str; + while (reader >> out_str) { + std::string format_value = _value_accesor->ParseToString( + out_str.second.data(), out_str.second.size()); + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", out_str.first, format_value.c_str()))) { + LOG(FATAL) << "SSDSparseTable save failed, retry it! path:" + << channel_config.path; + } + } + write_channel->close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(save_func, i); + } + + std::vector< + paddle::framework::ChannelWriter>>> + writers(_real_local_shard_num); + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { int feasign_size = 0; auto& shard = _local_shards[i]; - do { - err_no = 0; - feasign_size = 0; - is_write_failed = false; - auto write_channel = - _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + auto& writer = writers[i]; + writer.Reset(fs_channel[i].get()); + { for (auto it = shard.begin(); it != shard.end(); ++it) { if (_config.enable_sparse_table_cache() && - (save_param == 1 || save_param == 2) && - _value_accesor->Save(it.value().data(), 4)) { - // tk.push(i, it.value().data()[2]); + (save_param == 1 || save_param == 2)) { + // get_field get right decayed show tk.push(i, _value_accesor->GetField(it.value().data(), "show")); } if (_value_accesor->Save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->ParseToString( - it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } + std::vector feature_value; + feature_value.resize(it.value().size()); + memcpy(const_cast(feature_value.data()), + it.value().data(), + it.value().size() * sizeof(float)); + writer << std::make_pair(it.key(), std::move(feature_value)); ++feasign_size; } } + } - if (err_no == -1 && !is_write_failed) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - continue; - } - - // delta and cache and revert is all in mem, base in rocksdb - if (save_param != 1) { - auto* it = _db->get_iterator(i); - for (it->SeekToFirst(); it->Valid(); it->Next()) { - bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); - _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); - if (need_save) { - std::string format_value = _value_accesor->ParseToString( - paddle::string::str_to_float(it->value().data()), - it->value().size() / sizeof(float)); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", - *((uint64_t*)const_cast(it->key().data())), - format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } - if (save_param == 3) { - _db->put(i, - it->key().data(), - it->key().size(), - it->value().data(), - it->value().size()); - } - ++feasign_size; - } + if (save_param != 1) { + auto* it = _db->get_iterator(i); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + bool need_save = _value_accesor->Save( + paddle::string::str_to_float(it->value().data()), save_param); + _value_accesor->UpdateStatAfterSave( + paddle::string::str_to_float(it->value().data()), save_param); + if (need_save) { + std::vector feature_value; + feature_value.resize(it->value().size() / sizeof(float)); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float(it->value().data()), + it->value().size()); + writer << std::make_pair( + *((uint64_t*)const_cast(it->key().data())), + std::move(feature_value)); + ++feasign_size; } - delete it; } + delete it; + } - write_channel->close(); - if (err_no == -1) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - } - } while (is_write_failed); + writer.Flush(); + fs_channel[i]->Close(); feasign_size_all += feasign_size; for (auto it = shard.begin(); it != shard.end(); ++it) { _value_accesor->UpdateStatAfterSave(it.value().data(), save_param); } } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < fs_channel.size(); i++) { + fs_channel[i].reset(); + } + fs_channel.clear(); + if (save_param == 3) { - UpdateTable(); + // UpdateTable(); _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); - LOG(INFO) << "SSDSparseTable update success."; - } - LOG(INFO) << "SSDSparseTable save success, path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) - << " from " << file_start_idx << " to " - << file_start_idx + _real_local_shard_num - 1; - // return feasign_size_all; + VLOG(0) << "SSDSparseTable update success."; + } + VLOG(0) << "SSDSparseTable save success, feasign size:" << feasign_size_all + << ", path:" + << paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) + << " from " << file_start_idx << " to " + << file_start_idx + _real_local_shard_num - 1; _local_show_threshold = tk.top(); - LOG(INFO) << "local cache threshold: " << _local_show_threshold; - // int32 may overflow need to change return value + VLOG(0) << "local cache threshold: " << _local_show_threshold; return 0; } @@ -862,7 +921,167 @@ int32_t SSDSparseTable::SaveCache( int32_t SSDSparseTable::Load(const std::string& path, const std::string& param) { - return MemorySparseTable::Load(path, param); + VLOG(0) << "LOAD FLAGS_rocksdb_path:" << FLAGS_rocksdb_path; + std::string table_path = TableDir(path); + auto file_list = _afs_client.list(table_path); + + // std::sort(file_list.begin(), file_list.end()); + for (auto file : file_list) { + VLOG(1) << "SSDSparseTable::Load() file list: " << file; + } + + int load_param = atoi(param.c_str()); + size_t expect_shard_num = _sparse_table_shard_num; + if (file_list.size() != expect_shard_num) { + LOG(WARNING) << "SSDSparseTable file_size:" << file_list.size() + << " not equal to expect_shard_num:" << expect_shard_num; + return -1; + } + if (file_list.size() == 0) { + LOG(WARNING) << "SSDSparseTable load file is empty, path:" << path; + return -1; + } + + size_t file_start_idx = _shard_idx * _avg_local_shard_num; + + if (file_start_idx >= file_list.size()) { + return 0; + } + + size_t feature_value_size = + _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + +#ifdef PADDLE_WITH_HETERPS + int thread_num = _real_local_shard_num; +#else + int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; +#endif + + for (int i = 0; i < _real_local_shard_num; i++) { + _fs_channel.push_back(paddle::framework::MakeChannel(30000)); + } + + std::vector threads; + threads.resize(thread_num); + auto load_func = [this, &file_start_idx, &file_list, &load_param]( + int file_num) { + int err_no = 0; + FsChannelConfig channel_config; + channel_config.path = file_list[file_num + file_start_idx]; + VLOG(1) << "SSDSparseTable::load begin load " << channel_config.path + << " into local shard " << file_num; + channel_config.converter = _value_accesor->Converter(load_param).converter; + channel_config.deconverter = + _value_accesor->Converter(load_param).deconverter; + + std::string line_data; + auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); + paddle::framework::ChannelWriter writer( + _fs_channel[file_num].get()); + while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { + writer << line_data; + } + writer.Flush(); + read_channel->close(); + _fs_channel[file_num]->Close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(load_func, i); + } + + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector> ssd_keys; + std::vector> ssd_values; + std::vector tmp_key; + ssd_keys.reserve(FLAGS_pserver_load_batch_size); + ssd_values.reserve(FLAGS_pserver_load_batch_size); + tmp_key.reserve(FLAGS_pserver_load_batch_size); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + std::string line_data; + char* end = NULL; + int local_shard_id = i % _avg_local_shard_num; + auto& shard = _local_shards[local_shard_id]; + float data_buffer[FLAGS_pserver_load_batch_size * feature_value_size]; + float* data_buffer_ptr = data_buffer; + uint64_t mem_count = 0; + uint64_t ssd_count = 0; + uint64_t mem_mf_count = 0; + uint64_t ssd_mf_count = 0; + + paddle::framework::ChannelReader reader(_fs_channel[i].get()); + + while (reader >> line_data) { + uint64_t key = std::strtoul(line_data.data(), &end, 10); + if (FLAGS_pserver_open_strict_check) { + if (key % _sparse_table_shard_num != (i + file_start_idx)) { + LOG(WARNING) << "SSDSparseTable key:" << key << " not match shard," + << " file_idx:" << i + << " shard num:" << _sparse_table_shard_num; + continue; + } + } + size_t value_size = + _value_accesor->ParseFromString(++end, data_buffer_ptr); + // ssd or mem + if (_value_accesor->SaveSSD(data_buffer_ptr)) { + tmp_key.emplace_back(key); + ssd_keys.emplace_back( + std::make_pair((char*)&tmp_key.back(), sizeof(uint64_t))); + ssd_values.emplace_back( + std::make_pair((char*)data_buffer_ptr, value_size * sizeof(float))); + data_buffer_ptr += feature_value_size; + if (static_cast(ssd_keys.size()) == + FLAGS_pserver_load_batch_size) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + data_buffer_ptr = data_buffer; + } + ssd_count++; + if (value_size > feature_value_size - mf_value_size) { + ssd_mf_count++; + } + } else { + auto& value = shard[key]; + value.resize(value_size); + _value_accesor->ParseFromString(end, value.data()); + mem_count++; + if (value_size > feature_value_size - mf_value_size) { + mem_mf_count++; + } + } + } + // last batch + if (ssd_keys.size() > 0) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + } + + _db->flush(local_shard_id); + VLOG(0) << "Table>> load done. 
ALL[" << mem_count + ssd_count << "] MEM[" + << mem_count << "] MEM_MF[" << mem_mf_count << "] SSD[" << ssd_count + << "] SSD_MF[" << ssd_mf_count << "]."; + } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < _fs_channel.size(); i++) { + _fs_channel[i].reset(); + } + _fs_channel.clear(); + LOG(INFO) << "load num:" << LocalSize(); + LOG(INFO) << "SSDSparseTable load success, path from " + << file_list[file_start_idx] << " to " + << file_list[file_start_idx + _real_local_shard_num - 1]; + + _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); + return 0; } //加载path目录下数据[start_idx, end_idx) @@ -882,7 +1101,11 @@ int32_t SSDSparseTable::Load(size_t start_idx, end_idx = static_cast(end_idx) < _sparse_table_shard_num ? end_idx : _sparse_table_shard_num; +#ifdef PADDLE_WITH_HETERPS + int thread_num = end_idx - start_idx; +#else int thread_num = (end_idx - start_idx) < 20 ? (end_idx - start_idx) : 20; +#endif omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (size_t i = start_idx; i < end_idx; ++i) { @@ -999,5 +1222,163 @@ int32_t SSDSparseTable::Load(size_t start_idx, return 0; } +std::pair SSDSparseTable::PrintTableStat() { + int64_t feasign_size = LocalSize(); + return {feasign_size, -1}; +} + +int32_t SSDSparseTable::CacheTable(uint16_t pass_id) { + std::lock_guard guard(_table_mutex); + VLOG(0) << "cache_table"; + std::atomic count{0}; + auto thread_num = _real_local_shard_num; + std::vector> tasks; + + double show_threshold = 10000000; + + //保证cache数据不被淘汰掉 + if (_config.enable_sparse_table_cache()) { + if (_local_show_threshold < show_threshold) { + show_threshold = _local_show_threshold; + } + } + + if (show_threshold < 500) { + show_threshold = 500; + } + VLOG(0) << " show_threshold:" << show_threshold + << " ; local_show_threshold:" << _local_show_threshold; + VLOG(0) << "Table>> origin mem feasign size:" << LocalSize(); + static int cache_table_count = 0; + ++cache_table_count; + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + // from mem to ssd + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [shard_id, this, &count, show_threshold, pass_id]() -> int { + rocksdb::Options options; + options.comparator = _db->get_comparator(); + rocksdb::BlockBasedTableOptions bbto; + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.write_buffer_size = 64 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 4; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + + auto& shard = _local_shards[shard_id]; + if (1) { + using DataType = shard_type::map_type::iterator; + std::vector datas; + datas.reserve(shard.size() * 0.8); + size_t idx = 0; 
+ for (auto it = shard.begin(); it != shard.end(); ++it) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + datas.emplace_back(it.it); + } + } + count.fetch_add(datas.size(), std::memory_order_relaxed); + VLOG(0) << "datas size: " << datas.size(); + { + // sst文件写入必须有序 + uint64_t show_begin = butil::gettimeofday_ms(); + std::sort(datas.begin(), + datas.end(), + [](const DataType& a, const DataType& b) { + return a->first < b->first; + }); + VLOG(0) << "sort shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin + << " ms, num: " << datas.size(); + } + + //必须做空判断,否则sst_writer.Finish会core掉 + if (datas.size() != 0) { + rocksdb::SstFileWriter sst_writer(rocksdb::EnvOptions(), options); + std::string filename = + paddle::string::format_string("%s_%d/cache-%05d.sst", + FLAGS_rocksdb_path.c_str(), + shard_id, + cache_table_count); + rocksdb::Status status = sst_writer.Open(filename); + if (!status.ok()) { + VLOG(0) << "sst writer open " << filename << "failed" + << ", " << status.getState(); + abort(); + } + VLOG(0) << "sst writer open " << filename; + + uint64_t show_begin = butil::gettimeofday_ms(); + for (auto& data : datas) { + uint64_t tmp_key = data->first; + FixedFeatureValue& tmp_value = + *((FixedFeatureValue*)(void*)(data->second)); + status = sst_writer.Put( + rocksdb::Slice((char*)(&(tmp_key)), sizeof(uint64_t)), + rocksdb::Slice((char*)(tmp_value.data()), + tmp_value.size() * sizeof(float))); + if (!status.ok()) { + VLOG(0) << "fatal in Put file: " << filename << ", " + << status.getState(); + abort(); + } + } + status = sst_writer.Finish(); + if (!status.ok()) { + VLOG(0) << "fatal in finish file: " << filename << ", " + << status.getState(); + abort(); + } + VLOG(0) << "write sst_file shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin << " ms"; + int ret = _db->ingest_externel_file(shard_id, {filename}); + if (ret) { + VLOG(0) << "ingest file failed" + << ", " << status.getState(); + abort(); + } + } + + for (auto it = shard.begin(); it != shard.end();) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + it = shard.erase(it); + } else { + ++it; + } + } + } + return 0; + }); + tasks.push_back(std::move(fut)); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + tasks.clear(); + + VLOG(0) << "Table>> cache ssd count: " << count.load(); + VLOG(0) << "Table>> after update, mem feasign size:" << LocalSize(); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 55a05bbab5ec24..8f281b5a4bffb0 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -38,7 +38,11 @@ class SSDSparseTable : public MemorySparseTable { int32_t Push(TableContext& context) override; int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); @@ -77,10 +81,16 @@ class SSDSparseTable : public MemorySparseTable { const std::string& param); int64_t LocalSize(); + std::pair PrintTableStat() override; + + int32_t 
CacheTable(uint16_t pass_id) override; + private: RocksDBHandler* _db; int64_t _cache_tk_size; double _local_show_threshold{0.0}; + std::vector> _fs_channel; + std::mutex _table_mutex; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index aee707712f6629..f07a3f2132217e 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -62,6 +62,8 @@ struct TableContext { size_t num; bool use_ptr = false; uint32_t trainer_id; // for GEO and global step + int shard_id; // for gpups + uint16_t pass_id; // for gpups ssd }; class Table { @@ -147,6 +149,7 @@ class Table { virtual void *GetShard(size_t shard_idx) = 0; virtual std::pair PrintTableStat() { return {0, 0}; } + virtual int32_t CacheTable(uint16_t pass_id) { return 0; } // for patch model virtual void Revert() {} diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 5df74883f9247f..db0dcf0605dc70 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -748,6 +748,17 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { } } +void FleetWrapper::SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold) { + auto ret = worker_ptr_->SaveCacheTable(table_id, pass_id, threshold); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "save cache table stat failed"; + } +} + void FleetWrapper::ShrinkSparseTable(int table_id, int threshold) { auto ret = worker_ptr_->Shrink(table_id, std::to_string(threshold)); ret.wait(); diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h old mode 100755 new mode 100644 index 28347b3502707c..5065fb380a3464 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -242,6 +242,9 @@ class FleetWrapper { void BarrierWithTable(uint32_t barrier_type); void PrintTableStat(const uint64_t table_id); + void SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 47dc8abf402b9c..fe6c51b87228aa 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -353,7 +353,7 @@ class ChannelReader { } if (cursor_ >= buffer_.size()) { cursor_ = 0; - if (channel_->read(buffer_) == 0) { + if (channel_->Read(buffer_) == 0) { failed_ = true; return *this; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index c88c91f166112d..0181ea9eb062b4 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2118,6 +2118,14 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } } +void SlotRecordInMemoryDataFeed::InitGraphResource() { + gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_); +} + +void SlotRecordInMemoryDataFeed::InitGraphTrainResource() { + gpu_graph_data_generator_.AllocTrainResource(thread_id_); +} + void SlotRecordInMemoryDataFeed::LoadIntoMemory() { VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; if (!so_parser_name_.empty()) { @@ -2654,7 +2662,7 @@ bool SlotRecordInMemoryDataFeed::Start() { pack_ = 
BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); #endif #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) - gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); + gpu_graph_data_generator_.SetFeedVec(feed_vec_); #endif return true; } @@ -2696,6 +2704,12 @@ int SlotRecordInMemoryDataFeed::Next() { #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) +void SlotRecordInMemoryDataFeed::DoWalk() { + gpu_graph_data_generator_.DoWalk(); +} +#endif + #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { int offset_cols_size = (ins_num + 1); diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 94e478f8c58af9..a90e853e65deaf 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -19,20 +19,19 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include -#include #include #include #include #include "cub/cub.cuh" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" - -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/unique_kernel.h" -#include "paddle/phi/kernels/graph_reindex_kernel.h" -#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" DECLARE_bool(enable_opt_get_features); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { @@ -41,6 +40,11 @@ namespace framework { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) +#define DEBUG_STATE(state) \ + VLOG(2) << "left: " << state->left << " right: " << state->right \ + << " central_word: " << state->central_word \ + << " step: " << state->step << " cursor: " << state->cursor \ + << " len: " << state->len << " row_num: " << state->row_num; \ // CUDA: use 512 threads per block const int CUDA_NUM_THREADS = 512; // CUDA: number of blocks for threads. 
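A minimal usage sketch for the launch helpers above: the grid-stride CUDA_KERNEL_LOOP macro together with the conventional block-count helper that pairs with CUDA_NUM_THREADS. The GET_BLOCKS definition and the kernel below are assumptions shown only to make the launch pattern concrete; the patch's own kernels (GraphFillCVMKernel, GraphFillIdKernel, and so on) are launched the same way on an explicit stream.

// Assumed standard form of the block-count helper used with CUDA_NUM_THREADS.
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Illustrative kernel: each thread strides over the range via CUDA_KERNEL_LOOP.
__global__ void FillOnesKernel(int64_t* out, int n) {
  CUDA_KERNEL_LOOP(idx, n) { out[idx] = 1; }
}

// Launch on a given stream:
//   FillOnesKernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(d_out, n);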
@@ -85,26 +89,6 @@ __global__ void FillSlotValueOffsetKernel(const int ins_num, } } -__global__ void fill_actual_neighbors(int64_t* vals, - int64_t* actual_vals, - int64_t* actual_vals_dst, - int* actual_sample_size, - int* cumsum_actual_sample_size, - int sample_size, - int len, - int mod) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - int offset1 = cumsum_actual_sample_size[i]; - int offset2 = sample_size * i; - int dst_id = i % mod; - for (int j = 0; j < actual_sample_size[i]; j++) { - actual_vals[offset1 + j] = vals[offset2 + j]; - actual_vals_dst[offset1 + j] = dst_id; - } - } -} - void SlotRecordInMemoryDataFeed::FillSlotValueOffset( const int ins_num, const int used_slot_num, @@ -233,13 +217,13 @@ __global__ void CopyDuplicateKeys(int64_t *dist_tensor, int GraphDataGenerator::AcquireInstance(BufState *state) { // if (state->GetNextStep()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextCentrolWord()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextBatch()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } return 0; @@ -371,64 +355,157 @@ __global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { CUDA_KERNEL_LOOP(idx, len) { id_tensor[idx] = idx; } } -int GraphDataGenerator::FillInsBuf() { - if (ins_buf_pair_len_ >= batch_size_) { - return batch_size_; +int GraphDataGenerator::FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor) { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + if (gpu_graph_training) { + uint64_t *ins_cursor, *ins_buf; + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, + ins_cursor, + sizeof(uint64_t) * total_instance, + cudaMemcpyDeviceToDevice, + train_stream_); + } else { + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[cursor]->ptr()); + d_type_keys += infer_node_start_; + infer_node_start_ += total_instance / 2; + CopyDuplicateKeys<<>>( + id_tensor_ptr_, d_type_keys, total_instance / 2); } - int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + return 0; +} - if (total_instance == 0) { - int res = FillWalkBuf(d_walk_); - if (!res) { - // graph iterate complete - return -1; - } else { - total_instance = buf_state_.len; - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); - // if (total_instance == 0) { - // return -1; - //} +int GraphDataGenerator::FillGraphSlotFeature(int total_instance, + bool gpu_graph_training) { + int64_t *slot_tensor_ptr_[slot_num_]; + int64_t *slot_lod_tensor_ptr_[slot_num_]; + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( + {total_instance * h_slot_feature_num_map_[i], 1}, this->place_); + slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( + {total_instance + 1}, this->place_); + } + uint64_t *ins_cursor, *ins_buf; + if (gpu_graph_training) { + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + 
} else { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + ins_cursor = (uint64_t *)id_tensor_ptr_; + } + + cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), + slot_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), + slot_lod_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + FillFeatureBuf(ins_cursor, feature_buf, total_instance); + GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), + feature_buf, + total_instance * fea_num_per_node_, + total_instance, + slot_num_, + (int*)d_slot_feature_num_map_->ptr(), + fea_num_per_node_, + (int*)d_actual_slot_id_map_->ptr(), + (int*)d_fea_offset_map_->ptr()); + GraphFillSlotLodKernelOpt<<>>( + (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), + (total_instance + 1) * slot_num_, + total_instance + 1, + (int*)d_slot_feature_num_map_->ptr()); + if (debug_mode_) { + uint64_t h_walk[total_instance]; + cudaMemcpy(h_walk, + ins_cursor, + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[total_instance * slot_num_ * fea_num_per_node_]; + cudaMemcpy(h_feature, + feature_buf, + total_instance * fea_num_per_node_ * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int i = 0; i < total_instance; ++i) { + std::stringstream ss; + for (int j = 0; j < fea_num_per_node_; ++j) { + ss << h_feature[i * fea_num_per_node_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i + << "] = " << (uint64_t)h_walk[i] << " feature[" << i * fea_num_per_node_ + << ".." << (i + 1) * fea_num_per_node_ << "] = " << ss.str(); } - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - FillFeatureBuf(d_walk_, d_feature_); - if (debug_mode_) { - int len = buf_size_ > 5000 ? 5000 : buf_size_; - uint64_t h_walk[len]; - cudaMemcpy(h_walk, - d_walk_->ptr(), - len * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[len * slot_num_]; - cudaMemcpy(h_feature, - d_feature_->ptr(), - len * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < len; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } + uint64_t h_slot_tensor[fea_num_per_node_][total_instance]; + uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], + slot_tensor_ptr_[i], + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000 ? 5000 : total_instance; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j + << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], + slot_lod_tensor_ptr_[i], + (total_instance + 1) * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j + << "] = " << h_slot_lod_tensor[i][j]; } } } + return 0; +} +int GraphDataGenerator::MakeInsPair() { uint64_t *walk = reinterpret_cast(d_walk_->ptr()); uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); int *random_row = reinterpret_cast(d_random_row_->ptr()); int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), train_stream_); int len = buf_state_.len; - GraphFillIdKernel<<>>( + // make pair + GraphFillIdKernel<<>>( ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, @@ -438,29 +515,12 @@ int GraphDataGenerator::FillInsBuf() { len, walk_len_); int h_pair_num; - cudaMemcpyAsync( - &h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - uint64_t *feature = reinterpret_cast(d_feature_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); - int len = buf_state_.len; - VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ - << "] len[" << len << "]"; - GraphFillFeatureKernel<<>>( - feature_buf + ins_buf_pair_len_ * 2 * slot_num_, - d_pair_num, - walk, - feature, - random_row + buf_state_.cursor, - buf_state_.central_word, - window_step_[buf_state_.step], - len, - walk_len_, - slot_num_); - } - - cudaStreamSynchronize(stream_); + cudaMemcpyAsync(&h_pair_num, + d_pair_num, + sizeof(int), + cudaMemcpyDeviceToHost, + train_stream_); + cudaStreamSynchronize(train_stream_); ins_buf_pair_len_ += h_pair_num; if (debug_mode_) { @@ -474,357 +534,41 @@ int GraphDataGenerator::FillInsBuf() { for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; } - delete[] h_ins_buf; - - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = - reinterpret_cast(d_feature_buf_->ptr()); - uint64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; - cudaMemcpy(h_feature_buf, - feature_buf, - (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { - VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; - } - } } return ins_buf_pair_len_; } -std::vector> GraphDataGenerator::SampleNeighbors( - int64_t* uniq_nodes, int len, int sample_size, - std::vector& edges_split_num, int64_t* neighbor_len) { - - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - - auto sample_res = gpu_graph_ptr->graph_neighbor_sample_all_edge_type( - gpuid_, edge_to_id_len_, (uint64_t*)(uniq_nodes), sample_size, len, - edge_type_graph_); - - int* all_sample_count_ptr = - reinterpret_cast(sample_res.actual_sample_size_mem->ptr()); - - auto cumsum_actual_sample_size = - memory::Alloc(place_, (len * edge_to_id_len_ + 1) * sizeof(int)); - int* cumsum_actual_sample_size_ptr = - reinterpret_cast(cumsum_actual_sample_size->ptr()); - cudaMemsetAsync(cumsum_actual_sample_size_ptr, - 0, - (len * edge_to_id_len_ + 1) * sizeof(int), - stream_); - - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, - temp_storage_bytes, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); - 
CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), - temp_storage_bytes, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - stream_)); - cudaStreamSynchronize(stream_); - - edges_split_num.resize(edge_to_id_len_); - for (int i = 0; i < edge_to_id_len_; i++) { - cudaMemcpyAsync( - edges_split_num.data() + i, - cumsum_actual_sample_size_ptr + (i + 1) * len, - sizeof(int), - cudaMemcpyDeviceToHost, - stream_); +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; } - CUDA_CHECK(cudaStreamSynchronize(stream_)); - int all_sample_size = edges_split_num[edge_to_id_len_ - 1]; - auto final_sample_val = - memory::AllocShared(place_, all_sample_size * sizeof(int64_t)); - auto final_sample_val_dst = - memory::AllocShared(place_, all_sample_size * sizeof(int64_t)); - int64_t* final_sample_val_ptr = - reinterpret_cast(final_sample_val->ptr()); - int64_t* final_sample_val_dst_ptr = - reinterpret_cast(final_sample_val_dst->ptr()); - int64_t* all_sample_val_ptr = - reinterpret_cast(sample_res.val_mem->ptr()); - fill_actual_neighbors<<>>(all_sample_val_ptr, - final_sample_val_ptr, - final_sample_val_dst_ptr, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr, - sample_size, - len * edge_to_id_len_, - len); - *neighbor_len = all_sample_size; - cudaStreamSynchronize(stream_); - - std::vector> sample_results; - sample_results.emplace_back(final_sample_val); - sample_results.emplace_back(final_sample_val_dst); - return sample_results; -} - -std::shared_ptr GraphDataGenerator::GetReindexResult( - int64_t* reindex_src_data, const int64_t* center_nodes, int* final_nodes_len, - int node_len, int64_t neighbor_len) { + int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << gpuid_ << ": Enter GetReindexResult Function"; - const phi::GPUContext& dev_ctx_ = - *(static_cast( - platform::DeviceContextPool::Instance().Get(place_))); - - // Reset reindex table - int64_t* d_reindex_table_key_ptr = - reinterpret_cast(d_reindex_table_key_->ptr()); - int* d_reindex_table_value_ptr = - reinterpret_cast(d_reindex_table_value_->ptr()); - int* d_reindex_table_index_ptr = - reinterpret_cast(d_reindex_table_index_->ptr()); - - VLOG(2) << gpuid_ << ": ResetReindexTable With -1"; - // Fill table with -1. 
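The prefix-sum calls in SampleNeighbors above and in FillOneStep further below both follow CUB's two-phase pattern: query the temp-storage size with a null pointer, allocate that much scratch, then run the scan for real. A standalone sketch of that pattern, using raw cudaMalloc instead of Paddle's memory::Alloc; the buffer names are illustrative.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// d_counts: n per-row counts on the device; d_prefix: n+1 ints with d_prefix[0] already 0.
void InclusivePrefixSum(const int* d_counts, int* d_prefix, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Pass a null temp buffer first: CUB only reports the required temp storage size.
  cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_counts, d_prefix + 1, n, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_prefix + 1, n, stream);
  cudaStreamSynchronize(stream);  // after this, d_prefix[n] holds the total count
  cudaFree(d_temp);
}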
- cudaMemsetAsync(d_reindex_table_key_ptr, -1, - reindex_table_size_ * sizeof(int64_t), stream_); - cudaMemsetAsync(d_reindex_table_value_ptr, -1, - reindex_table_size_ * sizeof(int), stream_); - cudaMemsetAsync(d_reindex_table_index_ptr, -1, - reindex_table_size_ * sizeof(int), stream_); - - VLOG(2) << gpuid_ << ": Alloc all_nodes"; - auto all_nodes = - memory::AllocShared(place_, (node_len + neighbor_len) * sizeof(int64_t)); - int64_t* all_nodes_data = reinterpret_cast(all_nodes->ptr()); - - VLOG(2) << gpuid_ << ": cudaMemcpy all_nodes_data"; - cudaMemcpy(all_nodes_data, center_nodes, sizeof(int64_t) * node_len, - cudaMemcpyDeviceToDevice); - cudaMemcpy(all_nodes_data + node_len, reindex_src_data, sizeof(int64_t) * neighbor_len, - cudaMemcpyDeviceToDevice); - - cudaStreamSynchronize(stream_); - VLOG(2) << gpuid_ << ": Run phi::FillHashTable"; - auto final_nodes = - phi::FillHashTable(dev_ctx_, all_nodes_data, - node_len + neighbor_len, - reindex_table_size_, - d_reindex_table_key_ptr, - d_reindex_table_value_ptr, - d_reindex_table_index_ptr, - final_nodes_len); - - VLOG(2) << gpuid_ << ": Run phi::ReindexSrcOutput"; - phi::ReindexSrcOutput<<>>(reindex_src_data, neighbor_len, - reindex_table_size_, - d_reindex_table_key_ptr, - d_reindex_table_value_ptr); - return final_nodes; -} + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); -std::shared_ptr GraphDataGenerator::GenerateSampleGraph( - uint64_t* node_ids, int len, int* final_len, phi::DenseTensor* inverse) { - - const phi::GPUContext& dev_ctx_ = - *(static_cast( - platform::DeviceContextPool::Instance().Get(place_))); - - VLOG(2) << "Get Unique Nodes"; - phi::DenseTensor in_x = phi::Empty(dev_ctx_, {len}); - cudaMemcpy(in_x.data(), node_ids, len * sizeof(uint64_t), - cudaMemcpyDeviceToDevice); - - phi::DenseTensor uniq_nodes, index; - std::vector axis; - phi::UniqueKernel(dev_ctx_, in_x, false, true, - false, axis, phi::DataType::INT32, &uniq_nodes, &index, inverse, &index); - - int64_t* uniq_nodes_data = uniq_nodes.data(); - int uniq_len = uniq_nodes.numel(); - int len_samples = samples_.size(); - - int *num_nodes_tensor_ptr_[len_samples]; - int *next_num_nodes_tensor_ptr_[len_samples]; - int64_t *edges_src_tensor_ptr_[len_samples]; - int64_t *edges_dst_tensor_ptr_[len_samples]; - int *edges_split_tensor_ptr_[len_samples]; - - VLOG(2) << "Sample Neighbors and Reindex"; - std::vector edges_split_num; - std::vector> final_nodes_vec; - std::vector final_nodes_len_vec; - - for (int i = 0; i < len_samples; i++) { - - edges_split_num.clear(); - std::shared_ptr neighbors, reindex_dst; - int64_t neighbors_len = 0; - if (i == 0) { - auto sample_results = - SampleNeighbors(uniq_nodes_data, uniq_len, samples_[i], edges_split_num, - &neighbors_len); - neighbors = sample_results[0]; - reindex_dst = sample_results[1]; - edges_split_num.push_back(uniq_len); - } else { - int64_t* final_nodes_data = - reinterpret_cast(final_nodes_vec[i - 1]->ptr()); - auto sample_results = - SampleNeighbors(final_nodes_data, final_nodes_len_vec[i - 1], - samples_[i], edges_split_num, &neighbors_len); - neighbors = sample_results[0]; - reindex_dst = sample_results[1]; - edges_split_num.push_back(final_nodes_len_vec[i - 1]); - } - - int64_t* reindex_src_data = reinterpret_cast(neighbors->ptr()); - int64_t* reindex_dst_data = reinterpret_cast(reindex_dst->ptr()); - int final_nodes_len = 0; - if (i == 0) { - auto tmp_final_nodes = - GetReindexResult(reindex_src_data, uniq_nodes_data, &final_nodes_len, - uniq_len, neighbors_len); - 
final_nodes_vec.emplace_back(tmp_final_nodes); - final_nodes_len_vec.emplace_back(final_nodes_len); - } else { - int64_t* final_nodes_data = - reinterpret_cast(final_nodes_vec[i - 1]->ptr()); - auto tmp_final_nodes = - GetReindexResult(reindex_src_data, final_nodes_data, &final_nodes_len, - final_nodes_len_vec[i - 1], neighbors_len); - final_nodes_vec.emplace_back(tmp_final_nodes); - final_nodes_len_vec.emplace_back(final_nodes_len); - } - - int offset = 3 + 2 * slot_num_ + 5 * i; - num_nodes_tensor_ptr_[i] = - feed_vec_[offset]->mutable_data({1}, this->place_); - next_num_nodes_tensor_ptr_[i] = - feed_vec_[offset + 1]->mutable_data({1}, this->place_); - edges_src_tensor_ptr_[i] = - feed_vec_[offset + 2]->mutable_data({neighbors_len, 1}, this->place_); - edges_dst_tensor_ptr_[i] = - feed_vec_[offset + 3]->mutable_data({neighbors_len, 1}, this->place_); - edges_split_tensor_ptr_[i] = - feed_vec_[offset + 4]->mutable_data({edge_to_id_len_}, this->place_); - - cudaMemcpyAsync(num_nodes_tensor_ptr_[i], final_nodes_len_vec.data() + i, - sizeof(int), cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(next_num_nodes_tensor_ptr_[i], edges_split_num.data() + edge_to_id_len_, - sizeof(int), cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(edges_split_tensor_ptr_[i], edges_split_num.data(), - sizeof(int) * edge_to_id_len_, cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(edges_src_tensor_ptr_[i], reindex_src_data, - sizeof(int64_t) * neighbors_len, cudaMemcpyDeviceToDevice, stream_); - cudaMemcpyAsync(edges_dst_tensor_ptr_[i], reindex_dst_data, - sizeof(int64_t) * neighbors_len, cudaMemcpyDeviceToDevice, stream_); - - cudaStreamSynchronize(stream_); + if (total_instance == 0) { + return -1; } - - *final_len = final_nodes_len_vec[len_samples - 1]; - return final_nodes_vec[len_samples - 1]; + return MakeInsPair(); } int GraphDataGenerator::GenerateBatch() { int total_instance = 0; platform::CUDADeviceGuard guard(gpuid_); int res = 0; - - std::shared_ptr final_sage_nodes; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); if (!gpu_graph_training_) { - while (cursor_ < h_device_keys_.size()) { - size_t device_key_size = h_device_keys_[cursor_]->size(); - if (infer_node_type_start_[cursor_] >= device_key_size) { - cursor_++; - continue; - } - total_instance = - (infer_node_type_start_[cursor_] + batch_size_ <= device_key_size) - ? 
batch_size_ - : device_key_size - infer_node_type_start_[cursor_]; - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[cursor_]->ptr()); - d_type_keys += infer_node_type_start_[cursor_]; - infer_node_type_start_[cursor_] += total_instance; - VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ - << " instance = " << total_instance; - total_instance *= 2; - if (!sage_mode_) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - CopyDuplicateKeys<<>>( - id_tensor_ptr_, d_type_keys, total_instance / 2); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - auto node_buf = memory::AllocShared( - place_, total_instance * sizeof(uint64_t)); - int64_t* node_buf_ptr = reinterpret_cast(node_buf->ptr()); - VLOG(1) << "copy center keys"; - CopyDuplicateKeys<<>>( - node_buf_ptr, d_type_keys, total_instance / 2); - phi::DenseTensor inverse_; - VLOG(1) << "generate sample graph"; - uint64_t* node_buf_ptr_ = reinterpret_cast(node_buf->ptr()); - final_sage_nodes = - GenerateSampleGraph(node_buf_ptr_, total_instance, &uniq_instance_, - &inverse_); - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({uniq_instance_, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({uniq_instance_}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({uniq_instance_}, this->place_); - int index_offset = 3 + slot_num_ * 2 + 5 * samples_.size(); - index_tensor_ptr_ = - feed_vec_[index_offset]->mutable_data({total_instance}, this->place_); - - VLOG(1) << "copy id and index"; - cudaMemcpy(id_tensor_ptr_, final_sage_nodes->ptr(), - sizeof(int64_t) * uniq_instance_, - cudaMemcpyDeviceToDevice); - cudaMemcpy(index_tensor_ptr_, inverse_.data(), sizeof(int) * total_instance, - cudaMemcpyDeviceToDevice); - GraphFillCVMKernel<<>>( - show_tensor_ptr_, uniq_instance_); - GraphFillCVMKernel<<>>( - clk_tensor_ptr_, uniq_instance_); - } - break; - } + total_instance = (infer_node_start_ + batch_size_ <= infer_node_end_) + ? batch_size_ + : infer_node_end_ - infer_node_start_; + VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ + << " instance = " << total_instance; + total_instance *= 2; if (total_instance == 0) { return 0; } + FillIdShowClkTensor(total_instance, gpu_graph_training_, cursor_); } else { while (ins_buf_pair_len_ < batch_size_) { res = FillInsBuf(); @@ -839,192 +583,17 @@ int GraphDataGenerator::GenerateBatch() { total_instance = ins_buf_pair_len_ < batch_size_ ? 
ins_buf_pair_len_ : batch_size_; total_instance *= 2; - } - - uint64_t *ins_cursor, *ins_buf; - phi::DenseTensor inverse; - if (gpu_graph_training_) { VLOG(2) << "total_instance: " << total_instance << ", ins_buf_pair_len = " << ins_buf_pair_len_; - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - if (!sage_mode_) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - cudaMemcpyAsync(id_tensor_ptr_, - ins_cursor, - sizeof(uint64_t) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - VLOG(2) << gpuid_ << " " << "Ready to enter GenerateSampleGraph"; - final_sage_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_, - &inverse); - VLOG(2) << "Copy Final Results"; - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({uniq_instance_, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({uniq_instance_}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({uniq_instance_}, this->place_); - int index_offset = 3 + slot_num_ * 2 + 5 * samples_.size(); - index_tensor_ptr_ = - feed_vec_[index_offset]->mutable_data({total_instance}, this->place_); - - cudaMemcpyAsync(id_tensor_ptr_, - final_sage_nodes->ptr(), - sizeof(int64_t) * uniq_instance_, - cudaMemcpyDeviceToDevice, - stream_); - cudaMemcpyAsync(index_tensor_ptr_, - inverse.data(), - sizeof(int) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - GraphFillCVMKernel<<>>(show_tensor_ptr_, uniq_instance_); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, uniq_instance_); - - } - } else { - ins_cursor = (uint64_t *)id_tensor_ptr_; + FillIdShowClkTensor(total_instance, gpu_graph_training_); } - int64_t *slot_tensor_ptr_[slot_num_]; - int64_t *slot_lod_tensor_ptr_[slot_num_]; if (slot_num_ > 0) { - int slot_instance = sage_mode_ == true ? uniq_instance_ : total_instance; - for (int i = 0; i < slot_num_; ++i) { - slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( - {slot_instance * h_slot_feature_num_map_[i], 1}, this->place_); - slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( - {slot_instance + 1}, this->place_); - } - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), - slot_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), - slot_lod_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - } - if (sage_mode_) { - size_t temp_storage_bytes = slot_instance * fea_num_per_node_ * sizeof(uint64_t); - // No need to allocate a new d_feature_buf_ if the old one is enough. 
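The branch above reuses a cached device buffer and reallocates only when the requested size outgrows it, avoiding per-batch allocation churn. A generic version of that pattern with raw CUDA allocation; the struct and method names are made up for illustration, and the patch keeps the buffer as a shared allocation and checks ->size() instead.

#include <cuda_runtime.h>
#include <cstddef>

// Grow-only device scratch buffer: keep the allocation across batches and
// only replace it when a larger size is requested.
struct DeviceScratch {
  void* ptr = nullptr;
  size_t capacity = 0;

  void* Require(size_t bytes) {
    if (ptr == nullptr || capacity < bytes) {
      if (ptr != nullptr) cudaFree(ptr);
      cudaMalloc(&ptr, bytes);
      capacity = bytes;
    }
    return ptr;
  }

  ~DeviceScratch() {
    if (ptr != nullptr) cudaFree(ptr);
  }
};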
- if (d_feature_buf_ == NULL || d_feature_buf_->size() < temp_storage_bytes) { - d_feature_buf_ = memory::AllocShared(place_, temp_storage_bytes); - } - } - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - if (!sage_mode_) { - FillFeatureBuf(ins_cursor, feature_buf, slot_instance); - } else { - uint64_t* sage_nodes_ptr = reinterpret_cast(final_sage_nodes->ptr()); - FillFeatureBuf(sage_nodes_ptr, feature_buf, slot_instance); - } - if (debug_mode_) { - uint64_t h_walk[slot_instance]; - if (!sage_mode_) { - cudaMemcpy(h_walk, - ins_cursor, - slot_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - } else { - uint64_t* sage_nodes_ptr = reinterpret_cast(final_sage_nodes->ptr()); - cudaMemcpy(h_walk, - sage_nodes_ptr, - slot_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - } - uint64_t h_feature[slot_instance * fea_num_per_node_]; - cudaMemcpy(h_feature, - feature_buf, - slot_instance * fea_num_per_node_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < slot_instance; ++i) { - std::stringstream ss; - for (int j = 0; j < fea_num_per_node_; ++j) { - ss << h_feature[i * fea_num_per_node_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * fea_num_per_node_ << ".." << (i + 1) * fea_num_per_node_ - << "] = " << ss.str(); - } - } - GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), - feature_buf, - slot_instance * fea_num_per_node_, - slot_instance, - slot_num_, - (int*)d_slot_feature_num_map_->ptr(), - fea_num_per_node_, - (int*)d_actual_slot_id_map_->ptr(), - (int*)d_fea_offset_map_->ptr()); - GraphFillSlotLodKernelOpt<<>>( - (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), - (slot_instance + 1) * slot_num_, - slot_instance + 1, - (int*)d_slot_feature_num_map_->ptr()); - } else { - for (int i = 0; i < slot_num_; ++i) { - int feature_buf_offset = - (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; - for (int j = 0; j < total_instance; j += 2) { - VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ << "]"; - VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ + 1 << "]"; - cudaMemcpyAsync(slot_tensor_ptr_[i] + j, - &feature_buf[feature_buf_offset + j * slot_num_], - sizeof(uint64_t) * 2, - cudaMemcpyDeviceToDevice, - stream_); - } - GraphFillSlotLodKernel<<>>(slot_lod_tensor_ptr_[i], - total_instance + 1); - } - } + FillGraphSlotFeature(total_instance, gpu_graph_training_); } - offset_.clear(); offset_.push_back(0); - if (!sage_mode_) { - offset_.push_back(total_instance); - } else { - offset_.push_back(uniq_instance_); - } + offset_.push_back(total_instance); LoD lod{offset_}; feed_vec_[0]->set_lod(lod); if (slot_num_ > 0) { @@ -1033,35 +602,9 @@ int GraphDataGenerator::GenerateBatch() { } } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(train_stream_); if (!gpu_graph_training_) return 1; ins_buf_pair_len_ -= total_instance / 2; - if (debug_mode_) { - uint64_t h_slot_tensor[fea_num_per_node_][total_instance]; - uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; - for (int i = 0; i < slot_num_; ++i) { - cudaMemcpy(h_slot_tensor[i], - slot_tensor_ptr_[i], - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - int len = total_instance > 5000 ? 
5000 : total_instance; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j - << "] = " << h_slot_tensor[i][j]; - } - - cudaMemcpy(h_slot_lod_tensor[i], - slot_lod_tensor_ptr_[i], - (total_instance + 1) * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - len = total_instance + 1 > 5000 ? 5000 : total_instance + 1; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j - << "] = " << h_slot_lod_tensor[i][j]; - } - } - } - return 1; } @@ -1128,6 +671,66 @@ __global__ void GraphFillFirstStepKernel(int *prefix_sum, } } +__global__ void GetUniqueFeaNum(uint64_t *d_in, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + atomicAdd(&local_num, 1); + } + } + if (i == len - 1) { + atomicAdd(&local_num, 1); + } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(unique_num, local_num); + } +} + +__global__ void UniqueFeature(uint64_t *d_in, + uint64_t *d_out, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_key[CUDA_NUM_THREADS]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + } + if (i == len - 1) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(unique_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} // Fill sample_res to the stepth column of walk void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, @@ -1151,45 +754,50 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + sample_stream_)); + auto d_temp_storage = memory::Alloc( + place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(sample_stream_))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); + sample_stream_)); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); if (step == 1) { - GraphFillFirstStepKernel<<>>( - d_prefix_sum, - d_tmp_sampleidx2row, - walk, - d_start_ids, - len, - walk_degree_, - walk_len_, - d_actual_sample_size, - d_neighbors, - d_sample_keys); + GraphFillFirstStepKernel<<>>(d_prefix_sum, + d_tmp_sampleidx2row, + walk, + d_start_ids, + len, + walk_degree_, + walk_len_, + d_actual_sample_size, + d_neighbors, + d_sample_keys); } else { GraphFillSampleKeysKernel<<>>(d_neighbors, - d_sample_keys, - d_prefix_sum, - d_sampleidx2row, - d_tmp_sampleidx2row, - d_actual_sample_size, - cur_degree, - len); - - GraphDoWalkKernel<<>>( + sample_stream_>>>(d_neighbors, + d_sample_keys, + d_prefix_sum, + d_sampleidx2row, + d_tmp_sampleidx2row, + d_actual_sample_size, + cur_degree, + len); + + GraphDoWalkKernel<<>>( d_neighbors, walk, d_prefix_sum, @@ -1206,7 +814,6 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, int *h_prefix_sum = new int[len + 1]; int *h_actual_size = new 
int[len]; int *h_offset2idx = new int[once_max_sample_keynum]; - uint64_t h_sample_keys[once_max_sample_keynum]; cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, once_max_sample_keynum * sizeof(int), @@ -1225,9 +832,8 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, delete[] h_prefix_sum; delete[] h_actual_size; delete[] h_offset2idx; - delete[] h_sample_keys; } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); cur_sampleidx2row_ = 1 - cur_sampleidx2row_; } @@ -1259,7 +865,105 @@ int GraphDataGenerator::FillFeatureBuf( return ret; } -int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { +// Try to insert keys into the table; returns 0 on success +int GraphDataGenerator::InsertTable( + const unsigned long *d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num) { + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + // enough nodes have been produced; stop sampling + VLOG(2) << "table capacity: " << train_table_cap_ << ", " << h_uniq_node_num + << " used"; + if (h_uniq_node_num + len >= train_table_cap_) { + return 1; + } + table_->insert(d_keys, len, d_uniq_node_num_ptr, sample_stream_); + CUDA_CHECK(cudaStreamSynchronize(sample_stream_)); + return 0; +} + +void GraphDataGenerator::DoWalk() { + int device_id = place_.GetDeviceId(); + debug_gpu_memory_info(device_id, "DoWalk start"); + if (gpu_graph_training_) { + FillWalkBuf(); + } else { + FillInferBuf(); + } + debug_gpu_memory_info(device_id, "DoWalk end"); +} + +void GraphDataGenerator::clear_gpu_mem() { + d_len_per_row_.reset(); + d_sample_keys_.reset(); + d_prefix_sum_.reset(); + for (size_t i = 0; i < d_sampleidx2rows_.size(); i++) { + d_sampleidx2rows_[i].reset(); + } + delete table_; +} + +int GraphDataGenerator::FillInferBuf() { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &global_infer_node_type_start = + gpu_graph_ptr->global_infer_node_type_start_[gpuid_]; + auto &infer_cursor = gpu_graph_ptr->infer_cursor_[thread_id_]; + total_row_ = 0; + if (infer_cursor < h_device_keys_len_.size()) { + if (global_infer_node_type_start[infer_cursor] >= + h_device_keys_len_[infer_cursor]) { + infer_cursor++; + if (infer_cursor >= h_device_keys_len_.size()) { + return 0; + } + } + size_t device_key_size = h_device_keys_len_[infer_cursor]; + total_row_ = + (global_infer_node_type_start[infer_cursor] + infer_table_cap_ <= + device_key_size) + ?
infer_table_cap_ + : device_key_size - global_infer_node_type_start[infer_cursor]; + + host_vec_.resize(total_row_); + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[infer_cursor]->ptr()); + cudaMemcpyAsync(host_vec_.data(), + d_type_keys + global_infer_node_type_start[infer_cursor], + sizeof(uint64_t) * total_row_, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(1) << "cursor: " << infer_cursor + << " start: " << global_infer_node_type_start[infer_cursor] + << " num: " << total_row_; + infer_node_start_ = global_infer_node_type_start[infer_cursor]; + global_infer_node_type_start[infer_cursor] += total_row_; + infer_node_end_ = global_infer_node_type_start[infer_cursor]; + cursor_ = infer_cursor; + } + return 0; +} + +void GraphDataGenerator::ClearSampleState() { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + finish_node_type.clear(); + for (auto iter = node_type_start.begin(); iter != node_type_start.end(); iter++) { + iter->second = 0; + } +} + +int GraphDataGenerator::FillWalkBuf() { platform::CUDADeviceGuard guard(gpuid_); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; //////// @@ -1277,30 +981,42 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { } /////// auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = reinterpret_cast(d_walk->ptr()); + uint64_t *walk = reinterpret_cast(d_walk_->ptr()); int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); - cudaMemsetAsync( - len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); + // cudaMemsetAsync( + // len_per_row, 0, once_max_sample_keynum * sizeof(int), sample_stream_); + int sample_times = 0; int i = 0; - int total_row = 0; - size_t node_type_len = first_node_type_.size(); + total_row_ = 0; + + // fetch the global sampling state + auto &first_node_type = gpu_graph_ptr->first_node_type_; + auto &meta_path = gpu_graph_ptr->meta_path_; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); + auto &cursor = gpu_graph_ptr->cursor_[thread_id_]; + size_t node_type_len = first_node_type.size(); int remain_size = buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + int total_samples = 0; while (i <= remain_size) { - int cur_node_idx = cursor_ % node_type_len; - int node_type = first_node_type_[cur_node_idx]; - auto &path = meta_path_[cur_node_idx]; - size_t start = node_type_start_[node_type]; + int cur_node_idx = cursor % node_type_len; + int node_type = first_node_type[cur_node_idx]; + auto &path = meta_path[cur_node_idx]; + size_t start = node_type_start[node_type]; + VLOG(2) << "cur_node_idx = " << cur_node_idx + << " meta_path.size = " << meta_path.size(); // auto node_query_result = gpu_graph_ptr->query_node_list( - // gpuid_, node_type, start, once_sample_startid_len_); + // gpuid_, node_type, start, once_sample_startid_len_); // int tmp_len = node_query_result.actual_sample_size; VLOG(2) << "choose start type: " << node_type; - int type_index = type_to_index_[node_type]; - size_t device_key_size =
h_device_keys_[type_index]->size(); + int type_index = type_to_index[node_type]; + size_t device_key_size = h_device_keys_len_[type_index]; VLOG(2) << "type: " << node_type << " size: " << device_key_size << " start: " << start; uint64_t *d_type_keys = @@ -1308,21 +1024,19 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int tmp_len = start + once_sample_startid_len_ > device_key_size ? device_key_size - start : once_sample_startid_len_; - node_type_start_[node_type] = tmp_len + start; + bool update = true; if (tmp_len == 0) { - finish_node_type_.insert(node_type); - if (finish_node_type_.size() == node_type_start_.size()) { + finish_node_type.insert(node_type); + if (finish_node_type.size() == node_type_start.size()) { + cursor = 0; + epoch_finish_ = true; break; } - cursor_ += 1; + cursor += 1; continue; } - // if (tmp_len == 0) { - // break; - //} - VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ - << " tmp_len = " << tmp_len << " cursor = " << cursor_ - << " once_max_sample_keynum = " << once_max_sample_keynum; + + VLOG(2) << "gpuid = " << gpuid_ << " path[0] = " << path[0]; uint64_t *cur_walk = walk + i; NeighborSampleQuery q; @@ -1336,6 +1050,30 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int step = 1; VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; jump_rows_ = sample_res.total_sample_size; + total_samples += sample_res.total_sample_size; + VLOG(2) << "i = " << i << " start = " << start << " tmp_len = " << tmp_len + << " cursor = " << node_type << " cur_node_idx = " << cur_node_idx + << " jump row: " << jump_rows_; + VLOG(2) << "jump_row: " << jump_rows_; + if (jump_rows_ == 0) { + node_type_start[node_type] = tmp_len + start; + cursor += 1; + continue; + } + + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(d_type_keys + start, tmp_len, d_uniq_node_num_) != 0) { + VLOG(2) << "in step 0, insert key stage, table is full"; + update = false; + break; + } + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step 0, insert sample res stage, table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, tmp_len, @@ -1343,7 +1081,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { walk_degree_, step, len_per_row); - VLOG(2) << "jump_row: " << jump_rows_; ///////// if (debug_mode_) { cudaMemcpy( @@ -1352,11 +1089,16 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << tmp_len + << " sample_res_len=" << sample_res.total_sample_size; + ///////// step++; size_t path_len = path.size(); for (; step < walk_len_; step++) { if (sample_res.total_sample_size == 0) { + VLOG(2) << "sample finish, step=" << step; break; } auto sample_key_mem = sample_res.actual_val_mem; @@ -1369,9 +1111,17 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { (uint64_t)sample_keys_ptr, 1, sample_res.total_sample_size); - int sample_key_len = sample_res.total_sample_size; + int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false, true); - + total_samples += sample_res.total_sample_size; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step: " << step << ", table is full"; + update = false; + 
break; + } + } FillOneStep(d_type_keys + start, cur_walk, sample_key_len, @@ -1386,34 +1136,44 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << sample_key_len + << " sample_res_len=" << sample_res.total_sample_size; + } + // now update the global sampling state for this chunk + if (update == true) { + node_type_start[node_type] = tmp_len + start; + i += jump_rows_ * walk_len_; + total_row_ += jump_rows_; + cursor += 1; + sample_times++; + } else { + VLOG(2) << "table is full, not update stat!"; + break; } - // cursor_ += tmp_len; - i += jump_rows_ * walk_len_; - total_row += jump_rows_; - cursor_ += 1; } - buf_state_.Reset(total_row); + buf_state_.Reset(total_row_); int *d_random_row = reinterpret_cast(d_random_row_->ptr()); thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = thrust::cuda::par.on(stream_); + const auto &exec_policy = thrust::cuda::par.on(sample_stream_); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row, + cnt_iter + total_row_, thrust::device_pointer_cast(d_random_row), engine); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); shuffle_seed_ = engine(); if (debug_mode_) { - int *h_random_row = new int[total_row + 10]; + int *h_random_row = new int[total_row_ + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row * sizeof(int), + total_row_ * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row; xx++) { + for (int xx = 0; xx < total_row_; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; @@ -1423,101 +1183,162 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { delete[] h_len_per_row; delete[] h_prefix_sum; } - return total_row != 0; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + // table_->prefetch(cudaCpuDeviceId, sample_stream_); + // thrust::pair *kv = table_->data(); + // size_t size = table_->size(); + // uint64_t unused_key = std::numeric_limits::max(); + // for (size_t i = 0; i < size; i++) { + // if (kv[i].first == unused_key) { + // continue; + // } + // host_vec_.push_back(kv[i].first); + // } + + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num = + reinterpret_cast(d_uniq_node_num_->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "h_uniq_node_num: " << h_uniq_node_num; + // temporary device memory that holds the de-duplicated node ids + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = + reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + // uint64_t unused_key = std::numeric_limits::max(); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + + host_vec_.resize(h_uniq_node_num); + cudaMemcpyAsync(host_vec_.data(), + d_uniq_node_ptr, + sizeof(uint64_t) * h_uniq_node_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + + VLOG(0) << "sample_times:" << sample_times + << ",
d_walk_size:" << buf_size_ + << ", d_walk_offset:" << i + << ", total_rows:" << total_row_ + << ", total_samples:" << total_samples + << ", h_uniq_node_num:" << h_uniq_node_num; + } + return total_row_ != 0; } -void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, - std::vector feed_vec) { - place_ = place; - gpuid_ = place_.GetDeviceId(); - VLOG(3) << "gpuid " << gpuid_; - stream_ = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); +void GraphDataGenerator::SetFeedVec(std::vector feed_vec) { feed_vec_ = feed_vec; - if (!sage_mode_) { - slot_num_ = (feed_vec_.size() - 3) / 2; - } else { - slot_num_ = (feed_vec_.size() - 4 - samples_.size() * 5) / 2; - } - +} +void GraphDataGenerator::AllocResource(int thread_id, + std::vector feed_vec) { auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - h_slot_feature_num_map_ = gpu_graph_ptr->slot_feature_num_map(); - fea_num_per_node_ = 0; - for (int i = 0; i < slot_num_; ++i) { - fea_num_per_node_ += h_slot_feature_num_map_[i]; + gpuid_ = gpu_graph_ptr->device_id_mapping[thread_id]; + thread_id_ = thread_id; + place_ = platform::CUDAPlace(gpuid_); + debug_gpu_memory_info(gpuid_, "AllocResource start"); + + platform::CUDADeviceGuard guard(gpuid_); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + table_ = new HashTable( + train_table_cap_ / FLAGS_gpugraph_hbm_table_load_factor); } - std::vector h_actual_slot_id_map, h_fea_offset_map; - h_actual_slot_id_map.resize(fea_num_per_node_); - h_fea_offset_map.resize(fea_num_per_node_); - for (int slot_id = 0, fea_idx = 0; slot_id < slot_num_; ++slot_id) { - for (int j = 0; j < h_slot_feature_num_map_[slot_id]; ++j, ++fea_idx) { - h_actual_slot_id_map[fea_idx] = slot_id; - h_fea_offset_map[fea_idx] = j; - } + VLOG(1) << "AllocResource gpuid " << gpuid_ + << " feed_vec.size: " << feed_vec.size() + << " table cap: " << train_table_cap_; + sample_stream_ = gpu_graph_ptr->get_local_stream(gpuid_); + train_stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + // feed_vec_ = feed_vec; + slot_num_ = (feed_vec.size() - 3) / 2; + + // infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); + // for (size_t i = 0; i < h_device_keys_.size(); i++) { + // for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + // VLOG(3) << "h_device_keys_[" << i << "][" << j + // << "] = " << (*(h_device_keys_[i]))[j]; + // } + // auto buf = memory::AllocShared( + // place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + // d_device_keys_.push_back(buf); + // CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), + // h_device_keys_[i]->data(), + // h_device_keys_[i]->size() * sizeof(uint64_t), + // cudaMemcpyHostToDevice, + // stream_)); + // } + auto &d_graph_all_type_keys = gpu_graph_ptr->d_graph_all_type_total_keys_; + auto &h_graph_all_type_keys_len = gpu_graph_ptr->h_graph_all_type_keys_len_; + + for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { + d_device_keys_.push_back(d_graph_all_type_keys[i][thread_id]); + h_device_keys_len_.push_back(h_graph_all_type_keys_len[i][thread_id]); } + VLOG(2) << "h_device_keys size: " << h_device_keys_len_.size(); + - d_slot_feature_num_map_ = memory::Alloc(place, slot_num_ * sizeof(int)); - cudaMemcpy(d_slot_feature_num_map_->ptr(), h_slot_feature_num_map_.data(), - sizeof(int) * slot_num_, cudaMemcpyHostToDevice); - d_actual_slot_id_map_ = memory::Alloc(place, fea_num_per_node_ * sizeof(int)); - cudaMemcpy(d_actual_slot_id_map_->ptr(), h_actual_slot_id_map.data(), - 
sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); - d_fea_offset_map_ = memory::Alloc(place, fea_num_per_node_ * sizeof(int)); - cudaMemcpy(d_fea_offset_map_->ptr(), h_fea_offset_map.data(), - sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); - // d_device_keys_.resize(h_device_keys_.size()); - VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); - infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); - for (size_t i = 0; i < h_device_keys_.size(); i++) { - for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { - VLOG(3) << "h_device_keys_[" << i << "][" << j - << "] = " << (*(h_device_keys_[i]))[j]; - } - auto buf = memory::AllocShared( - place_, h_device_keys_[i]->size() * sizeof(uint64_t)); - d_device_keys_.push_back(buf); - CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), - h_device_keys_[i]->data(), - h_device_keys_[i]->size() * sizeof(uint64_t), - cudaMemcpyHostToDevice, - stream_)); - } - // h_device_keys_ = h_device_keys; - // device_key_size_ = h_device_keys_->size(); - // d_device_keys_ = - // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); - // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), - // device_key_size_ * sizeof(int64_t), - // cudaMemcpyHostToDevice, stream_)); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; - d_prefix_sum_ = - memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + d_prefix_sum_ = memory::AllocShared( + place_, + (once_max_sample_keynum + 1) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); - cudaMemsetAsync( - d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_); + cudaMemsetAsync(d_prefix_sum_ptr, + 0, + (once_max_sample_keynum + 1) * sizeof(int), + sample_stream_); cursor_ = 0; jump_rows_ = 0; - d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); - cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_feature_ = - memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); - cudaMemsetAsync( - d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - } - d_sample_keys_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + d_uniq_node_num_ = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync(d_uniq_node_num_->ptr(), 0, sizeof(uint64_t), sample_stream_); + + d_walk_ = memory::AllocShared( + place_, + buf_size_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), sample_stream_); + d_sample_keys_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); cur_sampleidx2row_ = 0; - d_len_per_row_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + d_len_per_row_ = 
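// The buffers created in this function now follow a stream-bound allocation pattern:
// each one is tied to sample_stream_ through phi::Stream, so the allocator can reuse and
// free it in stream order instead of synchronising the default compute stream. A minimal
// sketch of that call (the phi::StreamId cast target is an assumption based on how this
// idiom is normally spelled out in Paddle, since the cast type is abbreviated in this hunk):
//
//   auto buf = memory::AllocShared(
//       place_,
//       num_bytes,
//       phi::Stream(reinterpret_cast<phi::StreamId>(sample_stream_)));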
memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); for (int i = -window_; i < 0; i++) { window_step_.push_back(i); } @@ -1527,52 +1348,58 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, buf_state_.Init(batch_size_, walk_len_, &window_step_); d_random_row_ = memory::AllocShared( place_, - (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); shuffle_seed_ = 0; ins_buf_pair_len_ = 0; d_ins_buf_ = memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t)); - if (slot_num_ > 0) { - if (!sage_mode_) { - d_feature_buf_ = memory::AllocShared( - place_, (batch_size_ * 2 * 2) * fea_num_per_node_ * sizeof(uint64_t)); - } else { - d_feature_buf_ = NULL; - } - } d_pair_num_ = memory::AllocShared(place_, sizeof(int)); - if (FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_slot_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - d_slot_lod_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - } - if (sage_mode_) { - reindex_table_size_ = batch_size_ * 2; - // get hashtable size - for (int i = 0; i < samples_.size(); i++) { - reindex_table_size_ *= (samples_[i] * edge_to_id_len_ + 1); - } - int64_t next_pow2 = - 1 << static_cast(1 + std::log2(reindex_table_size_ >> 1)); - reindex_table_size_ = next_pow2 << 1; - - d_reindex_table_key_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int64_t)); - d_reindex_table_value_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int)); - d_reindex_table_index_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int)); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - edge_type_graph_ = - gpu_graph_ptr->get_edge_type_graph(gpuid_, edge_to_id_len_); - } + d_slot_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + d_slot_lod_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + + cudaStreamSynchronize(sample_stream_); - cudaStreamSynchronize(stream_); + debug_gpu_memory_info(gpuid_, "AllocResource end"); } +void GraphDataGenerator::AllocTrainResource(int thread_id) { + if (slot_num_ > 0) { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + h_slot_feature_num_map_ = gpu_graph_ptr->slot_feature_num_map(); + fea_num_per_node_ = 0; + for (int i = 0; i < slot_num_; ++i) { + fea_num_per_node_ += h_slot_feature_num_map_[i]; + } + std::vector h_actual_slot_id_map, h_fea_offset_map; + h_actual_slot_id_map.resize(fea_num_per_node_); + h_fea_offset_map.resize(fea_num_per_node_); + for (int slot_id = 0, fea_idx = 0; slot_id < slot_num_; ++slot_id) { + for (int j = 0; j < h_slot_feature_num_map_[slot_id]; ++j, ++fea_idx) { + h_actual_slot_id_map[fea_idx] = slot_id; + h_fea_offset_map[fea_idx] = j; + } + } + + d_slot_feature_num_map_ = memory::Alloc(place_, slot_num_ * sizeof(int)); + cudaMemcpy(d_slot_feature_num_map_->ptr(), h_slot_feature_num_map_.data(), + sizeof(int) * slot_num_, cudaMemcpyHostToDevice); + d_actual_slot_id_map_ = memory::Alloc(place_, fea_num_per_node_ * sizeof(int)); + cudaMemcpy(d_actual_slot_id_map_->ptr(), h_actual_slot_id_map.data(), + sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); + d_fea_offset_map_ = memory::Alloc(place_, fea_num_per_node_ * sizeof(int)); + cudaMemcpy(d_fea_offset_map_->ptr(), 
h_fea_offset_map.data(), + sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); + d_feature_buf_ = memory::AllocShared( + place_, (batch_size_ * 2 * 2) * fea_num_per_node_ * sizeof(uint64_t)); + } +} + void GraphDataGenerator::SetConfig( const paddle::framework::DataFeedDesc &data_feed_desc) { auto graph_config = data_feed_desc.graph_config(); @@ -1590,58 +1417,22 @@ void GraphDataGenerator::SetConfig( repeat_time_ = graph_config.sample_times_one_chunk(); buf_size_ = once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; - VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + train_table_cap_ = graph_config.train_table_cap(); + infer_table_cap_ = graph_config.infer_table_cap(); + epoch_finish_ = false; + VLOG(0) << "Confirm GraphConfig, walk_degree : " << walk_degree_ << ", walk_len : " << walk_len_ << ", window : " << window_ << ", once_sample_startid_len : " << once_sample_startid_len_ << ", sample_times_one_chunk : " << repeat_time_ - << ", batch_size: " << batch_size_; + << ", batch_size: " << batch_size_ + << ", train_table_cap: " << train_table_cap_ + << ", infer_table_cap: " << infer_table_cap_; std::string first_node_type = graph_config.first_node_type(); std::string meta_path = graph_config.meta_path(); - sage_mode_ = graph_config.sage_mode(); - std::string str_samples = graph_config.samples(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - edge_to_id_len_ = edge_to_id.size(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - // parse first_node_type - auto node_types = - paddle::string::split_string(first_node_type, ";"); - VLOG(2) << "node_types: " << first_node_type; - finish_node_type_.clear(); - node_type_start_.clear(); - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - PADDLE_ENFORCE_NE( - iter, - node_to_id.end(), - platform::errors::NotFound("(%s) is not found in node_to_id.", type)); - VLOG(2) << "node_to_id[" << type << "] = " << iter->second; - first_node_type_.push_back(iter->second); - node_type_start_[iter->second] = 0; - } - meta_path_.resize(first_node_type_.size()); - auto meta_paths = paddle::string::split_string(meta_path, ";"); - - for (size_t i = 0; i < meta_paths.size(); i++) { - auto path = meta_paths[i]; - auto nodes = paddle::string::split_string(path, "-"); - for (auto &node : nodes) { - auto iter = edge_to_id.find(node); - PADDLE_ENFORCE_NE( - iter, - edge_to_id.end(), - platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); - VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; - meta_path_[i].push_back(iter->second); - } - } - - auto samples = paddle::string::split_string(str_samples, ";"); - for (size_t i = 0; i < samples.size(); i++) { - int sample_size = std::stoi(samples[i]); - samples_.emplace_back(sample_size); - } + debug_gpu_memory_info("init_conf start"); + gpu_graph_ptr->init_conf(first_node_type, meta_path); + debug_gpu_memory_info("init_conf end"); }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 4598788c0a7d85..dff08972a3f684 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
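For reference, the GraphConfig consumed by SetConfig() above gains two capacity fields later in this patch (train_table_cap and infer_table_cap in data_feed.proto, both defaulting to 80000). A text-format graph_config block might look roughly like the sketch below; the field names are inferred from SetConfig() and the proto hunk, while every value is purely illustrative:

  graph_config {
    walk_degree: 1
    walk_len: 24
    window: 3
    once_sample_startid_len: 8000
    sample_times_one_chunk: 10
    first_node_type: "u"
    meta_path: "u2u-u2u"
    gpu_graph_training: true
    train_table_cap: 80000000
    infer_table_cap: 80000000
  }

With these illustrative numbers, the walk buffer sized in SetConfig() holds once_sample_startid_len * walk_len * walk_degree * sample_times_one_chunk = 8000 * 24 * 1 * 10 = 1,920,000 uint64 node ids per generator, and a training pass stops sampling early once roughly train_table_cap distinct node ids have been inserted into the per-GPU hash table (see InsertTable() above).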
@@ -60,6 +60,8 @@ class Scope; class Variable; class NeighborSampleResult; class NodeQueryResult; +template +class HashTable; } // namespace framework } // namespace paddle @@ -878,6 +880,9 @@ struct BufState { int GetNextBatch() { cursor += len; + if (row_num - cursor < 0) { + return 0; + } int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; if (tmp_len == 0) { return 0; @@ -895,11 +900,14 @@ class GraphDataGenerator { GraphDataGenerator(){}; virtual ~GraphDataGenerator(){}; void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); - void AllocResource(const paddle::platform::Place& place, - std::vector feed_vec); + void AllocResource(int thread_id, std::vector feed_vec); + void AllocTrainResource(int thread_id); + void SetFeedVec(std::vector feed_vec); int AcquireInstance(BufState* state); int GenerateBatch(); - int FillWalkBuf(std::shared_ptr d_walk); + int FillWalkBuf(); + int FillInferBuf(); + void DoWalk(); int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num); int FillFeatureBuf(std::shared_ptr d_walk, std::shared_ptr d_feature); @@ -911,34 +919,44 @@ class GraphDataGenerator { int step, int* len_per_row); int FillInsBuf(); + int FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor = 0); + int FillGraphSlotFeature(int total_instance, bool gpu_graph_training); + int MakeInsPair(); + int GetPathNum() { return total_row_; } + void ResetPathNum() {total_row_ = 0; } + void ResetEpochFinish() {epoch_finish_ = false; } + void ClearSampleState(); void SetDeviceKeys(std::vector* device_keys, int type) { - type_to_index_[type] = h_device_keys_.size(); - h_device_keys_.push_back(device_keys); + // type_to_index_[type] = h_device_keys_.size(); + // h_device_keys_.push_back(device_keys); } - std::vector> SampleNeighbors( - int64_t* uniq_nodes, int len, int sample_size, - std::vector& edges_split_num, int64_t* neighbor_len); + std::vector> SampleNeighbors( + int64_t* uniq_nodes, int len, int sample_size, + std::vector& edges_split_num, int64_t* neighbor_len); std::shared_ptr GetReindexResult( - int64_t* reindex_src_data, const int64_t* center_nodes, - int* final_nodes_len, int node_len, int64_t neighbor_len); - + int64_t* reindex_src_data, const int64_t* center_nodes, + int* final_nodes_len, int node_len, int64_t neighbor_len); std::shared_ptr GenerateSampleGraph( - uint64_t* node_ids, int len, int* uniq_len, phi::DenseTensor* inverse); + uint64_t* node_ids, int len, int* uniq_len, phi::DenseTensor* inverse); + int InsertTable(const unsigned long* d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num); + std::vector& GetHostVec() { return host_vec_; } + bool get_epoch_finish() {return epoch_finish_; } + void clear_gpu_mem(); protected: + HashTable* table_; int walk_degree_; int walk_len_; int window_; int once_sample_startid_len_; int gpuid_; - // start ids - // int64_t* device_keys_; - // size_t device_key_size_; - std::vector*> h_device_keys_; - std::unordered_map type_to_index_; - // point to device_keys_ size_t cursor_; + int thread_id_; size_t jump_rows_; int edge_to_id_len_; int uniq_instance_; @@ -947,7 +965,8 @@ class GraphDataGenerator { int64_t* show_tensor_ptr_; int64_t* clk_tensor_ptr_; - cudaStream_t stream_; + cudaStream_t train_stream_; + cudaStream_t sample_stream_; paddle::platform::Place place_; std::vector feed_vec_; std::vector offset_; @@ -955,23 +974,21 @@ class GraphDataGenerator { std::vector> d_device_keys_; std::shared_ptr d_walk_; + std::shared_ptr d_feature_list_; 
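  // Rough lifecycle of this generator as wired up elsewhere in the patch (one instance per
  // reader thread; the ordering is a sketch of how data_set.cc drives it, not a contract
  // declared in this header):
  //
  //   SetConfig(data_feed_desc);            // parse GraphConfig, including the table caps
  //   AllocResource(thread_id, feed_vec);   // bind gpuid_, the streams and the hash table
  //   DoWalk();                             // FillWalkBuf() when training, FillInferBuf() otherwise
  //   GetHostVec();                         // de-duplicated node ids copied back to the host
  //   get_epoch_finish();                   // true once every start node type is exhausted
  //   clear_gpu_mem();                      // drop sampler buffers outside WHOLE_HBM storage mode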
std::shared_ptr d_feature_; std::shared_ptr d_len_per_row_; std::shared_ptr d_random_row_; + std::shared_ptr d_uniq_node_num_; std::shared_ptr d_slot_feature_num_map_; std::shared_ptr d_actual_slot_id_map_; std::shared_ptr d_fea_offset_map_; - // + std::vector> d_sampleidx2rows_; int cur_sampleidx2row_; // record the keys to call graph_neighbor_sample std::shared_ptr d_sample_keys_; int sample_keys_len_; - std::set finish_node_type_; - std::unordered_map node_type_start_; - std::vector infer_node_type_start_; - std::shared_ptr d_ins_buf_; std::shared_ptr d_feature_buf_; std::shared_ptr d_pair_num_; @@ -994,11 +1011,17 @@ class GraphDataGenerator { int fea_num_per_node_; int shuffle_seed_; int debug_mode_; - std::vector first_node_type_; - std::vector> meta_path_; bool gpu_graph_training_; bool sage_mode_; std::vector samples_; + bool epoch_finish_; + std::vector host_vec_; + std::vector h_device_keys_len_; + uint64_t train_table_cap_; + uint64_t infer_table_cap_; + int total_row_; + size_t infer_node_start_; + size_t infer_node_end_; }; class DataFeed { @@ -1063,11 +1086,30 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void SetCurrentPhase(int current_phase) {} + virtual void InitGraphResource() {} + virtual void InitGraphTrainResource() {} virtual void SetDeviceKeys(std::vector* device_keys, int type) { #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + virtual const std::vector& GetHostVec() { + return gpu_graph_data_generator_.GetHostVec(); + } +#endif + + virtual void clear_gpu_mem() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.clear_gpu_mem(); +#endif + } + virtual bool get_epoch_finish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.get_epoch_finish(); +#endif + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { gpu_graph_mode_ = gpu_graph_mode; } @@ -1084,11 +1126,40 @@ class DataFeed { return ins_content_vec_; } virtual int GetCurBatchSize() { return batch_size_; } + virtual int GetGraphPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.GetPathNum(); +#else + return 0; +#endif + } + virtual void ResetPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetPathNum(); +#endif + } + + virtual void ClearSampleState() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ClearSampleState(); +#endif + } + + virtual void ResetEpochFinish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetEpochFinish(); +#endif +} + virtual bool IsTrainMode() { return train_mode_; } virtual void LoadIntoMemory() { PADDLE_THROW(platform::errors::Unimplemented( "This function(LoadIntoMemory) is not implemented.")); } + virtual void DoWalk() { + PADDLE_THROW(platform::errors::Unimplemented( + "This function(DoWalk) is not implemented.")); + } virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; } @@ -1663,6 +1734,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { // CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) {} + virtual void InitGraphResource(void); + virtual void 
InitGraphTrainResource(void); virtual void LoadIntoMemoryByCommand(void); virtual void LoadIntoMemoryByLib(void); virtual void LoadIntoMemoryByLine(void); @@ -1697,6 +1770,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { const int float_slot_size, const UsedSlotGpuType* used_slots); #endif + virtual void DoWalk(); + float sample_rate_ = 1.0f; int use_slot_size_ = 0; int float_use_slot_size_ = 0; diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 9726880d64ab5c..25610eea237813 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -40,6 +40,8 @@ message GraphConfig { optional bool gpu_graph_training = 10 [ default = true ]; optional bool sage_mode = 11 [ default = false ]; optional string samples = 12; + optional int64 train_table_cap = 13 [ default = 80000 ]; + optional int64 infer_table_cap = 14 [ default = 80000 ]; } message DataFeedDesc { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1d70ef6a1c78b0..e22136fdaf7079 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -36,7 +36,9 @@ #endif USE_INT_STAT(STAT_total_feasign_num_in_mem); +USE_INT_STAT(STAT_epoch_finish); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -446,18 +448,6 @@ void MultiSlotDataset::PrepareTrain() { return; } -template -void DatasetImpl::SetGraphDeviceKeys( - const std::vector& h_device_keys) { - // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { - // gpu_graph_device_keys_[i].clear(); - // } - // size_t device_num = gpu_graph_device_keys_.size(); - // for (size_t i = 0; i < h_device_keys.size(); i++) { - // int shard = h_device_keys[i] % device_num; - // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); - // } -} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -469,63 +459,54 @@ void DatasetImpl::LoadIntoMemory() { if (gpu_graph_mode_) { VLOG(0) << "in gpu_graph_mode"; #ifdef PADDLE_WITH_HETERPS - graph_all_type_total_keys_.clear(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - auto edge_to_id = gpu_graph_ptr->edge_to_id; - graph_all_type_total_keys_.resize(node_to_id.size()); - int cnt = 0; - // set sample start node - for (auto& iter : node_to_id) { - int node_idx = iter.second; - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_all_id( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - auto& type_total_key = graph_all_type_total_keys_[cnt]; - type_total_key.resize(thread_num_); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i - << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - type_total_key[i].push_back(gpu_graph_device_keys[i][j]); - } - } - + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + + if (STAT_GET(STAT_epoch_finish) == 1) { + VLOG(0) << "get epoch finish true"; + STAT_RESET(STAT_epoch_finish, 0); for (size_t i = 0; i < readers_.size(); i++) { - readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); - readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + readers_[i]->ResetPathNum(); + readers_[i]->ResetEpochFinish(); } - cnt++; + return; } - // add node embedding id - std::vector> 
gpu_graph_device_keys; - gpu_graph_ptr->get_node_embedding_ids(thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back( + std::thread(&paddle::framework::DataFeed::DoWalk, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + uint64_t node_num = 0; + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + node_num += host_vec.size(); + } + gpu_graph_total_keys_.reserve(node_num); + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + for (size_t j = 0; j < host_vec.size(); j++) { + gpu_graph_total_keys_.push_back(host_vec[j]); } } - // add feature embedding id - VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size[" - << gpu_graph_total_keys_.size() << "]"; - for (auto& iter : node_to_id) { - std::vector> gpu_graph_device_keys; - int node_idx = iter.second; - gpu_graph_ptr->get_all_feature_ids( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); - } - VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); + if (GetEpochFinish() == true) { + VLOG(0) << "epoch finish, set stat and clear sample stat!"; + STAT_RESET(STAT_epoch_finish, 1); + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->ClearSampleState(); } } - VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size[" + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->clear_gpu_mem(); + } + } + + VLOG(2) << "end add edge into gpu_graph_total_keys_ size[" << gpu_graph_total_keys_.size() << "]"; #endif } else { @@ -1123,7 +1104,26 @@ void DatasetImpl::DestroyPreLoadReaders() { template int64_t DatasetImpl::GetMemoryDataSize() { - return input_channel_->Size(); + if (gpu_graph_mode_) { + int64_t total_path_num = 0; + for (int i = 0; i < thread_num_; i++) { + total_path_num += readers_[i]->GetGraphPathNum(); + } + return total_path_num; + } else { + return input_channel_->Size(); + } +} + +template +bool DatasetImpl::GetEpochFinish() { + bool is_epoch_finish = true; + if (gpu_graph_mode_) { + for (int i = 0; i < thread_num_; i++) { + is_epoch_finish = is_epoch_finish && readers_[i]->get_epoch_finish(); + } + } + return is_epoch_finish; } template @@ -1780,6 +1780,7 @@ void SlotRecordDataset::CreateReaders() { readers_[i]->SetParseLogKey(parse_logkey_); readers_[i]->SetEnablePvMerge(enable_pv_merge_); readers_[i]->SetCurrentPhase(current_phase_); + readers_[i]->InitGraphResource(); if (input_channel_ != nullptr) { readers_[i]->SetInputChannel(input_channel_.get()); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 0489c2ece64e8f..9e1998a35fd649 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -169,6 +169,10 @@ class Dataset { virtual void SetGpuGraphMode(int is_graph_mode) = 0; virtual int GetGpuGraphMode() = 0; + virtual bool 
GetEpochFinish() = 0; + + virtual void SetPassId(uint32_t pass_id) = 0; + virtual uint32_t GetPassID() = 0; protected: virtual int ReceiveFromClient(int msg_type, @@ -253,7 +257,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} - virtual void SetGraphDeviceKeys(const std::vector& h_device_keys); + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys) {} virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -263,11 +267,7 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - /* for enable_heterps_ - virtual void EnableHeterps(bool enable_heterps) { - enable_heterps_ = enable_heterps; - } - */ + virtual bool GetEpochFinish(); std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -280,10 +280,13 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } + Channel& GetInputChannelRef() { return input_channel_; } std::vector& GetGpuGraphTotalKeys() { return gpu_graph_total_keys_; } - Channel& GetInputChannelRef() { return input_channel_; } + + virtual void SetPassId(uint32_t pass_id) { pass_id_ = pass_id; } + virtual uint32_t GetPassID() { return pass_id_; } protected: virtual int ReceiveFromClient(int msg_type, @@ -344,9 +347,9 @@ class DatasetImpl : public Dataset { std::vector use_slots_; bool enable_heterps_ = false; int gpu_graph_mode_ = 0; - // std::vector> gpu_graph_device_keys_; - std::vector>> graph_all_type_total_keys_; + std::vector>> gpu_graph_type_keys_; std::vector gpu_graph_total_keys_; + uint32_t pass_id_ = 0; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 4cf3ab8dc1a67d..bacb096f751b1d 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_HETERPS) nv_library( ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ps_framework_proto graph_gpu_wrapper ${BRPC_DEPS}) else() nv_library( ps_gpu_wrapper diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index ef2e73d6dd5b56..948021582275d4 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -85,7 +85,9 @@ class HeterContext { std::vector> dim_mutex_; int multi_mf_dim_ = 0; + void * sub_graph_feas = NULL; uint32_t shard_num_ = 37; + uint16_t pass_id_ = 0; uint64_t size() { uint64_t total_size = 0; for (auto& keys : feature_keys_) { diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 85bf6bb553b220..160de2646d7d96 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -524,6 +524,7 @@ class concurrent_unordered_map : public managed { __forceinline__ __device__ iterator insert(const value_type& x, aggregation_type op, + uint64_t* local_count = NULL, comparison_type keys_equal = key_equal(), bool precomputed_hash = false, hash_value_type precomputed_hash_value = 0) { @@ -580,6 +581,10 @@ class concurrent_unordered_map : public managed { if 
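// When the new local_count argument is non-NULL, insert() increments it once for every key
// that claims a previously unused slot, i.e. once per distinct key newly added by this call;
// a caller can therefore pass a device-side uint64_t counter here to keep a running count of
// unique inserted keys (presumably how the sampler's d_uniq_node_num is maintained, though
// that wiring sits outside this hunk).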
(m_enable_collision_stat) { atomicAdd(&m_insert_times, 1); } + + if (local_count != NULL && keys_equal(unused_key, old_key)) { + atomicAdd(local_count, 1); + } break; } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index f05fe6c95de0a5..52a02cbfb2d8b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -60,11 +60,11 @@ __global__ void PullDedupCopy(const size_t N, const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, - const int hidden, + const size_t hidden, const int* key2slot, const uint32_t* restore_idx, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; @@ -158,7 +158,7 @@ __global__ void PushMergeCopyAtomic(const size_t N, const uint32_t* d_restore_idx, size_t grad_value_size, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; // filter 0 keys @@ -224,7 +224,7 @@ __global__ void PushMergeCopy(const size_t N, const uint32_t* d_sort_cnt, size_t grad_value_size, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; // filter 0 keys diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index ed62292c6b1dc1..10ffa04485ab19 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -171,6 +171,7 @@ struct NeighborSampleResult { uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; + cudaStream_t stream=0; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; uint64_t *get_val() { return val; } @@ -180,18 +181,30 @@ struct NeighborSampleResult { int get_key_size() { return key_size; } void set_total_sample_size(int s) { total_sample_size = s; } int get_len() { return total_sample_size; } + void set_stream(cudaStream_t stream_t) { + stream = stream_t; + } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + if (stream != 0) { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t), phi::Stream(reinterpret_cast(stream))); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int), phi::Stream(reinterpret_cast(stream))); + } + else { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int)); + } val = (uint64_t *)val_mem->ptr(); - actual_sample_size_mem = - memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } + void display() { VLOG(0) << "in node sample result display ------------------"; int64_t *res = new int64_t[sample_size * key_size]; @@ -364,7 +377,7 @@ struct GpuPsCommGraphFea { uint8_t *slot_id_list; // locate on both side GpuPsFeaInfo *fea_info_list; // only locate on host side, the list of fea_info - uint64_t feature_size, node_size; + uint64_t feature_size, node_size, feature_capacity; // the 
size of feature array and graph_node_list array GpuPsCommGraphFea() : node_list(NULL), @@ -372,7 +385,8 @@ struct GpuPsCommGraphFea { slot_id_list(NULL), fea_info_list(NULL), feature_size(0), - node_size(0) {} + node_size(0), + feature_capacity(0){} GpuPsCommGraphFea(uint64_t *node_list_, uint64_t *feature_list_, uint8_t *slot_id_list_, diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h index 39734cae33fca1..69e743cca977a3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h @@ -13,6 +13,10 @@ // limitations under the License. #pragma once +#include +#include +#include +#include #include #include #include @@ -22,6 +26,38 @@ namespace paddle { namespace framework { +/** + * @brief wrapper of the std::default_random_engine each construction will have different seeds. + */ +struct random_engine_wrapper_t { + std::default_random_engine engine; + random_engine_wrapper_t() { + timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + static std::atomic x(static_cast(1)); + std::seed_seq sseq = {x++, x++, x++, + (unsigned long)(tp.tv_sec * 1e9 + tp.tv_nsec)}; + engine.seed(sseq); + } +}; + +/** + * @brief Get a n-size vector, but its element has unique shuffled int value (from 0 to n-1). + * @param n vector size + * @return the shuffled vector. + */ +inline std::vector shuffle_int_vector(int n) { + random_engine_wrapper_t random_engine_wrapper; + std::vector ret(n); + int i = 0; + + for (auto & e : ret) { + e = i++; + } + std::shuffle(ret.begin(), ret.end(), random_engine_wrapper.engine); + return std::move(ret); +} + #define CUDA_CHECK(cmd) \ do { \ cudaError_t e = cmd; \ diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 4c0ebd11996212..03d7a505302e9f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -113,12 +113,22 @@ class GpuPsGraphTable } } } + device_mutex_.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + device_mutex_[i] = new std::mutex(); + } + } + ~GpuPsGraphTable() { + for (size_t i = 0; i < device_mutex_.size(); ++i) { + delete device_mutex_[i]; + } + device_mutex_.clear(); } - ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); void clear_graph_info(int index); + void reset_feature_info(int gpu_id, size_t capacity, size_t feature_size); void clear_feature_info(int gpu_id, int index); void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, @@ -169,7 +179,10 @@ class GpuPsGraphTable int* actual_sample_size, int edge_type_len, int len); - int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int init_cpu_table(const paddle::distributed::GraphParameter &graph, int gpu_num = 8); + gpuStream_t get_local_stream(int gpu_id) { + return resource_->local_stream(gpu_id, 0); + } int gpu_num; int graph_table_num_, feature_table_num_; @@ -181,6 +194,7 @@ class GpuPsGraphTable std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; + std::vector device_mutex_; std::condition_variable cv_; int cpu_table_status; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu 
b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 4ce99f159322c7..f3e286358c1df9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,8 +15,8 @@ #include #include #include - #include +#include "cub/cub.cuh" #pragma once #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" @@ -244,9 +244,10 @@ __global__ void neighbor_sample_kernel_all_edge_type(GpuPsCommGraph* graphs, } int GpuPsGraphTable::init_cpu_table( - const paddle::distributed::GraphParameter& graph) { + const paddle::distributed::GraphParameter& graph, int gpu_num) { cpu_graph_table_.reset(new paddle::distributed::GraphTable); cpu_table_status = cpu_graph_table_->Initialize(graph); + cpu_graph_table_->init_worker_poll(gpu_num); // if (cpu_table_status != 0) return cpu_table_status; // std::function&)> callback = // [this](std::vector& res) { @@ -521,6 +522,39 @@ void GpuPsGraphTable::clear_feature_info(int gpu_id) { cudaFree(graph.slot_id_list); graph.slot_id_list = NULL; } + graph.feature_capacity = 0; +} + +void GpuPsGraphTable::reset_feature_info(int gpu_id, size_t capacity, size_t feature_size) { + int idx = 0; + if (idx >= feature_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = new Table(capacity); + } + int graph_fea_idx = gpu_id * feature_table_num_ + idx; + if (graph_fea_idx >= gpu_graph_fea_list_.size()) { + return; + } + auto& graph = gpu_graph_fea_list_[graph_fea_idx]; + graph.node_list = NULL; + if (graph.feature_list == NULL) { + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, feature_size * sizeof(uint8_t))); + graph.feature_capacity = feature_size; + } + else if (graph.feature_capacity < feature_size) { + cudaFree(graph.feature_list); + cudaFree(graph.slot_id_list); + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, feature_size * sizeof(uint8_t))); + graph.feature_capacity = feature_size; + } + else { + CUDA_CHECK(cudaMemset(graph.feature_list, 0, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(graph.slot_id_list, 0, feature_size * sizeof(uint8_t))); + } } void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) { @@ -552,20 +586,14 @@ In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, - int gpu_id) { - clear_feature_info(gpu_id); + int gpu_id) { + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + reset_feature_info(gpu_id, capacity, g.feature_size); int ntype_id = 0; - - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - int offset = gpu_id * feature_table_num_ + ntype_id; - gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); - int table_offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); - - size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; - tables_[table_offset] = new Table(capacity); if (g.node_size > 0) { build_ps(gpu_id, g.node_list, @@ -574,51 +602,23 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, 1024, 8, 
table_offset); - gpu_graph_fea_list_[offset].node_list = NULL; gpu_graph_fea_list_[offset].node_size = g.node_size; } else { build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset); - gpu_graph_fea_list_[offset].node_list = NULL; gpu_graph_fea_list_[offset].node_size = 0; } if (g.feature_size) { - // TODO - cudaError_t cudaStatus = - cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, - g.feature_size * sizeof(uint64_t)); - PADDLE_ENFORCE_EQ( - cudaStatus, - cudaSuccess, - platform::errors::InvalidArgument( - "ailed to allocate memory for graph-feature on gpu ")); - VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t) - << " bytes of memory for graph-feature on gpu " - << resource_->dev_id(gpu_id); CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, g.feature_list, g.feature_size * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - // TODO - cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, - g.feature_size * sizeof(uint8_t)); - PADDLE_ENFORCE_EQ( - cudaStatus, - cudaSuccess, - platform::errors::InvalidArgument( - "ailed to allocate memory for graph-feature on gpu ")); - VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t) - << " bytes of memory for graph-feature on gpu " - << resource_->dev_id(gpu_id); - cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, + CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, g.slot_id_list, g.feature_size * sizeof(uint8_t), - cudaMemcpyHostToDevice); + cudaMemcpyHostToDevice)); gpu_graph_fea_list_[offset].feature_size = g.feature_size; } else { - gpu_graph_fea_list_[offset].feature_list = NULL; - gpu_graph_fea_list_[offset].slot_id_list = NULL; gpu_graph_fea_list_[offset].feature_size = 0; } VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is " @@ -870,6 +870,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( bool compress) { NeighborSampleResult result; + auto stream = resource_->local_stream(gpu_id, 0); + result.set_stream(stream); result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { @@ -882,15 +884,20 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int* actual_sample_size = result.actual_sample_size; uint64_t* val = result.val; int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; int h_left[total_gpu]; // NOLINT int h_right[total_gpu]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); int default_value = 0; @@ -901,15 +908,26 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, len * sizeof(int)); + auto d_idx = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + len * sizeof(uint64_t), + 
phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + memory::Alloc(place, + sample_size * len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + auto d_shard_actual_sample_size = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -921,10 +939,18 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMemcpy( - h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy( - h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_left, + d_left_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_right, + d_right_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -995,6 +1021,16 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( h_right, d_shard_vals_ptr, d_shard_actual_sample_size_ptr); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + device_mutex_[gpu_id]->unlock(); + fill_dvalues<<>>( d_shard_vals_ptr, val, @@ -1100,52 +1136,57 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( } if (compress) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); size_t temp_storage_bytes = 0; int total_sample_size = 0; - auto cumsum_actual_sample_size = memory::Alloc(place, (len + 1) * sizeof(int)); - int* cumsum_actual_sample_size_ptr = + auto cumsum_actual_sample_size = + memory::Alloc(place, + (len + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* cumsum_actual_sample_size_p = reinterpret_cast(cumsum_actual_sample_size->ptr()); - CUDA_CHECK(cudaMemsetAsync(cumsum_actual_sample_size_ptr, 0, sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(cumsum_actual_sample_size_p, 0, sizeof(int), stream)); CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, - temp_storage_bytes, - actual_sample_size, - cumsum_actual_sample_size_ptr + 1, - len, - stream)); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, actual_sample_size, - cumsum_actual_sample_size_ptr + 1, + cumsum_actual_sample_size_p + 1, len, stream)); CUDA_CHECK(cudaMemcpyAsync(&total_sample_size, - cumsum_actual_sample_size_ptr + len, + cumsum_actual_sample_size_p + len, sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + result.actual_val_mem = 
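The compress branch above sizes the compacted neighbor buffer by running cub::DeviceScan::InclusiveSum over actual_sample_size into cumsum_actual_sample_size_p + 1 (slot 0 is zeroed), so cumsum[len] yields the total sample count before actual_val is allocated. A self-contained sketch of that size-then-run CUB idiom; the counts here are made up.

// Sketch: two-call cub::DeviceScan::InclusiveSum, as used above to turn
// per-key sample counts into offsets plus a total size.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int len = 5;
  int h_counts[len] = {3, 0, 2, 5, 1};

  int *d_counts, *d_offsets;  // d_offsets has len + 1 slots, offsets[0] = 0
  cudaMalloc((void**)&d_counts, len * sizeof(int));
  cudaMalloc((void**)&d_offsets, (len + 1) * sizeof(int));
  cudaMemcpy(d_counts, h_counts, len * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_offsets, 0, sizeof(int));

  // First call only reports how much temporary storage the scan needs.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets + 1, len);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets + 1, len);

  int total = 0;  // offsets[len] is the total sample count
  cudaMemcpy(&total, d_offsets + len, sizeof(int), cudaMemcpyDeviceToHost);
  printf("total sampled neighbors: %d\n", total);  // prints 11

  cudaFree(d_temp);
  cudaFree(d_counts);
  cudaFree(d_offsets);
  return 0;
}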
memory::AllocShared( + place, + total_sample_size * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); + result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); - result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); - result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); fill_actual_vals<<>>( val, result.actual_val, actual_sample_size, - cumsum_actual_sample_size_ptr, + cumsum_actual_sample_size_p, sample_size, len); } - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } cudaStreamSynchronize(stream); return result; } @@ -1207,6 +1248,7 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type( CUDA_CHECK(cudaMemcpy( h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -1302,6 +1344,7 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type( } destroy_storage(gpu_id, i); } + device_mutex_[gpu_id]->unlock(); return result; } @@ -1370,23 +1413,40 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + auto d_idx = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, fea_num_per_node * node_num * sizeof(uint64_t)); + memory::Alloc(place, + fea_num_per_node * node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + auto d_shard_actual_size = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); @@ -1403,6 +1463,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int h_right[total_gpu]; // NOLINT CUDA_CHECK(cudaMemcpy( h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -1457,9 +1518,9 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, d_slot_feature_num_map, slot_num, shard_len, - fea_num_per_node); + fea_num_per_node); } - + for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; @@ -1474,6 +1535,14 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, h_right, d_shard_vals_ptr, d_shard_actual_size_ptr); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + device_mutex_[gpu_id]->unlock(); int grid_size = (node_num - 1) / block_size_ + 1; fill_dvalues<<>>(d_shard_vals_ptr, @@ -1483,18 +1552,11 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, fea_num_per_node, node_num); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } - CUDA_CHECK(cudaStreamSynchronize(stream)); return 0; } -} // namespace framework + +}; // namespace framework }; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 10a5f38e7313cd..81d3de9ee2f2de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS @@ -28,6 +29,121 @@ void GraphGpuWrapper::set_device(std::vector ids) { } } +void GraphGpuWrapper::init_conf(const std::string &first_node_type, + const std::string &meta_path) { + static std::mutex mutex; + { + std::lock_guard lock(mutex); + if (conf_initialized_) { + return; + } + VLOG(2) << "init path config"; + conf_initialized_ = true; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + PADDLE_ENFORCE_NE(iter, + feature_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in feature_to_id.", type)); + VLOG(2) << "feature_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE(iter, + edge_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } + int max_dev_id = 0; + for (size_t i = 0; i < device_id_mapping.size(); i++) { + if (device_id_mapping[i] > max_dev_id) { + max_dev_id = device_id_mapping[i]; + } + } + finish_node_type_.resize(max_dev_id + 1); + node_type_start_.resize(max_dev_id + 1); + global_infer_node_type_start_.resize(max_dev_id + 1); + for (size_t i = 0; i < device_id_mapping.size(); i++) { + int dev_id = device_id_mapping[i]; + auto &node_type_start = 
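init_conf above expects first_node_type to be a ";"-separated list of node types and meta_path to be a ";"-separated list of paths whose hops are "-"-separated edge types, with every name resolving through feature_to_id / edge_to_id. A small standalone illustration of that string convention (plain std::getline splitting instead of paddle::string::split_string; the type and edge names are hypothetical):

// Sketch of the expected init_conf inputs, e.g.
//   first_node_type = "user;item"
//   meta_path       = "user2item-item2user;item2user-user2item"
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split(const std::string& s, char sep) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string tok;
  while (std::getline(ss, tok, sep)) out.push_back(tok);
  return out;
}

int main() {
  std::string first_node_type = "user;item";  // hypothetical node types
  std::string meta_path = "user2item-item2user;item2user-user2item";

  for (const auto& type : split(first_node_type, ';'))
    std::cout << "start node type: " << type << "\n";

  for (const auto& path : split(meta_path, ';')) {
    std::cout << "meta path:";
    for (const auto& edge : split(path, '-'))  // each hop is an edge type
      std::cout << " " << edge;
    std::cout << "\n";
  }
  return 0;
}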
node_type_start_[i]; + auto &infer_node_type_start = global_infer_node_type_start_[i]; + auto &finish_node_type = finish_node_type_[i]; + finish_node_type.clear(); + + for (size_t idx = 0; idx < feature_to_id.size(); idx++) { + infer_node_type_start[idx] = 0; + } + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + node_type_start[iter->second] = 0; + infer_node_type_start[iter->second] = 0; + } + infer_cursor_.push_back(0); + cursor_.push_back(0); + } + init_type_keys(); + } +} + +void GraphGpuWrapper::init_type_keys() { + size_t thread_num = device_id_mapping.size(); + int cnt = 0; + + auto &graph_all_type_total_keys = get_graph_type_keys(); + auto &type_to_index = get_graph_type_to_index(); + std::vector> tmp_keys; + tmp_keys.resize(thread_num); + d_graph_all_type_total_keys_.resize(graph_all_type_total_keys.size()); + h_graph_all_type_keys_len_.resize(graph_all_type_total_keys.size()); + for (size_t f_idx = 0; f_idx < graph_all_type_total_keys.size(); f_idx++) { + for (size_t j = 0; j < tmp_keys.size(); j++) { + tmp_keys[j].clear(); + } + d_graph_all_type_total_keys_[f_idx].resize(thread_num); + auto &type_total_key = graph_all_type_total_keys[f_idx]; + for (size_t j = 0; j < type_total_key.size(); j++) { + uint64_t shard = type_total_key[j] % thread_num; + tmp_keys[shard].push_back(type_total_key[j]); + } + for (size_t j = 0; j < thread_num; j++) { + h_graph_all_type_keys_len_[f_idx].push_back(tmp_keys[j].size()); + VLOG(1) << "node type: " << type_to_index[f_idx] + << ", gpu_graph_device_keys[" << j + << "] = " << tmp_keys[j].size(); + } + for (size_t j = 0; j < thread_num; j++) { + auto stream = get_local_stream(j); + int gpuid = device_id_mapping[j]; + auto place = platform::CUDAPlace(gpuid); + platform::CUDADeviceGuard guard(gpuid); + d_graph_all_type_total_keys_[f_idx][j] = + memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t)); + cudaMemcpyAsync(d_graph_all_type_total_keys_[f_idx][j]->ptr(), + tmp_keys[j].data(), + sizeof(uint64_t) * tmp_keys[j].size(), + cudaMemcpyHostToDevice, + stream); + } + } + for (int i = 0; i < thread_num; i++) { + auto stream = get_local_stream(i); + cudaStreamSynchronize(stream); + } +} + int GraphGpuWrapper::get_all_id(int type, int slice_num, std::vector> *output) { @@ -152,6 +268,15 @@ void GraphGpuWrapper::load_edge_file(std::string name, } } +void GraphGpuWrapper::load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_edge_and_load( + etype2files, graph_data_local_path, part_num, reverse); +} + void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows @@ -163,14 +288,22 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { } } +void GraphGpuWrapper::load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_node_and_load( + ntype2files, graph_data_local_path, part_num); +} + void GraphGpuWrapper::load_node_and_edge(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, int part_num, bool reverse) { - ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table_->load_node_and_edge_file( - etype2files, ntype2files, graph_data_local_path, part_num, reverse); + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->load_node_and_edge_file( + etype2files, ntype2files, 
graph_data_local_path, part_num, reverse); } void GraphGpuWrapper::add_table_feat_conf(std::string table_name, @@ -203,8 +336,12 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } void GraphGpuWrapper::init_search_level(int level) { search_level = level; } +gpuStream_t GraphGpuWrapper::get_local_stream(int gpuid) { + return ((GpuPsGraphTable *)graph_table)->get_local_stream(gpuid); +} + void GraphGpuWrapper::init_service() { - table_proto.set_task_pool_size(24); + table_proto.set_task_pool_size(64); table_proto.set_shard_num(1000); table_proto.set_build_sampler_on_cpu(false); table_proto.set_search_level(search_level); @@ -226,11 +363,14 @@ void GraphGpuWrapper::init_service() { std::make_shared(device_id_mapping); resource->enable_p2p(); GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1, id_to_edge.size()); - g->init_cpu_table(table_proto); + size_t gpu_num = device_id_mapping.size(); + g->init_cpu_table(table_proto, gpu_num); g->cpu_graph_table_->set_feature_separator(feature_separator_); g->cpu_graph_table_->set_slot_feature_separator(slot_feature_separator_); graph_table = (char *)g; + upload_num = gpu_num; upload_task_pool.reset(new ::ThreadPool(upload_num)); + } void GraphGpuWrapper::finalize() { @@ -267,6 +407,10 @@ void GraphGpuWrapper::upload_batch(int type, // feature table void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { + if (type == 1 && (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH)) { + return ; + } std::vector> node_ids; ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->get_all_id(type, slice_num, &node_ids); @@ -278,7 +422,7 @@ void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size[" << node_ids[i].size() << "]"; GpuPsCommGraphFea sub_graph = - g->cpu_graph_table_->make_gpu_ps_graph_fea(node_ids[i], slot_num); + g->cpu_graph_table_->make_gpu_ps_graph_fea(i, node_ids[i], slot_num); // sub_graph.display_on_cpu(); VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i << "]_size[" << node_ids[i].size() << "]"; @@ -293,6 +437,32 @@ void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { debug_gpu_memory_info("upload_batch feature end"); } +//get sub_graph_fea +std::vector GraphGpuWrapper::get_sub_graph_fea(std::vector> &node_ids, int slot_num) { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + std::vector> tasks; + std::vector sub_graph_feas(node_ids.size()); + for (int i = 0; i < node_ids.size(); i++) { + tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + sub_graph_feas[i] = + g->cpu_graph_table_->make_gpu_ps_graph_fea(i, node_ids[i], slot_num); + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return sub_graph_feas; +} + +//build_gpu_graph_fea +void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, int i) { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + g->build_graph_fea_on_single_gpu(sub_graph_fea, i); + sub_graph_fea.release_on_cpu(); + VLOG(0) << "sub graph fea on gpu " << i << " is built"; + return ; +} + NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch, bool compress = true) { return ((GpuPsGraphTable *)graph_table) @@ -382,9 +552,6 @@ 
std::vector GraphGpuWrapper::graph_neighbor_sample( res.push_back(cpu_key[i * sample_size + j]); } } - /* for(int i = 0;i < res.size();i ++) { */ - /* VLOG(0) << i << " " << res[i]; */ - /* } */ delete[] actual_sample_size; cudaFree(cuda_key); return res; @@ -416,6 +583,31 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->export_partition_files(idx, file_path); } + +void GraphGpuWrapper::release_graph() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph(); +} + +void GraphGpuWrapper::release_graph_edge() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_edge(); +} + +void GraphGpuWrapper::release_graph_node() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_node(); +} + +std::vector &GraphGpuWrapper::get_graph_total_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_total_keys_; +} + +std::vector> &GraphGpuWrapper::get_graph_type_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_type_keys_; +} + +std::unordered_map &GraphGpuWrapper::get_graph_type_to_index() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->type_to_index_; +} + #endif } // namespace framework }; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 644fb0792cd495..a3cad68f3bb885 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,6 +22,14 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS + +enum GpuGraphStorageMode { + WHOLE_HBM = 1, + MEM_EMB_AND_GPU_GRAPH, + MEM_EMB_FEATURE_AND_GPU_GRAPH, + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH +}; + class GraphGpuWrapper { public: static std::shared_ptr GetInstance() { @@ -31,6 +39,8 @@ class GraphGpuWrapper { return s_instance_; } static std::shared_ptr s_instance_; + void init_conf(const std::string& first_node_type, + const std::string& meta_path); void initialize(); void finalize(); void set_device(std::vector ids); @@ -42,12 +52,22 @@ class GraphGpuWrapper { int slice_num, const std::string& edge_type); void upload_batch(int type, int slice_num, int slot_num); + std::vector get_sub_graph_fea(std::vector> &node_ids, int slot_num); + void build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, int i); void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + void load_node_file(std::string name, std::string filepath); + void load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num); void load_node_and_edge(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, @@ -96,6 +116,7 @@ class GraphGpuWrapper { NeighborSampleResultV2 graph_neighbor_sample_all_edge_type( int gpu_id, int edge_type_len, uint64_t* key, int sample_size, int len, std::vector> edge_type_graphs); + gpuStream_t get_local_stream(int gpuid); std::vector graph_neighbor_sample(int gpu_id, int idx, std::vector& key, @@ -112,6 +133,13 @@ class GraphGpuWrapper { int slot_num, int* d_slot_feature_num_map, int fea_num_per_node); + void release_graph(); + void release_graph_edge(); + 
void release_graph_node(); + void init_type_keys(); + std::vector& get_graph_total_keys(); + std::vector>& get_graph_type_keys(); + std::unordered_map& get_graph_type_to_index(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; @@ -126,6 +154,18 @@ class GraphGpuWrapper { int upload_num = 8; std::shared_ptr<::ThreadPool> upload_task_pool; std::string feature_separator_ = std::string(" "); + bool conf_initialized_ = false; + std::vector first_node_type_; + std::vector> meta_path_; + + std::vector> finish_node_type_; + std::vector> node_type_start_; + std::vector> global_infer_node_type_start_; + std::vector infer_cursor_; + std::vector cursor_; + std::vector>> + d_graph_all_type_total_keys_; + std::vector> h_graph_all_type_keys_len_; std::string slot_feature_separator_ = std::string(" "); }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 18fb2eca5b752e..05c254b2739f22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -124,6 +124,12 @@ class HashTable { size_t len, StreamType stream); + template + void insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream); + template void insert(const KeyType* d_keys, size_t len, @@ -153,6 +159,9 @@ class HashTable { template void dump_to_cpu(int devid, StreamType stream); + template + void get_keys(KeyType* d_out, uint64_t* global_cursor, StreamType stream); + #if defined(PADDLE_WITH_CUDA) template @@ -185,7 +194,7 @@ class HashTable { #endif int size() { return container_->size(); } - + thrust::pair* data() { return container_->data(); } void set_feature_value_size(size_t pull_feature_value_size, size_t push_grad_value_size) { pull_feature_value_size_ = pull_feature_value_size; @@ -194,6 +203,12 @@ class HashTable { << " push value size: " << push_grad_value_size_; } + int prefetch(const int dev_id, cudaStream_t stream = 0) { + return container_->prefetch(dev_id, stream); + } + + void clear(cudaStream_t stream = 0) { container_->clear_async(stream); } + void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 1fda5a586a2e81..33b50f789a49cb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -31,6 +31,35 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + size_t len, + uint64_t* global_num) { + ReplaceOp op; + thrust::pair kv; + + __shared__ uint64_t local_num; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len) { + kv.first = keys[i]; + kv.second = 1; // fake value + auto it = table->insert(kv, op, &local_num); + assert(it != table->end() && "error: insert fails: table is full"); + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(global_num, local_num); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -38,7 +67,6 @@ __global__ void insert_kernel(Table* table, size_t len) { ReplaceOp op; thrust::pair kv; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { kv.first = keys[i]; @@ -139,6 +167,41 @@ __global__ void 
dy_mf_update_kernel(Table* table, } } +template +__global__ void get_keys_kernel(Table* table, + typename Table::key_type* d_out, + uint64_t* global_cursor, + uint64_t unused_key) { + extern __shared__ typename Table::key_type local_key[]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + uint64_t len = table->size(); + if (idx < len) { + typename Table::value_type val = *(table->data() + idx); + if (val.first != unused_key) { + uint64_t dst = atomicAdd(&local_num, 1); + local_key[dst] = val.first; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(global_cursor, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -211,6 +274,20 @@ void HashTable::get(const KeyType* d_keys, container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } +template +template +void HashTable::insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>( + container_, d_keys, len, global_num); +} + template template void HashTable::insert(const KeyType* d_keys, @@ -225,6 +302,20 @@ void HashTable::insert(const KeyType* d_keys, container_, d_keys, d_vals, len); } +template +template +void HashTable::get_keys(KeyType* d_out, + uint64_t* global_cursor, + StreamType stream) { + size_t len = container_->size(); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + KeyType unuse_key = std::numeric_limits::max(); + size_t shared_mem_size = sizeof(KeyType) * BLOCK_SIZE_; + get_keys_kernel<<>>( + container_, d_out, global_cursor, unuse_key); +} + + template template void HashTable::insert(const KeyType* d_keys, @@ -436,6 +527,17 @@ template void HashTable::insert( size_t len, cudaStream_t stream); +template void HashTable::get_keys( + unsigned long* d_out, + unsigned long* global_cursor, + cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, + unsigned long len, + uint64_t* global_num, + cudaStream_t stream); + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 82532836b8e229..cf6c4eaf8b99ac 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1127,13 +1127,22 @@ void HeterComm::split_input_to_shard( AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_idx_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_tmp_ptr = 
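The new counting insert_kernel overload and get_keys_kernel above both accumulate into a __shared__ per-block counter and publish it with a single atomicAdd on the global counter, so global-atomic traffic scales with the number of blocks rather than the number of keys. A free-standing CUDA sketch of that aggregation pattern, counting used slots in a key array (no concurrent hash table involved):

// Sketch: per-block aggregation with a shared counter and one global
// atomicAdd per block, mirroring insert_kernel / get_keys_kernel above.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

__global__ void count_used_slots(const uint64_t* keys, size_t len,
                                 uint64_t unused_key,
                                 unsigned long long* global_num) {
  __shared__ unsigned long long local_num;
  if (threadIdx.x == 0) local_num = 0;
  __syncthreads();

  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && keys[i] != unused_key) {
    atomicAdd(&local_num, 1ULL);  // cheap: shared-memory atomic
  }
  __syncthreads();

  if (threadIdx.x == 0) {
    atomicAdd(global_num, local_num);  // one global atomic per block
  }
}

int main() {
  const size_t len = 1 << 16;
  const uint64_t unused = UINT64_MAX;
  uint64_t* d_keys;
  unsigned long long* d_num;
  cudaMalloc((void**)&d_keys, len * sizeof(uint64_t));
  cudaMalloc((void**)&d_num, sizeof(unsigned long long));
  cudaMemset(d_keys, 0, len * sizeof(uint64_t));  // all keys "used" (== 0)
  cudaMemset(d_num, 0, sizeof(unsigned long long));

  count_used_slots<<<(len + 255) / 256, 256>>>(d_keys, len, unused, d_num);

  unsigned long long h_num = 0;
  cudaMemcpy(&h_num, d_num, sizeof(h_num), cudaMemcpyDeviceToHost);
  printf("used slots: %llu\n", h_num);  // expect len
  cudaFree(d_keys);
  cudaFree(d_num);
  return 0;
}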
reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); @@ -1153,7 +1162,10 @@ void HeterComm::split_input_to_shard( num_bits, stream); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 4696a7cc91b5ae..22d6199584f58c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -96,6 +96,53 @@ class HBMMemoryPool : public managed { size_t block_size_; }; +class HBMMemoryPoolFix : public managed { + public: + HBMMemoryPoolFix() { + capacity_ = 0; + size_ = 0 ; + block_size_ = 0; + max_byte_capacity_ = 0; + } + + ~HBMMemoryPoolFix() { + VLOG(3) << "delete hbm memory pool"; + cudaFree(mem_); + } + + size_t block_size() { return block_size_; } + + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity, size_t block_size) { + if (max_byte_capacity_ < capacity * block_size) { + if (mem_ != NULL) { + cudaFree(mem_); + } + max_byte_capacity_ = (block_size * capacity / 8 + 1) * 8; + CUDA_CHECK(cudaMalloc(&mem_, max_byte_capacity_)); + } + size_ = capacity; + block_size_ = block_size; + capacity_ = max_byte_capacity_ / block_size; + } + + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + size_t size() { return size_; } + __forceinline__ __device__ void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t size_; + size_t block_size_; + size_t max_byte_capacity_; +}; + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 9ccd724d533019..0b603133efabbd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -31,16 +31,18 @@ limitations under the License. 
*/ #include #include +#include #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif DECLARE_int32(gpugraph_dedup_pull_push_mode); -DECLARE_int32(gpugraph_sparse_table_storage_mode); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -112,6 +114,105 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, use_afs_api_ = 1; } #endif + +void PSGPUWrapper::add_key_to_local(const std::vector& vec_data) { + size_t total_len = vec_data.size(); + size_t len_per_thread = total_len / thread_keys_thread_num_; + size_t begin = 0; + std::vector threads; + + int remain = total_len % thread_keys_thread_num_; + auto gen_graph_data_func = [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // TODO: feasign <-> slot <-> multi_dim + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } else { + threads.push_back( + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } + begin += len_per_thread + (i < remain ? 
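add_key_to_local above carves the flat key vector into per-thread ranges and routes each key into a thread-local shard bucket chosen by key % thread_keys_shard_num_, so the later merge and unique steps can proceed shard-by-shard without locking. A compact sketch of that partitioning scheme (std::unordered_set and the sizes below are stand-ins for whatever container and configuration PSGPUWrapper actually uses):

// Sketch: shard keys per thread by key % shard_num, then merge shard-by-shard.
#include <cstdint>
#include <iostream>
#include <thread>
#include <unordered_set>
#include <vector>

int main() {
  const int thread_num = 4, shard_num = 8;  // illustrative sizes
  std::vector<uint64_t> keys(100000);
  for (size_t i = 0; i < keys.size(); ++i) keys[i] = i * 2654435761ULL;

  // thread_keys[t][s] collects the keys of shard s seen by thread t.
  std::vector<std::vector<std::unordered_set<uint64_t>>> thread_keys(
      thread_num, std::vector<std::unordered_set<uint64_t>>(shard_num));

  std::vector<std::thread> workers;
  size_t per_thread = keys.size() / thread_num;
  for (int t = 0; t < thread_num; ++t) {
    size_t begin = t * per_thread;
    size_t end = (t == thread_num - 1) ? keys.size() : begin + per_thread;
    workers.emplace_back([&, t, begin, end] {
      for (size_t i = begin; i < end; ++i)
        thread_keys[t][keys[i] % shard_num].insert(keys[i]);
    });
  }
  for (auto& w : workers) w.join();

  // Merging shard s only touches thread_keys[*][s], so shards merge in parallel.
  size_t total = 0;
  for (int s = 0; s < shard_num; ++s)
    for (int t = 0; t < thread_num; ++t) total += thread_keys[t][s].size();
  std::cout << "collected " << total << " keys\n";
  return 0;
}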
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } +} + +void PSGPUWrapper::add_key_to_gputask(std::shared_ptr gpu_task) { + std::vector threads; + platform::Timer timeline; + timeline.Start(); + // merge thread_keys to shard_keys + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + for (int i = 0; i < thread_keys_shard_num_; ++i) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } + } + for (auto& t : threads) { + t.join(); + } + timeline.Pause(); + + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; + timeline.Start(); + gpu_task->UniqueKeys(); + timeline.Pause(); + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; +} + +void PSGPUWrapper::resize_gputask(std::shared_ptr gpu_task) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if (i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); + } + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } +} + void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; @@ -238,106 +339,298 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); + VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size(); + timeline.Start(); + add_key_to_local(vec_data); + timeline.Pause(); + VLOG(0) << "add_key_to_local cost " << timeline.ElapsedSec() << " seconds."; + } - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - VLOG(0) << "GpuGraphTotalKeys: " << total_len; - remain = total_len % thread_keys_thread_num_; - auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } + add_key_to_gputask(gpu_task); +} + +void PSGPUWrapper::add_slot_feature(std::shared_ptr gpu_task) { + platform::Timer timeline; + platform::Timer time_stage; + timeline.Start(); + // shard the data across the 8 GPU cards + size_t device_num = heter_devices_.size(); + std::vector threads; + size_t slot_num = slot_vector_.size() - 1;//node slot 9008 in slot_vector + auto& local_dim_keys = gpu_task->feature_dim_keys_; + double divide_nodeid_cost = 0; + double get_feature_id_cost = 0; + double add_feature_to_set_cost = 0; + double add_feature_to_key_cost = 0; + + std::vector> node_ids(device_num); + size_t node_num = 0; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + node_num += local_dim_keys[i][j].size(); + } + } + for (auto &node_id_vector : node_ids){ + node_id_vector.reserve(node_num * 1.2 / device_num); + } + + auto& device_dim_mutex = gpu_task->dim_mutex_; + + auto divide_nodeid_to_device = [this, + device_num, + &local_dim_keys, + &node_ids, + &device_dim_mutex](int i, int j) { + std::vector> task_keys(device_num); + size_t
batch = 10000; + for (size_t k = 0; k < device_num; k++) { + task_keys[k].reserve(batch * 1.2 / device_num); + } + std::vector shuffle_device = shuffle_int_vector(device_num); + size_t start = 0; + while (start < local_dim_keys[i][j].size()) { + if (batch + start > local_dim_keys[i][j].size()) { + batch = local_dim_keys[i][j].size() - start; + } + for (size_t k = start; k < (start + batch); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + } + // allocate local keys to devices + for (auto dev : shuffle_device) { + device_dim_mutex[dev][0]->lock(); + int len = task_keys[dev].size(); + for (int k = 0; k < len; ++k) { + node_ids[dev].push_back(task_keys[dev][k]); + } + device_dim_mutex[dev][0]->unlock(); + task_keys[dev].clear(); + } + start += batch; + } }; - auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - // TODO: feasign <-> slot <-> multi_dim - this->thread_dim_keys_[i][shard_id][0].insert(cur_key); - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(1) << "psgpu graph wrapper genfunc"; - threads.push_back( - std::thread(gen_graph_data_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } else { - VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_graph_dynamic_mf_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + time_stage.Start(); + + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(divide_nodeid_to_device, i, j); } - begin += len_per_thread + (i < remain ? 
1 : 0); } for (std::thread& t : threads) { t.join(); } - } + threads.clear(); + time_stage.Pause(); + divide_nodeid_cost = time_stage.ElapsedSec(); + gpu_task->sub_graph_feas = (void *) (new std::vector); + std::vector &sub_graph_feas = *((std::vector *) gpu_task->sub_graph_feas); + std::vector> feature_ids(device_num); + std::vector feature_list(device_num); + std::vector feature_list_size(device_num); + size_t batch = 40000; + + time_stage.Start(); + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_AND_GPU_GRAPH) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto h_slot_feature_num_map = gpu_graph_ptr->slot_feature_num_map(); + int fea_num_per_node = 0; + for (size_t i = 0; i < slot_num; ++i) { + fea_num_per_node += h_slot_feature_num_map[i]; + } - timeline.Start(); + auto get_feature_id = [this, slot_num, batch, fea_num_per_node, &h_slot_feature_num_map, &node_ids, &feature_ids](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + int * d_slot_feature_num_map; + uint64_t * d_node_list_ptr; + uint64_t * d_feature_list_ptr; + CUDA_CHECK(cudaMalloc((void**)&d_slot_feature_num_map, slot_num * sizeof(int))); + CUDA_CHECK(cudaMemcpy(d_slot_feature_num_map, h_slot_feature_num_map.data(), + sizeof(int) * slot_num, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_node_list_ptr, batch * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&d_feature_list_ptr, batch * fea_num_per_node * sizeof(uint64_t))); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + uint64_t pos = 0; + size_t real_batch = 0; + feature_ids[i].resize(node_ids[i].size() * fea_num_per_node); + while (pos < node_ids[i].size()) { + real_batch = (pos + batch) <= node_ids[i].size() ? batch : node_ids[i].size() - pos; + CUDA_CHECK(cudaMemcpy(d_node_list_ptr, + node_ids[i].data() + pos, + real_batch * sizeof(uint64_t), + cudaMemcpyHostToDevice)); + int ret = gpu_graph_ptr->get_feature_of_nodes(i, + d_node_list_ptr, + d_feature_list_ptr, + real_batch, + slot_num, + d_slot_feature_num_map, + fea_num_per_node); + PADDLE_ENFORCE_EQ( + ret, + 0, + platform::errors::PreconditionNotMet( + "get_feature_of_nodes error")); + + CUDA_CHECK(cudaMemcpy(feature_ids[i].data() + pos * fea_num_per_node, + d_feature_list_ptr, + real_batch * fea_num_per_node * sizeof(uint64_t), + cudaMemcpyDeviceToHost)); + pos += real_batch; + } + cudaFree(d_slot_feature_num_map); + cudaFree(d_node_list_ptr); + cudaFree(d_feature_list_ptr); + }; - threads.clear(); - // merge thread_keys to shard_keys - auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys( - shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); - thread_dim_keys_[i][shard_num][dim_id].clear(); + threads.resize(device_num); + for (size_t i = 0; i < device_num; i++) { + threads[i] = std::thread(get_feature_id, i); + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + for (size_t i = 0; i < device_num; i++) { + feature_list[i] = feature_ids[i].data(); + feature_list_size[i] = feature_ids[i].size(); + } } - }; - for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + else if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + 
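In the MEM_EMB_AND_GPU_GRAPH branch above, features are pulled through fixed-size device staging buffers in batches (batch = 40000 node ids per round trip), so staging memory stays bounded no matter how many nodes a card owns. A generic sketch of that batching loop; fake_feature_lookup below is only a stand-in for gpu_graph_ptr->get_feature_of_nodes.

// Sketch: stream node ids through fixed-size device buffers in batches.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the real feature lookup: fea = node_id * 10 + slot.
__global__ void fake_feature_lookup(const uint64_t* nodes, uint64_t* feas,
                                    size_t n, int fea_per_node) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    for (int k = 0; k < fea_per_node; ++k)
      feas[i * fea_per_node + k] = nodes[i] * 10 + k;
}

int main() {
  const size_t node_num = 100000, batch = 4096;
  const int fea_per_node = 3;
  std::vector<uint64_t> nodes(node_num), feas(node_num * fea_per_node);
  for (size_t i = 0; i < node_num; ++i) nodes[i] = i + 1;

  uint64_t *d_nodes, *d_feas;  // sized for one batch only
  cudaMalloc((void**)&d_nodes, batch * sizeof(uint64_t));
  cudaMalloc((void**)&d_feas, batch * fea_per_node * sizeof(uint64_t));

  for (size_t pos = 0; pos < node_num;) {
    size_t real_batch = (pos + batch <= node_num) ? batch : node_num - pos;
    cudaMemcpy(d_nodes, nodes.data() + pos, real_batch * sizeof(uint64_t),
               cudaMemcpyHostToDevice);
    fake_feature_lookup<<<(real_batch + 255) / 256, 256>>>(
        d_nodes, d_feas, real_batch, fea_per_node);
    cudaMemcpy(feas.data() + pos * fea_per_node, d_feas,
               real_batch * fea_per_node * sizeof(uint64_t),
               cudaMemcpyDeviceToHost);
    pos += real_batch;
  }
  printf("last feature: %llu\n", (unsigned long long)feas.back());
  cudaFree(d_nodes);
  cudaFree(d_feas);
  return 0;
}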
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + sub_graph_feas = gpu_graph_ptr->get_sub_graph_fea(node_ids, slot_num); + for (size_t i = 0; i < device_num; i++) { + feature_list[i] = sub_graph_feas[i].feature_list; + feature_list_size[i] = sub_graph_feas[i].feature_size; + } } - } - for (auto& t : threads) { - t.join(); - } - timeline.Pause(); + else { + PADDLE_ENFORCE_EQ( + 1, + 0, + " FLAGS_gpugraph_storage_mode is not adaptived "); + } + time_stage.Pause(); + get_feature_id_cost = time_stage.ElapsedSec(); + size_t feature_num = 0; + for (size_t i = 0; i < device_num; i++) { + feature_num += feature_list_size[i]; + } + VLOG(0) << "feature_num is " << feature_num << " node_num num is " << node_num; - VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() - << " seconds."; - timeline.Start(); - gpu_task->UniqueKeys(); - timeline.Pause(); + size_t set_num = device_num * 8; + std::vector> feature_id_set(set_num); + std::vector set_mutex(set_num); - VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); + auto add_feature_to_set = [this, set_num, &feature_list, &feature_id_set, &set_mutex] (int dev, size_t start, size_t end) { + size_t batch = 10000 * set_num; + std::vector> feature_list_tmp(set_num); + for (size_t i = 0; i < set_num; i++) { + feature_list_tmp[i].reserve((batch * 1.2) /set_num); + } + std::vector shuffle_set_index = shuffle_int_vector(set_num); + size_t pos = start; + size_t real_batch = 0; + while (pos < end) { + real_batch = (pos + batch <= end) ? batch : end - pos; + for (size_t i = pos; i < pos + real_batch; i++) { + if (feature_list[dev][i] == 0) { + continue; + } + int shard_num = feature_list[dev][i] % set_num; + feature_list_tmp[shard_num].push_back(feature_list[dev][i]); + } + // uniq in local + for (size_t i = 0; i < set_num; i++) { + std::sort(feature_list_tmp[i].begin(), feature_list_tmp[i].end()); + size_t idx = 0; + size_t total = feature_list_tmp[i].size(); + for (size_t j = 0; j < total; j++) { + auto &k = feature_list_tmp[i][j]; + if (idx > 0 && feature_list_tmp[i][idx - 1] == k) { + continue; + } + feature_list_tmp[i][idx] = k; + ++idx; + } + feature_list_tmp[i].resize(idx); + } + // uniq in global + for (auto set_index : shuffle_set_index) { + set_mutex[set_index].lock(); + for (auto feature_id : feature_list_tmp[set_index]) { + feature_id_set[set_index].insert(feature_id); + } + set_mutex[set_index].unlock(); + feature_list_tmp[set_index].clear(); + } + pos += real_batch; + } + }; + size_t device_thread_num = 8; + threads.resize(device_num * device_thread_num); + time_stage.Start(); + for (size_t i = 0; i < device_num; i++) { + size_t start = 0; + for (size_t j = 0; j < device_thread_num; j++) { + size_t batch = feature_list_size[i] / device_thread_num; + if (j < feature_list_size[i] % device_thread_num) { + batch += 1; + } + threads[i * device_thread_num + j] = std::thread(add_feature_to_set, i, start, start + batch); + start += batch; } - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); } - } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + time_stage.Pause(); + add_feature_to_set_cost = time_stage.ElapsedSec(); + auto add_feature_to_key = 
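add_feature_to_set above deduplicates feature ids in two stages: each worker sorts and uniques its per-shard batch locally, then merges it into one of set_num global sets while holding only that set's mutex (and visits the sets in a shuffled order to spread contention). A simplified standalone sketch of the same two-stage dedup:

// Sketch: local sort+unique per shard, then merge into sharded global sets
// under per-set mutexes (contention is limited to one set at a time).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_set>
#include <vector>

int main() {
  const int set_num = 8, worker_num = 4;
  std::vector<std::unordered_set<uint64_t>> feature_id_set(set_num);
  std::vector<std::mutex> set_mutex(set_num);

  std::vector<uint64_t> features(200000);
  for (size_t i = 0; i < features.size(); ++i) features[i] = i % 5000;  // many dups

  auto worker = [&](size_t begin, size_t end) {
    std::vector<std::vector<uint64_t>> local(set_num);
    for (size_t i = begin; i < end; ++i)
      if (features[i] != 0)  // 0 is treated as a padding id
        local[features[i] % set_num].push_back(features[i]);
    for (int s = 0; s < set_num; ++s) {
      auto& v = local[s];
      std::sort(v.begin(), v.end());
      v.erase(std::unique(v.begin(), v.end()), v.end());  // local dedup first
      std::lock_guard<std::mutex> guard(set_mutex[s]);
      feature_id_set[s].insert(v.begin(), v.end());       // global dedup
    }
  };

  std::vector<std::thread> workers;
  size_t per = features.size() / worker_num;
  for (int w = 0; w < worker_num; ++w)
    workers.emplace_back(worker, w * per,
                         w == worker_num - 1 ? features.size() : (w + 1) * per);
  for (auto& t : workers) t.join();

  size_t uniq = 0;
  for (auto& s : feature_id_set) uniq += s.size();
  std::cout << "unique feature ids: " << uniq << "\n";  // 4999 (id 0 skipped)
  return 0;
}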
[this, device_num, &feature_id_set, &local_dim_keys, set_num](int dev) { + // set_num = device_num * 8, a % set_num = b , a = set_num * m + b , a % device_num = b % device_num + size_t key_num = 0; + for (size_t i = dev; i < set_num; i += device_num) { + key_num += feature_id_set[i].size(); + } + VLOG(0) << " feature_num is " << key_num << " for device: " << dev; + local_dim_keys[dev][0].reserve(local_dim_keys[dev][0].size() + key_num); + for (size_t i = dev; i < set_num; i += device_num) { + for (auto it = feature_id_set[i].begin(); it != feature_id_set[i].end(); it++) { + local_dim_keys[dev][0].push_back(*it); + } + feature_id_set[i].clear(); + } + }; + time_stage.Start(); + threads.resize(device_num); + for (size_t i = 0; i < device_num; i++) { + threads[i] = std::thread(add_feature_to_key, i); + } + for (std::thread& t : threads) { + t.join(); + } + time_stage.Pause(); + add_feature_to_key_cost = time_stage.ElapsedSec(); + threads.clear(); + timeline.Pause(); + VLOG(0) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s." + << " divide_nodeid_cost " << divide_nodeid_cost + << " get_feature_id_cost " << get_feature_id_cost + << " add_feature_to_set_cost " << add_feature_to_set_cost + << " add_feature_to_key_cost " << add_feature_to_key_cost; } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; + size_t slot_num = slot_vector_.size() - 1; //node slot 9008 in slot_vector + if (slot_num > 0 && FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + add_slot_feature(gpu_task); + } + resize_gputask(gpu_task); auto& local_dim_keys = gpu_task->feature_dim_keys_; auto& local_dim_ptr = gpu_task->value_dim_ptr_; @@ -374,7 +667,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); auto ptl_dynamic_mf_func = - [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr, &gpu_task](int i, + int j) { size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; @@ -415,10 +709,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, local_dim_keys[i][j].data(), - key_size); + key_size, + gpu_task->pass_id_); bool flag = true; tt.wait(); @@ -455,7 +751,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } }; - //fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); threads.resize(thread_keys_shard_num_ * multi_mf_dim_); std::vector> task_futures; @@ -468,7 +763,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (auto& f : task_futures) { f.wait(); } - //fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); task_futures.clear(); timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() @@ -481,187 +775,205 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } gloo_wrapper->Barrier(); } - } -void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { - platform::Timer timeline; - int device_num = heter_devices_.size(); - std::vector threads; - std::vector> task_futures; - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; - auto& local_dim_keys = gpu_task->feature_dim_keys_; - auto& local_dim_ptr = gpu_task->value_dim_ptr_; +void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { + platform::Timer timeline; + 
int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; - auto& device_dim_keys = gpu_task->device_dim_keys_; - auto& device_dim_ptr = gpu_task->device_dim_ptr_; - auto& device_dim_mutex = gpu_task->dim_mutex_; - //auto& device_mutex = gpu_task->mutex_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = gpu_task->dim_mutex_; + // auto& device_mutex = gpu_task->mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + } + timeline.Start(); + auto build_pull_dynamic_mf_func = [this, + device_num, + &local_dim_keys, + &local_dim_ptr, + &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { + std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB + std::vector> task_ptrs( + device_num); +#endif - timeline.Start(); - std::vector>> pass_values; - - bool record_status = false; - auto& device_task_keys = gpu_task->device_task_keys_; - auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_pull_dynamic_mf_func = [this, - device_num, - &local_dim_keys, - &local_dim_ptr, - &device_dim_keys, - &device_dim_ptr, - &device_dim_mutex](int i, int j) { - std::vector> task_keys(device_num); - #ifdef PADDLE_WITH_PSLIB - std::vector> task_ptrs( - device_num); - #endif - - #ifdef PADDLE_WITH_PSCORE - std::vector> task_ptrs( - device_num); - #endif - for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { - int shard = local_dim_keys[i][j][k] % device_num; - task_keys[shard].push_back(local_dim_keys[i][j][k]); - task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif + for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); + } + // allocate local keys to devices + std::vector shuffle_device = shuffle_int_vector(device_num); + for (auto dev : shuffle_device) { + device_dim_mutex[dev][j]->lock(); + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; } - // allocate local keys to devices - for (int dev = 0; dev < device_num; dev++) { - device_dim_mutex[dev][j]->lock(); - int len = task_keys[dev].size(); - int cur = device_dim_keys[dev][j].size(); - device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); - device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); - for (int k = 0; k < len; ++k) { - device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; - device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; - } - device_dim_mutex[dev][j]->unlock(); + device_dim_mutex[dev][j]->unlock(); + } + }; + + if 
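build_pull_dynamic_mf_func above buckets a shard's keys and value pointers by key % device_num and appends each bucket to the per-device arrays while holding that device's mutex, visiting devices in the order produced by shuffle_int_vector so concurrent shard threads do not all queue on the same lock. A small sketch of that shuffled-append idea using std::shuffle in place of the Paddle helper:

// Sketch: producers append their buckets to per-device vectors, visiting the
// devices in a per-producer shuffled order to spread lock contention.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <numeric>
#include <random>
#include <thread>
#include <vector>

int main() {
  const int device_num = 8, producer_num = 16;
  std::vector<std::vector<uint64_t>> device_keys(device_num);
  std::vector<std::mutex> device_mutex(device_num);

  auto producer = [&](int pid) {
    // Bucket this producer's keys by key % device_num.
    std::vector<std::vector<uint64_t>> task_keys(device_num);
    for (uint64_t k = pid * 10000; k < (pid + 1) * 10000ULL; ++k)
      task_keys[k % device_num].push_back(k);

    // Visit devices in a shuffled order (the role of shuffle_int_vector).
    std::vector<int> order(device_num);
    std::iota(order.begin(), order.end(), 0);
    std::mt19937 rng(pid);
    std::shuffle(order.begin(), order.end(), rng);

    for (int dev : order) {
      std::lock_guard<std::mutex> guard(device_mutex[dev]);
      auto& dst = device_keys[dev];
      dst.insert(dst.end(), task_keys[dev].begin(), task_keys[dev].end());
    }
  };

  std::vector<std::thread> threads;
  for (int p = 0; p < producer_num; ++p) threads.emplace_back(producer, p);
  for (auto& t : threads) t.join();

  size_t total = 0;
  for (auto& v : device_keys) total += v.size();
  std::cout << "total keys distributed: " << total << "\n";  // 160000
  return 0;
}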
(multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); } - }; - auto build_func = [device_num, - record_status, - &pass_values, - &local_keys, - &local_ptr, - &device_task_keys, - &device_task_ptrs](int i) { - auto& task_keys = device_task_keys[i]; - #ifdef PADDLE_WITH_PSLIB - auto& task_ptrs = device_task_ptrs[i]; - #endif + } + for (std::thread& t : threads) { + t.join(); + } + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} - #ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[i]; - #endif +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; - for (size_t j = 0; j < local_keys[i].size(); j++) { - int shard = local_keys[i][j] % device_num; - task_keys[shard].push_back(local_keys[i][j]); - task_ptrs[shard].push_back(local_ptr[i][j]); - } - #ifdef PADDLE_WITH_PSLIB - if (record_status) { - size_t local_keys_size = local_keys.size(); - size_t pass_values_size = pass_values.size(); - for (size_t j = 0; j < pass_values_size; j += local_keys_size) { - auto& shard_values = pass_values[j]; - for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); - pair_idx++) { - auto& cur_pair = shard_values[pair_idx]; - int shard = cur_pair.first % device_num; - task_keys[shard].push_back(cur_pair.first); - task_ptrs[shard].push_back( - (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); - } + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + // auto& device_mutex = gpu_task->mutex_; + + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + + auto build_func = [device_num, + record_status, + &pass_values, + &local_keys, + &local_ptr, + &device_task_keys, + &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[i]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[i]; +#endif + + for (size_t j = 0; j < local_keys[i].size(); j++) { + int shard = local_keys[i][j] % device_num; + task_keys[shard].push_back(local_keys[i][j]); + task_ptrs[shard].push_back(local_ptr[i][j]); + } +#ifdef PADDLE_WITH_PSLIB + if (record_status) { + size_t local_keys_size = local_keys.size(); + size_t pass_values_size = pass_values.size(); + for (size_t j = 0; j < pass_values_size; j += local_keys_size) { + auto& shard_values = pass_values[j]; + for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); + pair_idx++) { + auto& cur_pair = shard_values[pair_idx]; + int shard = cur_pair.first % device_num; + task_keys[shard].push_back(cur_pair.first); + task_ptrs[shard].push_back( + (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); } } - #endif - }; - if (!multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); - VLOG(0) << "GpuPs build hbmps done"; } - std::vector> prefix_sum; - 
prefix_sum.resize(device_num); +#endif + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; + } + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; + } + auto calc_prefix_func = [this, + &prefix_sum, + &device_keys, + &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { for (int i = 0; i < device_num; i++) { - prefix_sum[i].resize(thread_keys_shard_num_ + 1); - prefix_sum[i][0] = 0; + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); } - auto calc_prefix_func = [this, - &prefix_sum, - &device_keys, - &device_vals, - &device_task_keys](int device_num) { - for (int j = 0; j < thread_keys_shard_num_; j++) { - prefix_sum[device_num][j + 1] = - prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); - } - device_keys[device_num].resize( - prefix_sum[device_num][thread_keys_shard_num_]); - device_vals[device_num].resize( - prefix_sum[device_num][thread_keys_shard_num_]); - }; - if (!multi_mf_dim_) { - for (int i = 0; i < device_num; i++) { - task_futures.emplace_back( - hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); + for (auto& f : task_futures) { + f.wait(); } - VLOG(0) << "prefix done"; - auto prepare_dev_value_func = [device_num, - &prefix_sum, - &device_keys, - &device_vals, - &device_task_keys, - &device_task_ptrs](int dev, int shard_id) { - // auto& task_keys = device_task_keys[shard_id]; - #ifdef PADDLE_WITH_PSLIB - auto& task_ptrs = device_task_ptrs[shard_id]; - #endif - - // #ifdef PADDLE_WITH_PSCORE - // auto& task_ptrs = device_task_ptrs[shard_id]; - // #endif - - // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; - // int cur = prefix_sum[dev][shard_id]; - #ifdef PADDLE_WITH_PSLIB - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - - val.delta_score = ptr_val[1]; - val.show = ptr_val[2]; - val.clk = ptr_val[3]; - val.slot = ptr_val[6]; - val.lr = ptr_val[4]; - val.lr_g2sum = ptr_val[5]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + task_futures.clear(); + } + VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, + &prefix_sum, + &device_keys, + &device_vals, + &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; + + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = 
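calc_prefix_func above converts per-shard bucket sizes into a per-device prefix sum, so device_keys / device_vals can be resized once and every shard thread later writes its slice at a fixed offset without further coordination. A tiny worked example of that offset computation (the bucket sizes are invented):

// Sketch: per-device prefix sums over shard bucket sizes give each shard a
// fixed write offset into the final per-device arrays.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int device_num = 2, shard_num = 4;
  // bucket_size[shard][device]: how many keys shard s routed to device d.
  int bucket_size[shard_num][device_num] = {{3, 1}, {0, 4}, {2, 2}, {5, 0}};

  std::vector<std::vector<size_t>> prefix_sum(
      device_num, std::vector<size_t>(shard_num + 1, 0));
  for (int d = 0; d < device_num; ++d) {
    for (int s = 0; s < shard_num; ++s)
      prefix_sum[d][s + 1] = prefix_sum[d][s] + bucket_size[s][d];
    // prefix_sum[d][shard_num] is exactly the size device_keys[d] is resized to.
    std::cout << "device " << d << " total keys: " << prefix_sum[d][shard_num]
              << ", shard 2 writes at offset " << prefix_sum[d][2] << "\n";
  }
  return 0;
}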
ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); if (dim > 7) { val.mf_size = MF_DIM + 1; @@ -677,40 +989,29 @@ void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { } #endif VLOG(3) << "GpuPs build hbmps done"; - }; - if (multi_mf_dim_) { - threads.resize(thread_keys_shard_num_ * multi_mf_dim_); - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = - std::thread(build_pull_dynamic_mf_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); - } - } else { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < device_num; j++) { - task_futures.emplace_back( - hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); - } - } - for (auto& f : task_futures) { - f.wait(); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); } - task_futures.clear(); } - timeline.Pause(); - VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() - << " seconds."; + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; } void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { int device_num = heter_devices_.size(); - platform::Timer timeline; - timeline.Start(); + platform::Timer stagetime; + stagetime.Start(); std::vector feature_keys_count(device_num); size_t size_max = 0; @@ -722,7 +1023,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << " dim index: " << j << " contains feasign nums: " << gpu_task->device_dim_ptr_[i][j].size(); } - VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + VLOG(0) << i << " card with dynamic mf contains feasign nums total: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } @@ -745,87 +1046,28 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif + stagetime.Pause(); + VLOG(0) << "card: " + << " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec() + << " s."; + stagetime.Start(); - auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_size:" - << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr]( + int i, int j, size_t start, size_t end) { + // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - }; - auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); 
+ size_t real_len = end - start; + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + char* test_build_values = build_values.get(); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - - this->HeterPs_->build_ps(i, - device_dim_keys.data(), - cur_pool->mem(), - len, - feature_value_size, - 500000, - 2); - if (device_dim_keys.size() > 0) { - VLOG(3) << "show table: " << i - << " table kv size: " << device_dim_keys.size() - << "dim: " << mf_dim << " len: " << len; - HeterPs_->show_one_table(i); - } - delete mem_pool; - }; - int thread_num = 16; - auto build_dynamic_mf_func = [this, - &gpu_task, - thread_num, - &accessor_wrapper_ptr](int i, int j, int z) { - // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - // this->mem_pools_[i * this->multi_mf_dim_ + j] = - // new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - - // ============ add for multi-thread ================ - size_t len_per_thread = len / thread_num; - size_t remain = len % thread_num; - size_t left = 0, right = 0; - - size_t real_len = len_per_thread; - if ((size_t)z < remain) real_len++; - - if ((size_t)z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } - // ============ add for multi-thread ================ - - for (size_t k = left; k < right; k++) { + for (size_t k = start; k < end; k++) { #ifdef PADDLE_WITH_PSLIB - float* val = (float*)(mem_pool->mem_address(k)); + float* val = + (float*)(test_build_values + (k - start) * feature_value_size); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); val->delta_score = @@ -859,54 +1101,141 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - void* val = mem_pool->mem_address(k); + void* val = + (float*)(test_build_values + (k - start) * feature_value_size); accessor_wrapper_ptr->BuildFill( val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); #endif } + task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = 0; + task.end = int(real_len); + cpu_reday_channels_[i]->Put(task); }; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + std::vector threads(multi_mf_dim_); for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + this->hbm_pools_[i * this->multi_mf_dim_ + j]->reset(len, + feature_value_size); + + 
auto build_ps_thread = + [this, &gpu_task]( + int i, int j, size_t len, size_t feature_value_size) { + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + this->HeterPs_->build_ps( + i, + device_dim_keys.data(), + this->hbm_pools_[i * this->multi_mf_dim_ + j]->mem(), + len, + feature_value_size, + 500000, + 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << this->index_dim_vec_[j] << " len: " << len; + HeterPs_->show_one_table(i); + } + }; + threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size); + } + //build feature table + size_t slot_num = slot_vector_.size() - 1;//node slot 9008 in slot_vector + if (slot_num > 0 && (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH)) { + auto build_feature_table = [this, &gpu_task](int i) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + std::vector * tmp = (std::vector *) gpu_task->sub_graph_feas; + gpu_graph_ptr->build_gpu_graph_fea((*tmp)[i], i); + }; + threads.push_back(std::thread(build_feature_table, i)); } - } - for (std::thread& t : threads) { - t.join(); - } - threads.clear(); + struct task_info task; + while (cpu_reday_channels_[i]->Get(task)) { + auto hbm = this->hbm_pools_[task.device_id * this->multi_mf_dim_ + + task.multi_mf_dim] + ->mem(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto hbm_start = hbm + task.offset * feature_value_size; + CUDA_CHECK( + cudaMemcpy(hbm_start, + task.build_values.get() + task.start * feature_value_size, + (task.end - task.start) * feature_value_size, + cudaMemcpyHostToDevice)); + } + platform::Timer stagetime; + stagetime.Start(); + for (std::thread& t : threads) { + t.join(); + } + stagetime.Pause(); + VLOG(0) << "card: " << i + << " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec() + << " s."; + }; + + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; - // multi-thread process - threads.resize(device_num * multi_mf_dim_ * thread_num); + int once_gpu_copy = 64 * 1024; + threads.resize(device_num * multi_mf_dim_); for (int i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); + gpu_task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(build_dymf_hbm_pool, i)); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(build_dynamic_mf_func, i, j, k); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? 
(end + once_gpu_copy) : len; + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue( + build_dynamic_mf_func, i, j, start, end)); } } } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); } - threads.clear(); - threads.resize(device_num * multi_mf_dim_); + cpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dynamic_mf_func " + << " cost " << stagetime.ElapsedSec() << " s."; for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); - } + cpu_reday_channels_[i]->Close(); } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : gpu_task_futures) { + f.wait(); } - threads.clear(); - - timeline.Pause(); - VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() - << " s."; + gpu_task_futures.clear(); + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + std::vector * tmp = (std::vector *) gpu_task->sub_graph_feas; + delete tmp; + gpu_task->sub_graph_feas = NULL; + } + stagetime.Pause(); + VLOG(0) << " build_dymf_hbm_pool " + << " cost " << stagetime.ElapsedSec() << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -916,17 +1245,25 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { dataset_->LoadIntoMemory(); timer.Pause(); VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; - + gpu_graph_mode_ = dataset_->GetGpuGraphMode(); + if (dataset_->GetMemoryDataSize() == 0) { + VLOG(0) << "GetMemoryDataSize == 0"; + return; + } // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - InitSlotInfo(); - gpu_graph_mode_ = dataset_->GetGpuGraphMode(); - std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->Reset(); - data_ready_channel_->Put(gpu_task); + InitSlotInfo(); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + data_ready_channel_->Put(gpu_task); + } else if (hbm_sparse_table_initialized_ == false) { + SparseTableToHbm(); + } VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -969,9 +1306,11 @@ void PSGPUWrapper::build_pull_thread() { timer.Start(); // build cpu ps data process BuildPull(gpu_task); + if (multi_mf_dim_) { + divide_to_device(gpu_task); + } timer.Pause(); - VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; buildpull_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; @@ -992,17 +1331,22 @@ void PSGPUWrapper::build_task() { VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); - PrepareGPUTask(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; current_task_ = gpu_task; } void PSGPUWrapper::BeginPass() { platform::Timer timer; + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } timer.Start(); if (current_task_) { PADDLE_THROW( @@ -1028,12 +1372,59 @@ void 
PSGPUWrapper::BeginPass() { } void PSGPUWrapper::EndPass() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } + platform::Timer stagetime; + stagetime.Start(); + HbmToSparseTable(); + stagetime.Pause(); + VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec() + << "s"; + + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); +} + +void PSGPUWrapper::SparseTableToHbm() { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + size_t device_num = heter_devices_.size(); + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + std::vector vec_data = gpu_graph_ptr->get_graph_total_keys(); + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + + add_key_to_local(vec_data); + add_key_to_gputask(gpu_task); + BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } else { + divide_to_device(gpu_task); + } + BuildGPUTask(gpu_task); + current_task_ = gpu_task; + hbm_sparse_table_initialized_ = true; +} + +void PSGPUWrapper::HbmToSparseTable() { if (!current_task_) { PADDLE_THROW( platform::errors::Fatal("[EndPass] current task has been ended.")); } - platform::Timer timer; - timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu @@ -1043,89 +1434,123 @@ void PSGPUWrapper::EndPass() { std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); } } - int thread_num = 8; auto accessor_wrapper_ptr = GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); - //auto fleet_ptr = FleetWrapper::GetInstance(); - //fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); - auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( - int i, int j, int z) { + + int once_cpu_num = 16 * 1024; + int once_gpu_copy = 8 * once_cpu_num; + + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr, once_cpu_num]( + int i, int j, size_t start, size_t end) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - // ====== multi-thread process feasign================ - int len_per_thread = len / thread_num; - int remain = len % thread_num; - int left = -1, right = -1; - int real_len = len_per_thread; - if (z < remain) real_len++; - if (z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } + size_t real_len = end - start; // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - char* test_build_values = 
(char*)malloc(feature_value_size * real_len); - uint64_t offset = left * feature_value_size; + + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + uint64_t offset = start * feature_value_size; + char* test_build_values = build_values.get(); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, feature_value_size * real_len, cudaMemcpyDeviceToHost); - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (int i = left; i < right; ++i) { - if (device_keys[i] == unuse_key) { - continue; - } - size_t local_offset = (i - left) * feature_value_size; - float* gpu_val = (float*)(test_build_values + local_offset); + for (size_t k = 0; k * once_cpu_num < real_len; k++) { + struct task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = k * once_cpu_num; + task.end = (k + 1) * once_cpu_num < real_len ? ((k + 1) * once_cpu_num) + : (real_len); + cpu_reday_channels_[i]->Put(task); + } + }; + auto cpu_func = [this, &accessor_wrapper_ptr](int j) { + struct task_info task; + while (cpu_reday_channels_[j]->Get(task)) { + auto& device_keys = + this->current_task_ + ->device_dim_keys_[task.device_id][task.multi_mf_dim]; + char* test_build_values = task.build_values.get(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + uint64_t unuse_key = std::numeric_limits::max(); + for (int i = task.start; i < task.end; ++i) { + if (device_keys[i + task.offset] == unuse_key) { + continue; + } + size_t local_offset = i * feature_value_size; + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB - // TODO: PSLIB DumpFill + // TODO: PSLIB DumpFill #endif #ifdef PADDLE_WITH_PSCORE - accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); #endif + } } - free(test_build_values); }; + platform::Timer timer; + timer.Start(); + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; + size_t thread_num = 16; + size_t device_num = heter_devices_.size(); if (multi_mf_dim_) { VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? 
(end + once_gpu_copy) : len; + gpu_task_futures.emplace_back(hbm_thread_pool_[i]->enqueue( + dump_pool_to_cpu_func, i, j, start, end)); } } + for (size_t j = 0; j < thread_num; j++) { + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue(cpu_func, i)); + } } - for (std::thread& t : threads) { - t.join(); - } } + for (auto& f : gpu_task_futures) { + f.wait(); + } + timer.Pause(); + VLOG(0) << " EndPass dump_pool_to_cpu_func " + << " cost " << timer.ElapsedSec() << " s."; + for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Close(); + } + gpu_task_futures.clear(); + timer.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); + } + cpu_task_futures.clear(); + timer.Pause(); + VLOG(0) << " EndPass cpu_func " + << " cost " << timer.ElapsedSec() << " s."; if (keysize_max != 0) { HeterPs_->end_pass(); } +} - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; +void PSGPUWrapper::DumpToMem() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->HbmToSparseTable(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - //fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); - timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 54f4abd97e831d..21e92ac6aef957 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/heter_util.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" @@ -63,6 +64,7 @@ limitations under the License. 
*/ #include "downpour_accessor.h" // NOLINT #endif #include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -96,6 +98,15 @@ class AfsWrapper { }; #endif +struct task_info { + std::shared_ptr build_values; + size_t offset; + int device_id; + int multi_mf_dim; + int start; + int end; +}; + class PSGPUWrapper { class DCacheBuffer { public: @@ -188,6 +199,9 @@ class PSGPUWrapper { int total_len, int* key2slot); + + void divide_to_device(std::shared_ptr gpu_task); + void add_slot_feature(std::shared_ptr gpu_task); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); @@ -195,16 +209,28 @@ class PSGPUWrapper { void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); + void add_key_to_local(const std::vector & keys); + void add_key_to_gputask(std::shared_ptr gpu_task); + void resize_gputask(std::shared_ptr gpu_task); + void SparseTableToHbm(); + void HbmToSparseTable(); void start_build_thread(); void pre_build_thread(); void build_pull_thread(); void build_task(); + void DumpToMem(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; if (s_instance_ == nullptr) { return; } + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->EndPass(); + } + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } data_ready_channel_->Close(); buildcpu_ready_channel_->Close(); buildpull_ready_channel_->Close(); @@ -288,6 +314,12 @@ class PSGPUWrapper { gpu_free_channel_->Open(); gpu_free_channel_->SetCapacity(1); + cpu_reday_channels_.resize(dev_ids.size()); + for (size_t i = 0; i < dev_ids.size(); i++) { + cpu_reday_channels_[i] = paddle::framework::MakeChannel(); + cpu_reday_channels_[i]->SetCapacity(16); + } + current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -385,6 +417,11 @@ class PSGPUWrapper { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } + cpu_work_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + cpu_work_pool_[i].reset(new ::ThreadPool(16)); + } + auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); @@ -542,6 +579,7 @@ class PSGPUWrapper { } void SetSlotVector(const std::vector& slot_vector) { slot_vector_ = slot_vector; + VLOG(0) << "slot_vector size is " << slot_vector_.size(); } void SetSlotOffsetVector(const std::vector& slot_offset_vector) { @@ -596,6 +634,10 @@ class PSGPUWrapper { dim_index_map[index_dim_vec_[i]] = i; } hbm_pools_.resize(resource_->total_device() * num_of_dim); + for (size_t i = 0; i < hbm_pools_.size(); i++) { + hbm_pools_[i] = new HBMMemoryPoolFix(); + } + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; @@ -693,6 +735,7 @@ class PSGPUWrapper { int month_; int day_; bool slot_info_initialized_ = false; + bool hbm_sparse_table_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; std::string accessor_class_; @@ -703,7 +746,7 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; - std::vector hbm_pools_; // in multi mfdim, one table need hbm + std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number #endif @@ -723,12 +766,15 @@ class PSGPUWrapper { paddle::framework::ChannelObject>> buildpull_ready_channel_ = paddle::framework::MakeChannel>(); + std::vector>> cpu_reday_channels_; std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; std::thread buildpull_threads_; bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + std::vector> cpu_work_pool_; OptimizerConfig optimizer_config_; protected: diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b4fcbae8b2d4c2..b05935a6eca52b 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -151,6 +151,7 @@ void HogwildWorker::TrainFilesWithProfiler() { bool train_mode = device_reader_->IsTrainMode(); timeline.Start(); uint64_t total_inst = 0; + device_reader_->InitGraphTrainResource(); while (1) { cur_batch = device_reader_->Next(); if (FLAGS_enable_exit_when_partial_worker && train_mode) { @@ -268,6 +269,7 @@ void HogwildWorker::TrainFiles() { #endif // while ((cur_batch = device_reader_->Next()) > 0) { bool train_mode = device_reader_->IsTrainMode(); + device_reader_->InitGraphTrainResource(); while (1) { cur_batch = device_reader_->Next(); if (FLAGS_enable_exit_when_partial_worker && train_mode) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ceed8cb6bfa636..96a473be1aa8cc 100755 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -48,6 +48,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, places_.push_back(place); } #endif + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 27c7563fee840e..22d66dbfd90828 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -49,7 +49,12 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), - allow_free_idle_chunk_(allow_free_idle_chunk) {} + allow_free_idle_chunk_(allow_free_idle_chunk) { + total_alloc_times_ = 0; + total_alloc_size_ = 0; + total_free_times_ = 0; + total_free_size_ = 0; + } phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { @@ -112,6 +117,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + ++total_alloc_times_; + total_alloc_size_ += size; VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } @@ -126,6 +133,9 @@ void 
AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; + total_free_times_ += 1; + total_free_size_ += block_it->size_; + block_it->is_free_ = true; if (block_it != blocks.begin()) { @@ -176,9 +186,28 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { ++chunk_it; } } + + Trace(); return bytes; } +void AutoGrowthBestFitAllocator::Trace() const { + size_t cur_idle_bytes = 0; + auto it = free_blocks_.begin(); + for (; it != free_blocks_.end(); ++it) { + cur_idle_bytes += it->second->size_; + } + + VLOG(1) << "alloc:" << total_alloc_size_ / double(1024*1024) + << "m free:" << total_free_size_ / double(1024*1024) + << "m busy:" << (total_alloc_size_ - total_free_size_) / double(1024*1024) + << "m idle:" << cur_idle_bytes / double(1024*1024) + << "m alloc_times:" << total_alloc_times_ + << " free_times:" << total_free_times_ + << " free_blocks_num:" << free_blocks_.size() + << " curr_chunks_num:" << chunks_.size(); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index dadf751bdfa419..138f4a98c4db5d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -49,6 +49,7 @@ class AutoGrowthBestFitAllocator : public Allocator { private: uint64_t FreeIdleChunks(); + void Trace() const; template using List = std::list; @@ -93,6 +94,12 @@ class AutoGrowthBestFitAllocator : public Allocator { size_t chunk_size_; bool allow_free_idle_chunk_; + // stat info + size_t total_alloc_times_; + size_t total_alloc_size_; + size_t total_free_times_; + size_t total_free_size_; + SpinLock spinlock_; }; diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index a26ed5dbdad8c6..187eb0692cda79 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -833,6 +833,18 @@ PADDLE_DEFINE_EXPORTED_bool( false, "It controls whether exit trainer when an worker has no ins."); +/** + * Distributed related FLAG + * Name: enable_exit_when_partial_worker + * Since Version: 2.2.0 + * Value Range: bool, default=false + * Example: + * Note: represent gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd. 
+ */ +PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, + 1, + "gpugraph storage mode, default 1"); + /** * KP kernel related FLAG * Name: FLAGS_run_kp_kernel @@ -961,17 +973,18 @@ PADDLE_DEFINE_EXPORTED_uint64( gpugraph_merge_grads_segment_size, 128, "segment size with segment gradient merge, default 128"); +PADDLE_DEFINE_EXPORTED_uint64( + gpugraph_slot_feasign_max_num, + 5, + "max feasign number in one slot, default 5"); PADDLE_DEFINE_EXPORTED_int32( gpugraph_dedup_pull_push_mode, 0, "enable dedup keys while pull push sparse, default 0"); -PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, - true, - "enable load_node_list_into_hbm, default true"); -PADDLE_DEFINE_EXPORTED_int32( - gpugraph_sparse_table_storage_mode, - 0, - "parse_table_storage_mode, default 0"); +PADDLE_DEFINE_EXPORTED_bool( + gpugraph_load_node_list_into_hbm, + true, + "enable load_node_list_into_hbm, default true"); /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index ea6240b649cad0..dd38ce7956309a 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -19,6 +19,7 @@ namespace platform {} // namespace platform } // namespace paddle DEFINE_INT_STATUS(STAT_total_feasign_num_in_mem) +DEFINE_INT_STATUS(STAT_epoch_finish) DEFINE_INT_STATUS(STAT_gpu0_mem_size) DEFINE_INT_STATUS(STAT_gpu1_mem_size) DEFINE_INT_STATUS(STAT_gpu2_mem_size) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index e902baa13532e5..dc381e6a033e00 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -368,6 +368,9 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_gpu_graph_mode", &framework::Dataset::SetGpuGraphMode, + py::call_guard()) + .def("set_pass_id", + &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc old mode 100755 new mode 100644 index 8b224a617dffa9..1c1d5a5269f306 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -64,6 +64,7 @@ void BindDistFleetWrapper(py::module* m) { .def("save_one_model", &FleetWrapper::SaveModelOneTable) .def("recv_and_save_model", &FleetWrapper::RecvAndSaveTable) .def("sparse_table_stat", &FleetWrapper::PrintTableStat) + .def("save_cache_table", &FleetWrapper::SaveCacheTable) .def("stop_server", &FleetWrapper::StopServer) .def("stop_worker", &FleetWrapper::FinalizeWorker) .def("barrier", &FleetWrapper::BarrierWithTable) @@ -372,7 +373,12 @@ void BindGraphGpuWrapper(py::module* m) { .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) - .def("load_edge_file", &GraphGpuWrapper::load_edge_file) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge) .def("upload_batch", py::overload_cast( @@ -396,7 +402,15 @@ void BindGraphGpuWrapper(py::module* m) { .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_weight", &GraphGpuWrapper::load_node_weight) .def("export_partition_files", &GraphGpuWrapper::export_partition_files) - .def("load_node_file", &GraphGpuWrapper::load_node_file) + .def("load_node_file", + py::overload_cast( + 
&GraphGpuWrapper::load_node_file)) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("release_graph", &GraphGpuWrapper::release_graph) + .def("release_graph_edge", &GraphGpuWrapper::release_graph_edge) + .def("release_graph_node", &GraphGpuWrapper::release_graph_node) .def("finalize", &GraphGpuWrapper::finalize); } #endif diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index e9c993d3ee1282..4d7d17463e4fe4 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -64,6 +64,9 @@ void BindPSGPUWrapper(py::module* m) { .def("begin_pass", &framework::PSGPUWrapper::BeginPass, py::call_guard()) + .def("dump_to_mem", + &framework::PSGPUWrapper::DumpToMem, + py::call_guard()) .def("load_into_memory", &framework::PSGPUWrapper::LoadIntoMemory, py::call_guard()) diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h index b84c7fa75209df..029eb9eb59dc6a 100644 --- a/paddle/utils/string/string_helper.h +++ b/paddle/utils/string/string_helper.h @@ -334,6 +334,42 @@ inline int split_string_ptr(const char* str, return num; } +inline int split_string_ptr(const char* str, + size_t len, + char delim, + std::vector* values, + int max_num) { + if (len <= 0) { + return 0; + } + + int num = 0; + const char* p = str; + const char* end = str + len; + const char* last = str; + while (p < end) { + if (*p != delim) { + ++p; + continue; + } + values->emplace_back(last, (size_t)(p - last)); + ++num; + ++p; + if (num >= max_num) { + return num; + } + // skip continue delim + while (*p == delim) { + ++p; + } + last = p; + } + if (p > last) { + values->emplace_back(last, (size_t)(p - last)); + ++num; + } + return num; +} // A helper class for reading lines from file. A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. 
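A minimal usage sketch for the length-bounded split_string_ptr overload added in paddle/utils/string/string_helper.h above: it takes an explicit len, caps the output at max_num tokens, and skips runs of consecutive delimiters instead of emitting empty tokens. The element type of the values vector is not visible in the hunk, and the namespace/include shown below are assumptions (std::string elements, paddle::string namespace), so treat this as an illustrative sketch rather than the exact API.

#include <iostream>
#include <string>
#include <vector>

#include "paddle/utils/string/string_helper.h"  // header touched by this hunk

int main() {
  // "a,,b,c,d" with max_num = 3 yields "a", "b", "c": the doubled comma is
  // collapsed, and the trailing "d" is dropped once the cap is reached.
  const char line[] = "a,,b,c,d";
  std::vector<std::string> parts;  // assumed element type
  int n = paddle::string::split_string_ptr(
      line, sizeof(line) - 1, ',', &parts, /*max_num=*/3);
  std::cout << "tokens: " << n << std::endl;  // expected: 3
  for (const auto& p : parts) {
    std::cout << p << std::endl;
  }
  return 0;
}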
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cfb946d3d8cad..83f60a6e26b40c 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -88,3 +88,4 @@ shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group distributed_scaler = fleet.distributed_scaler +save_cache_table = fleet.save_cache_table diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 1a9b3f565b77ab..4b9037795e067c 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -906,6 +906,15 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): def save_cache_model(self, dirname, **configs): return self._runtime_handle._save_cache_model(dirname, **configs) + @is_non_distributed_check + @inited_runtime_handler + def save_cache_table(self, + table_id, + pass_id, + mem_cache_key_threshold=4000000000): + return self._runtime_handle._save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index abf7eec73b8fe1..5c7c1b11a27a70 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1420,6 +1420,12 @@ def _save_cache_model(self, dirname, **kwargs): fleet.util.barrier() return feasign_num + def _save_cache_table(self, table_id, pass_id, mem_cache_key_threshold): + if self.role_maker._is_first_worker(): + self._worker.save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + fleet.util.barrier() + def _load_sparse_params(self, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 20f11c96a91077..decd3988602ac3 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -388,6 +388,7 @@ def __init__(self): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None self.trainer_num = -1 + self.pass_id = 0 @deprecated(since="2.0.0", update_to="paddle.distributed.InMemoryDataset._set_feed_type") @@ -1082,8 +1083,25 @@ def set_graph_config(self, config): "gpu_graph_training", True) self.proto_desc.graph_config.sage_mode = config.get("sage_mode", False) self.proto_desc.graph_config.samples = config.get("samples", "") + self.proto_desc.graph_config.train_table_cap = config.get( + "train_table_cap", 800000) + self.proto_desc.graph_config.infer_table_cap = config.get( + "infer_table_cap", 800000) self.dataset.set_gpu_graph_mode(True) + def set_pass_id(self, pass_id): + """ + set_pass_id + """ + self.pass_id = pass_id + self.dataset.set_pass_id(pass_id) + + def get_pass_id(self): + """ + get_pass_id + """ + return self.pass_id + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 3ba9f9eea46d1b..945b28aac88de1 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -73,6 +73,9 @@ def _create_trainer(self, opt_info=None): if opt_info.get("dump_fields_path") is not None and len( opt_info.get("dump_fields_path")) != 0: trainer._set_dump_fields_path(opt_info["dump_fields_path"]) + if 
opt_info.get("user_define_dump_filename") is not None and len( + opt_info.get("user_define_dump_filename")) != 0: + trainer._set_user_define_dump_filename(opt_info["user_define_dump_filename"]) if opt_info.get("dump_file_num") is not None: trainer._set_dump_file_num(opt_info["dump_file_num"]) if opt_info.get("dump_converter") is not None: