diff --git a/cmake/external/jemalloc.cmake b/cmake/external/jemalloc.cmake new file mode 100644 index 00000000000000..efce686b20929a --- /dev/null +++ b/cmake/external/jemalloc.cmake @@ -0,0 +1,35 @@ +include(ExternalProject) + +set(JEMALLOC_PROJECT "extern_jemalloc") +set(JEMALLOC_URL + /~https://github.com/jemalloc/jemalloc/releases/download/5.1.0/jemalloc-5.1.0.tar.bz2 +) +set(JEMALLOC_BUILD ${THIRD_PARTY_PATH}/jemalloc/src/extern_jemalloc) +set(JEMALLOC_SOURCE_DIR "${THIRD_PARTY_PATH}/jemalloc") +set(JEMALLOC_INSTALL ${THIRD_PARTY_PATH}/install/jemalloc) +set(JEMALLOC_INCLUDE_DIR ${JEMALLOC_INSTALL}/include) +set(JEMALLOC_DOWNLOAD_DIR "${JEMALLOC_SOURCE_DIR}/src/${JEMALLOC_PROJECT}") + +set(JEMALLOC_STATIC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) + +ExternalProject_Add( + extern_jemalloc + PREFIX ${JEMALLOC_SOURCE_DIR} + URL ${JEMALLOC_URL} + INSTALL_DIR ${JEMALLOC_INSTALL} + DOWNLOAD_DIR "${JEMALLOC_DOWNLOAD_DIR}" + BUILD_COMMAND $(MAKE) + BUILD_IN_SOURCE 1 + INSTALL_COMMAND $(MAKE) install + CONFIGURE_COMMAND "${JEMALLOC_DOWNLOAD_DIR}/configure" + --prefix=${JEMALLOC_INSTALL} --disable-initial-exec-tls) + +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_property(TARGET jemalloc PROPERTY IMPORTED_LOCATION + ${JEMALLOC_STATIC_LIBRARIES}) + +include_directories(${JEMALLOC_INCLUDE_DIR}) +add_dependencies(jemalloc extern_jemalloc) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index 41a1916dc33083..0084247461b74b 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -14,6 +14,13 @@ include(ExternalProject) +# find_package(jemalloc REQUIRED) + +set(JEMALLOC_INCLUDE_DIR ${THIRD_PARTY_PATH}/install/jemalloc/include) +set(JEMALLOC_LIBRARIES + ${THIRD_PARTY_PATH}/install/jemalloc/lib/libjemalloc_pic.a) +message(STATUS "rocksdb jemalloc:" ${JEMALLOC_LIBRARIES}) + set(ROCKSDB_PREFIX_DIR ${THIRD_PARTY_PATH}/rocksdb) set(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) set(ROCKSDB_INCLUDE_DIR @@ -22,22 +29,41 @@ set(ROCKSDB_INCLUDE_DIR set(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." 
FORCE) -set(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(ROCKSDB_COMMON_FLAGS + "-g -pipe -O2 -W -Wall -Wno-unused-parameter -fPIC -fno-builtin-memcmp -fno-omit-frame-pointer" +) +set(ROCKSDB_FLAGS + "-DNDEBUG -DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DOS_LINUX -DROCKSDB_FALLOCATE_PRESENT -DHAVE_SSE42 -DHAVE_PCLMUL -DZLIB -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_SUPPORT_THREAD_LOCAL -DROCKSDB_USE_RTTI -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_AUXV_GETAUXVAL_PRESENT" +) +set(ROCKSDB_CMAKE_CXX_FLAGS + "${ROCKSDB_COMMON_FLAGS} -DROCKSDB_LIBAIO_PRESENT -msse -msse4.2 -mpclmul ${ROCKSDB_FLAGS} -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) +set(ROCKSDB_CMAKE_C_FLAGS + "${ROCKSDB_COMMON_FLAGS} ${ROCKSDB_FLAGS} -DROCKSDB_LIBAIO_PRESENT -fPIC -I${JEMALLOC_INCLUDE_DIR}" +) include_directories(${ROCKSDB_INCLUDE_DIR}) +set(CMAKE_CXX_LINK_EXECUTABLE + "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz") ExternalProject_Add( extern_rocksdb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${ROCKSDB_PREFIX_DIR} - GIT_REPOSITORY "/~https://github.com/facebook/rocksdb" - GIT_TAG v6.10.1 + GIT_REPOSITORY "/~https://github.com/Thunderbrook/rocksdb" + GIT_TAG 6.19.fb UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DWITH_BZ2=OFF -DWITH_GFLAGS=OFF + -DWITH_TESTS=OFF + -DWITH_JEMALLOC=ON + -DWITH_BENCHMARK_TOOLS=OFF + -DJeMalloc_LIBRARIES=${JEMALLOC_LIBRARIES} + -DJeMalloc_INCLUDE_DIRS=${JEMALLOC_INCLUDE_DIR} -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS=${ROCKSDB_CMAKE_C_FLAGS} + -DCMAKE_CXX_LINK_EXECUTABLE=${CMAKE_CXX_LINK_EXECUTABLE} # BUILD_BYPRODUCTS ${ROCKSDB_PREFIX_DIR}/src/extern_rocksdb/librocksdb.a INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ && cp diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 94fb1b4d838f9a..5455ddadfdea44 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -422,6 +422,9 @@ if(WITH_PSCORE) include(external/rocksdb) # download, build, install rocksdb list(APPEND third_party_deps extern_rocksdb) + + include(external/jemalloc) # download, build, install jemalloc + list(APPEND third_party_deps extern_jemalloc) endif() if(WITH_XBYAK) diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 5654669d76fdba..74f946b2253aac 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -148,10 +148,12 @@ class PSClient { return fut; } - virtual ::std::future PullSparsePtr(char **select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char **select_values, size_t table_id, const uint64_t *keys, - size_t num) { + size_t num, + uint16_t pass_id) { VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); @@ -160,6 +162,15 @@ class PSClient { } virtual std::future PrintTableStat(uint32_t table_id) = 0; + virtual std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } // 确保所有积攒中的请求都发起发送 virtual std::future Flush() = 0; diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc index e8bf426710bc3b..5466e9cd95bd09 100644 --- 
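// For context only: a hypothetical caller of the extended PSClient interface
// above (PullSparsePtr with shard_id/pass_id, plus the new SaveCacheTable).
// The helper name, the ready `client` pointer, and the pre-allocated
// `value_ptrs` buffer are assumptions for illustration, not part of this patch.
#include <cstddef>
#include <cstdint>

#include "paddle/fluid/distributed/ps/service/ps_client.h"

void PullShardAndMaybeCache(paddle::distributed::PSClient* client,
                            uint32_t table_id,
                            int shard_id,
                            uint16_t pass_id,
                            const uint64_t* keys,
                            char** value_ptrs,
                            size_t num,
                            size_t cache_threshold) {
  // PullSparsePtr now carries which shard and which pass is being processed,
  // so the table can stamp each touched value with the current pass id.
  auto pull_status = client->PullSparsePtr(
      shard_id, value_ptrs, table_id, keys, num, pass_id);
  pull_status.wait();

  // After the pass, SaveCacheTable asks the table to spill to the SSD cache
  // once the in-memory feasign count exceeds `cache_threshold`.
  auto cache_status = client->SaveCacheTable(table_id, pass_id, cache_threshold);
  cache_status.wait();
}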
a/paddle/fluid/distributed/ps/service/ps_local_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc @@ -260,10 +260,12 @@ ::std::future PsLocalClient::PushDense(const Region* regions, // return done(); //} -::std::future PsLocalClient::PullSparsePtr(char** select_values, +::std::future PsLocalClient::PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { // FIXME // auto timer = // std::make_shared("pslib_downpour_client_pull_sparse"); @@ -278,6 +280,8 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, table_context.pull_context.ptr_values = select_values; table_context.use_ptr = true; table_context.num = num; + table_context.shard_id = shard_id; + table_context.pass_id = pass_id; // table_ptr->PullSparsePtr(select_values, keys, num); table_ptr->Pull(table_context); @@ -285,6 +289,28 @@ ::std::future PsLocalClient::PullSparsePtr(char** select_values, return done(); } +::std::future PsLocalClient::PrintTableStat(uint32_t table_id) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + return done(); +} + +::std::future PsLocalClient::SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold) { + auto* table_ptr = GetTable(table_id); + std::pair ret = table_ptr->PrintTableStat(); + VLOG(0) << "table id: " << table_id << ", feasign size: " << ret.first + << ", mf size: " << ret.second; + if (ret.first > threshold) { + VLOG(0) << "run cache table"; + table_ptr->CacheTable(pass_id); + } + return done(); +} + ::std::future PsLocalClient::PushSparseRawGradient( size_t table_id, const uint64_t* keys, diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 593805547af849..583ea8052eb01d 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -76,18 +76,19 @@ class PsLocalClient : public PSClient { return fut; } - virtual ::std::future PullSparsePtr(char** select_values, + virtual ::std::future PullSparsePtr(int shard_id, + char** select_values, size_t table_id, const uint64_t* keys, - size_t num); + size_t num, + uint16_t pass_id); - virtual ::std::future PrintTableStat(uint32_t table_id) { - std::promise prom; - std::future fut = prom.get_future(); - prom.set_value(0); + virtual ::std::future PrintTableStat(uint32_t table_id); + + virtual ::std::future SaveCacheTable(uint32_t table_id, + uint16_t pass_id, + size_t threshold); - return fut; - } virtual ::std::future PushSparse(size_t table_id, const uint64_t* keys, const float** update_values, diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index b55c77bf52d848..9f6baf3189fb8b 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -162,6 +162,15 @@ class ValueAccessor { return 0; } + virtual bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + return true; + } + + virtual void UpdatePassId(float* value, uint16_t pass_id) {} + virtual float GetField(float* value, const std::string& name) { return 0.0; } #define DEFINE_GET_INDEX(class, field) \ virtual int get_##field##_index() override { return class ::field##_index(); } diff --git 
a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 06e1f1c9c7734c..88c2895ecb04c5 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -24,6 +24,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/platform/timer.h" @@ -32,6 +33,8 @@ DECLARE_bool(graph_load_in_parallel); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_uint64(gpugraph_slot_feasign_max_num); namespace paddle { namespace distributed { @@ -54,32 +57,38 @@ int32_t GraphTable::Load_to_ssd(const std::string &path, } paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( - std::vector &node_ids, int slot_num) { - std::vector> bags(task_pool_size_); - for (int i = 0; i < task_pool_size_; i++) { - auto predsize = node_ids.size() / task_pool_size_; + int gpu_id, std::vector &node_ids, int slot_num) { + size_t shard_num = 64; + std::vector> bags(shard_num); + std::vector feature_array[shard_num]; + std::vector slot_id_array[shard_num]; + std::vector node_id_array[shard_num]; + std::vector node_fea_info_array[shard_num]; + for (size_t i = 0; i < shard_num; i++) { + auto predsize = node_ids.size() / shard_num; bags[i].reserve(predsize * 1.2); + feature_array[i].reserve(predsize * 1.2 * slot_num); + slot_id_array[i].reserve(predsize * 1.2 * slot_num); + node_id_array[i].reserve(predsize * 1.2); + node_fea_info_array[i].reserve(predsize * 1.2); } for (auto x : node_ids) { - int location = x % shard_num % task_pool_size_; + int location = x % shard_num; bags[location].push_back(x); } std::vector> tasks; - std::vector feature_array[task_pool_size_]; - std::vector slot_id_array[task_pool_size_]; - std::vector node_id_array[task_pool_size_]; - std::vector - node_fea_info_array[task_pool_size_]; - slot_feature_num_map_.resize(slot_num); - for (int k = 0; k < slot_num; ++k) { - slot_feature_num_map_[k] = 0; + if (slot_feature_num_map_.size() == 0) { + slot_feature_num_map_.resize(slot_num); + for (int k = 0; k < slot_num; ++k) { + slot_feature_num_map_[k] = 0; + } } for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { - tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { + tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, this]() -> int { uint64_t node_id; paddle::framework::GpuPsFeaInfo x; std::vector feature_ids; @@ -96,19 +105,11 @@ paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( x.feature_offset = feature_array[i].size(); int total_feature_size = 0; for (int k = 0; k < slot_num; ++k) { - v->get_feature_ids(k, &feature_ids); - int feature_ids_size = feature_ids.size(); + auto feature_ids_size = v->get_feature_ids(k, feature_array[i], slot_id_array[i]); if (slot_feature_num_map_[k] < feature_ids_size) { slot_feature_num_map_[k] = feature_ids_size; } total_feature_size += feature_ids_size; - if (!feature_ids.empty()) { - feature_array[i].insert(feature_array[i].end(), - feature_ids.begin(), - feature_ids.end()); - slot_id_array[i].insert( - slot_id_array[i].end(), feature_ids_size, k); - } } x.feature_size = total_feature_size; node_fea_info_array[i].push_back(x); @@ -127,32 +128,40 @@ 
paddle::framework::GpuPsCommGraphFea GraphTable::make_gpu_ps_graph_fea( } VLOG(0) << "slot_feature_num_map: " << ss.str(); + tasks.clear(); + paddle::framework::GpuPsCommGraphFea res; uint64_t tot_len = 0; - for (int i = 0; i < task_pool_size_; i++) { + for (size_t i = 0; i < shard_num; i++) { tot_len += feature_array[i].size(); } VLOG(0) << "Loaded feature table on cpu, feature_list_size[" << tot_len << "] node_ids_size[" << node_ids.size() << "]"; res.init_on_cpu(tot_len, (unsigned int)node_ids.size(), slot_num); unsigned int offset = 0, ind = 0; - for (int i = 0; i < task_pool_size_; i++) { - for (int j = 0; j < (int)node_id_array[i].size(); j++) { - res.node_list[ind] = node_id_array[i][j]; - res.fea_info_list[ind] = node_fea_info_array[i][j]; - res.fea_info_list[ind++].feature_offset += offset; - } - for (size_t j = 0; j < feature_array[i].size(); j++) { - res.feature_list[offset + j] = feature_array[i][j]; - res.slot_id_list[offset + j] = slot_id_array[i][j]; - } + for (size_t i = 0; i < shard_num; i++) { + tasks.push_back(_cpu_worker_pool[gpu_id]->enqueue([&, i, ind, offset, this]() -> int { + auto start = ind; + for (int j = 0; j < (int)node_id_array[i].size(); j++) { + res.node_list[start] = node_id_array[i][j]; + res.fea_info_list[start] = node_fea_info_array[i][j]; + res.fea_info_list[start++].feature_offset += offset; + } + for (size_t j = 0; j < feature_array[i].size(); j++) { + res.feature_list[offset + j] = feature_array[i][j]; + res.slot_id_list[offset + j] = slot_id_array[i][j]; + } + return 0; + })); offset += feature_array[i].size(); + ind += node_id_array[i].size(); } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); return res; } paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( - int idx, std::vector ids) { + int idx, const std::vector & ids) { std::vector> bags(task_pool_size_); for (int i = 0; i < task_pool_size_; i++) { auto predsize = ids.size() / task_pool_size_; @@ -327,7 +336,7 @@ int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, std::string str; if (_db->get(i, ch, sizeof(int) * 2 + sizeof(uint64_t), str) == 0) { count[i] += (int64_t)str.size(); - for (size_t j = 0; j < (int)str.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < str.size(); j += sizeof(uint64_t)) { uint64_t id = *(uint64_t *)(str.c_str() + j); add_comm_edge(idx, v, id); } @@ -397,7 +406,7 @@ void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { score[i] = 0; } } - for (size_t j = 0; j < (int)value.size(); j += sizeof(uint64_t)) { + for (size_t j = 0; j < value.size(); j += sizeof(uint64_t)) { uint64_t v = *((uint64_t *)(value.c_str() + j)); int index = -1; if (id_map.find(v) != id_map.end()) { @@ -488,6 +497,116 @@ void GraphTable::clear_graph(int idx) { edge_shards[idx].push_back(new GraphShard()); } } + +void GraphTable::release_graph() { + // Before releasing graph, prepare for sampling ids and embedding keys. 
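+  // Storage-mode summary for the branches below:
+  //   WHOLE_HBM                          -> also collect the full key list, then drop the whole graph
+  //   MEM_EMB_FEATURE_AND_GPU_GRAPH /
+  //   SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH  -> keep feature shards on the host, drop only edge shards
+  //   otherwise                          -> drop both edge and feature shards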
+ build_graph_type_keys(); + + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + // clear graph + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + clear_edge_shard(); + } + else { + clear_graph(); + } +} + +void GraphTable::release_graph_edge() { + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + build_graph_total_keys(); + } + clear_edge_shard(); +} + +void GraphTable::release_graph_node() { + build_graph_type_keys(); + if (FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + && FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + clear_feature_shard(); + } + else { + merge_feature_shard(); + feature_shrink_to_fit(); + } +} + +void GraphTable::clear_edge_shard() { + VLOG(0) << "begin clear edge shard"; + std::vector> tasks; + for (auto &type_shards : edge_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : edge_shards) shards.clear(); + edge_shards.clear(); + VLOG(0) << "finish clear edge shard"; +} + +void GraphTable::clear_feature_shard() { + VLOG(0) << "begin clear feature shard"; + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + delete shard; + return 0; + })); + } + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (auto &shards : feature_shards) shards.clear(); + feature_shards.clear(); + VLOG(0) << "finish clear feature shard"; +} + +void GraphTable::feature_shrink_to_fit() { + std::vector> tasks; + for (auto &type_shards : feature_shards) { + for (auto &shard : type_shards) { + tasks.push_back( + load_node_edge_task_pool->enqueue([&shard, this]() -> int { + shard->shrink_to_fit(); + return 0; + })); + } + } + for(size_t i = 0; i < tasks.size(); i++) tasks[i].get(); +} + +void GraphTable::merge_feature_shard() { + VLOG(0) << "begin merge_feature_shard"; + std::vector> tasks; + for (size_t i = 0; i < feature_shards[0].size(); i++) { + tasks.push_back( + load_node_edge_task_pool->enqueue([i, this]() -> int { + for (size_t j = 1; j < feature_shards.size(); j++) { + feature_shards[0][i]->merge_shard(feature_shards[j][i]); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + feature_shards.resize(1); +} + +void GraphTable::clear_graph() { + VLOG(0) << "begin clear_graph"; + clear_edge_shard(); + clear_feature_shard(); + VLOG(0) << "finish clear_graph"; +} + int32_t GraphTable::load_next_partition(int idx) { if (next_partition >= (int)partitions[idx].size()) { VLOG(0) << "partition iteration is done"; @@ -554,7 +673,7 @@ int32_t GraphTable::dump_edges_to_ssd(int idx) { std::vector &v = shards[i]->get_bucket(); for (size_t j = 0; j < v.size(); j++) { std::vector s; - for (size_t k = 0; k < (int)v[j]->get_neighbor_size(); k++) { + for (size_t k = 0; k < v[j]->get_neighbor_size(); k++) { s.push_back(v[j]->get_neighbor_id(k)); } cost += v[j]->get_neighbor_size() * sizeof(uint64_t); @@ -1053,21 +1172,7 @@ Node 
*GraphShard::find_node(uint64_t id) { return iter == node_location.end() ? nullptr : bucket[iter->second]; } -GraphTable::~GraphTable() { - for (int i = 0; i < (int)edge_shards.size(); i++) { - for (auto p : edge_shards[i]) { - delete p; - } - edge_shards[i].clear(); - } - - for (int i = 0; i < (int)feature_shards.size(); i++) { - for (auto p : feature_shards[i]) { - delete p; - } - feature_shards[i].clear(); - } -} +GraphTable::~GraphTable() { clear_graph(); } int32_t GraphTable::Load(const std::string &path, const std::string ¶m) { bool load_edge = (param[0] == 'e'); @@ -1095,16 +1200,19 @@ std::string GraphTable::get_inverse_etype(std::string &etype) { return res; } -int32_t GraphTable::parse_type_to_typepath(std::string &type2files, - std::string graph_data_local_path, - std::vector &res_type, - std::unordered_map &res_type2path) { - auto type2files_split = paddle::string::split_string(type2files, ","); +int32_t GraphTable::parse_type_to_typepath( + std::string &type2files, + std::string graph_data_local_path, + std::vector &res_type, + std::unordered_map &res_type2path) { + auto type2files_split = + paddle::string::split_string(type2files, ","); if (type2files_split.size() == 0) { return -1; } for (auto one_type2file : type2files_split) { - auto one_type2file_split = paddle::string::split_string(one_type2file, ":"); + auto one_type2file_split = + paddle::string::split_string(one_type2file, ":"); auto type = one_type2file_split[0]; auto type_dir = one_type2file_split[1]; res_type.push_back(type); @@ -1113,6 +1221,94 @@ int32_t GraphTable::parse_type_to_typepath(std::string &type2files, return 0; } +int32_t GraphTable::parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + std::vector etypes; + std::unordered_map edge_to_edgedir; + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); + if (res != 0) { + VLOG(0) << "parse edge type and edgedir failed!"; + return -1; + } + VLOG(0) << "etypes size: " << etypes.size(); + VLOG(0) << "whether reverse: " << reverse; + is_load_reverse_edge = reverse; + std::string delim = ";"; + size_t total_len = etypes.size(); + + std::vector> tasks; + for (size_t i = 0; i < total_len; i++) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::string etype_path = edge_to_edgedir[etypes[i]]; + auto etype_path_list = paddle::framework::localfs_list(etype_path); + std::string etype_path_str; + if (part_num > 0 && part_num < (int)etype_path_list.size()) { + std::vector sub_etype_path_list( + etype_path_list.begin(), etype_path_list.begin() + part_num); + etype_path_str = + paddle::string::join_strings(sub_etype_path_list, delim); + } else { + etype_path_str = + paddle::string::join_strings(etype_path_list, delim); + } + this->load_edges(etype_path_str, false, etypes[i]); + if (reverse) { + std::string r_etype = get_inverse_etype(etypes[i]); + this->load_edges(etype_path_str, true, r_etype); + } + return 0; + })); + } + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + std::vector ntypes; + std::unordered_map node_to_nodedir; + int res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + if (res != 0) { + VLOG(0) << "parse node type and nodedir failed!"; + return -1; + } + if (ntypes.size() == 0) { + VLOG(0) << "node_type not 
specified, nothing will be loaded "; + return 0; + } + + std::string delim = ";"; + std::vector type_npath_strs; + for (size_t i = 0; i 0 && part_num < (int)npath_list.size()) { + std::vector sub_npath_list( + npath_list.begin(), npath_list.begin() + part_num); + type_npath_str = paddle::string::join_strings(sub_npath_list, delim); + } else { + type_npath_str = paddle::string::join_strings(npath_list, delim); + } + type_npath_strs.push_back(type_npath_str); + } + std::string npath_str = paddle::string::join_strings(type_npath_strs, delim); + if (FLAGS_graph_load_in_parallel) { + this->load_nodes(npath_str, ""); + } else { + for (size_t j = 0; j < ntypes.size(); j++) { + this->load_nodes(npath_str, ntypes[j]); + } + } + return 0; +} + int32_t GraphTable::load_node_and_edge_file(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, @@ -1120,14 +1316,16 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files, bool reverse) { std::vector etypes; std::unordered_map edge_to_edgedir; - int res = parse_type_to_typepath(etype2files, graph_data_local_path, etypes, edge_to_edgedir); + int res = parse_type_to_typepath( + etype2files, graph_data_local_path, etypes, edge_to_edgedir); if (res != 0) { VLOG(0) << "parse edge type and edgedir failed!"; return -1; } std::vector ntypes; std::unordered_map node_to_nodedir; - res = parse_type_to_typepath(ntype2files, graph_data_local_path, ntypes, node_to_nodedir); + res = parse_type_to_typepath( + ntype2files, graph_data_local_path, ntypes, node_to_nodedir); if (res != 0) { VLOG(0) << "parse node type and nodedir failed!"; return -1; @@ -1177,7 +1375,6 @@ int32_t GraphTable::load_node_and_edge_file(std::string etype2files, VLOG(0) << "node_type not specified, nothing will be loaded "; return 0; } - if (FLAGS_graph_load_in_parallel) { this->load_nodes(npath_str, ""); } else { @@ -1461,7 +1658,6 @@ int32_t GraphTable::load_edges(const std::string &path, const std::string &edge_type) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) total_memory_cost = 0; - const uint64_t fixed_load_edges = 1000000; #endif int idx = 0; if (edge_type == "") { @@ -1483,7 +1679,7 @@ int32_t GraphTable::load_edges(const std::string &path, VLOG(0) << "Begin GraphTable::load_edges() edge_type[" << edge_type << "]"; if (FLAGS_graph_load_in_parallel) { std::vector>> tasks; - for (int i = 0; i < paths.size(); i++) { + for (size_t i = 0; i < paths.size(); i++) { tasks.push_back(load_node_edge_task_pool->enqueue( [&, i, idx, this]() -> std::pair { return parse_edge_file(paths[i], idx, reverse_edge); @@ -1866,8 +2062,7 @@ int GraphTable::parse_feature(int idx, thread_local std::vector fea_fields; fea_fields.clear(); c = feature_separator_.at(0); - paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields); - + paddle::string::split_string_ptr(fields[1].ptr, fields[1].len, c, &fea_fields, FLAGS_gpugraph_slot_feasign_max_num); std::string name = fields[0].to_string(); auto it = feat_id_map[idx].find(name); if (it != feat_id_map[idx].end()) { @@ -1947,8 +2142,8 @@ int GraphTable::get_all_id(int type_id, MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? 
edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -1971,8 +2166,8 @@ int GraphTable::get_all_neighbor_id( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards : feature_shards; std::vector> tasks; - for (int idx = 0; idx < search_shards.size(); idx++) { - for (int j = 0; j < search_shards[idx].size(); j++) { + for (size_t idx = 0; idx < search_shards.size(); idx++) { + for (size_t j = 0; j < search_shards[idx].size(); j++) { tasks.push_back(_shards_task_pool[j % task_pool_size_]->enqueue( [&search_shards, idx, j, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2024,7 +2219,7 @@ int GraphTable::get_all_neighbor_id( auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; VLOG(3) << "begin task, task_pool_size_[" << task_pool_size_ << "]"; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2050,7 +2245,7 @@ int GraphTable::get_all_feature_ids( MergeShardVector shard_merge(output, slice_num); auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx]; std::vector> tasks; - for (int i = 0; i < search_shards.size(); i++) { + for (size_t i = 0; i < search_shards.size(); i++) { tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [&search_shards, i, slice_num, &shard_merge]() -> size_t { std::vector> shard_keys; @@ -2229,7 +2424,7 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { feat_name.resize(node_types.size()); feat_shape.resize(node_types.size()); feat_dtype.resize(node_types.size()); - VLOG(0) << "got " << node_types.size() << "node types in total"; + VLOG(0) << "got " << node_types.size() << " node types in total"; for (int k = 0; k < node_types.size(); k++) { feature_to_id[node_types[k]] = k; auto node_type = node_types[k]; @@ -2289,5 +2484,50 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { return 0; } +void GraphTable::init_worker_poll(int gpu_num) { + _cpu_worker_pool.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + _cpu_worker_pool[i].reset(new ::ThreadPool(16)); + } +} + +void GraphTable::build_graph_total_keys() { + VLOG(0) << "begin insert edge to graph_total_keys"; + // build node embedding id + std::vector> keys; + this->get_node_embedding_ids(1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + + VLOG(0) << "finish insert edge to graph_total_keys"; +} + +void GraphTable::build_graph_type_keys() { + VLOG(0) << "begin build_graph_type_keys"; + graph_type_keys_.clear(); + graph_type_keys_.resize(this->feature_to_id.size()); + + int cnt = 0; + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_id(1, node_idx, 1, &keys); + type_to_index_[node_idx] = cnt; + graph_type_keys_[cnt++] = std::move(keys[0]); + } + VLOG(0) << "finish build_graph_type_keys"; + + VLOG(0) << "begin insert feature into graph_total_keys"; + // build feature 
embedding id + for (auto &it : this->feature_to_id) { + auto node_idx = it.second; + std::vector> keys; + this->get_all_feature_ids(1, node_idx, 1, &keys); + graph_total_keys_.insert( + graph_total_keys_.end(), keys[0].begin(), keys[0].end()); + } + VLOG(0) << "finish insert feature into graph_total_keys"; +} + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 9ca8b7f1655fe5..1940fd25f88407 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -130,7 +130,29 @@ class GraphShard { return node_location; } - private: + void shrink_to_fit() { + bucket.shrink_to_fit() ; + for (size_t i = 0; i < bucket.size(); i ++) { + bucket[i]->shrink_to_fit(); + } + } + + void merge_shard(GraphShard * &shard) { + bucket.reserve(bucket.size() + shard->bucket.size()); + for (size_t i = 0; i < shard->bucket.size(); i++) { + auto node_id = shard->bucket[i]->get_id(); + if (node_location.find(node_id) == node_location.end()) { + node_location[node_id] = bucket.size(); + bucket.push_back(shard->bucket[i]); + } + } + shard->node_location.clear(); + shard->bucket.clear(); + delete shard; + shard = NULL; + } + + public: std::unordered_map node_location; std::vector bucket; }; @@ -271,7 +293,6 @@ class RandomSampleLRU { remove(node_head); remove_count--; } - // std::cerr<<"after remove_count = "< *node) { @@ -535,25 +556,28 @@ class GraphTable : public Table { virtual int32_t Initialize(const TableParameter &config, const FsClientParameter &fs_config); virtual int32_t Initialize(const GraphParameter &config); + void init_worker_poll(int gpu_num); int32_t Load(const std::string &path, const std::string ¶m); - int32_t load_node_and_edge_file(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, int part_num, bool reverse); - + int32_t parse_edge_and_load(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + int32_t parse_node_and_load(std::string ntype2files, + std::string graph_data_local_path, + int part_num); std::string get_inverse_etype(std::string &etype); - int32_t parse_type_to_typepath(std::string &type2files, std::string graph_data_local_path, std::vector &res_type, std::unordered_map &res_type2path); - int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); - int get_all_id(int type, int slice_num, std::vector> *output); @@ -635,7 +659,15 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } + void clear_graph(); void clear_graph(int idx); + void clear_edge_shard(); + void clear_feature_shard(); + void feature_shrink_to_fit(); + void merge_feature_shard(); + void release_graph(); + void release_graph_edge(); + void release_graph_node(); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -672,9 +704,9 @@ class GraphTable : public Table { virtual int32_t add_node_to_ssd( int type_id, int idx, uint64_t src_id, char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( - int idx, std::vector ids); + int idx, const std::vector & ids); virtual paddle::framework::GpuPsCommGraphFea make_gpu_ps_graph_fea( - std::vector &node_ids, int slot_num); + int gpu_id, std::vector &node_ids, int slot_num); int32_t Load_to_ssd(const std::string &path, const std::string ¶m); int64_t 
load_graph_to_memory_from_ssd(int idx, std::vector &ids); int32_t make_complementary_graph(int idx, int64_t byte_size); @@ -700,9 +732,17 @@ class GraphTable : public Table { virtual int32_t build_sampler(int idx, std::string sample_type = "random"); void set_slot_feature_separator(const std::string &ch); void set_feature_separator(const std::string &ch); + + void build_graph_total_keys(); + void build_graph_type_keys(); + + std::vector graph_total_keys_; + std::vector> graph_type_keys_; + std::unordered_map type_to_index_; + std::vector> edge_shards, feature_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - int task_pool_size_ = 24; + int task_pool_size_ = 64; int load_thread_num = 160; const int random_sample_nodes_ranges = 3; @@ -718,6 +758,7 @@ class GraphTable : public Table { std::string table_type; std::vector> _shards_task_pool; + std::vector> _cpu_worker_pool; std::vector> _shards_task_rng_pool; std::shared_ptr<::ThreadPool> load_node_edge_task_pool; std::shared_ptr> scaled_lru; diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 4feee70fed751a..3e4f4d68f49cac 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -292,7 +292,8 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { thread_local std::ostringstream os; os.clear(); os.str(""); - os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4]; + os << common_feature_value.UnseenDays(const_cast(v)) << " " << v[1] + << " " << v[2] << " " << v[3] << " " << v[4]; // << v[5] << " " << v[6]; for (int i = common_feature_value.EmbedG2SumIndex(); i < common_feature_value.EmbedxG2SumIndex(); @@ -320,5 +321,18 @@ int CtrDymfAccessor::ParseFromString(const std::string& str, float* value) { return ret; } +bool CtrDymfAccessor::SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + return common_feature_value.Show(value) > global_cache_threshold || + common_feature_value.PassId(value) >= pass_id; +} + +void CtrDymfAccessor::UpdatePassId(float* value, uint16_t pass_id) { + common_feature_value.PassId(value) = pass_id; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index b820d617d06ae6..047bafedd9d7b6 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -30,7 +30,7 @@ namespace distributed { class CtrDymfAccessor : public ValueAccessor { public: struct CtrDymfFeatureValue { - /* + /*v1: old version float unseen_days; float delta_score; float show; @@ -44,6 +44,20 @@ class CtrDymfAccessor : public ValueAccessor { // float embedx_g2sum; std::vector embedx_w; */ + /* V2: support pass_id + uint16_t pass_id; + uint16_t unseen_days; + float show; + float click; + float embed_w; + // float embed_g2sum; + std::vector embed_g2sum; + float slot; + float mf_dim + std::float embedx_g2sum; + // float embedx_g2sum; + std::vector embedx_w; + */ int Dim() { return 7 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } @@ -73,7 +87,17 @@ class CtrDymfAccessor : public ValueAccessor { // 根据mf_dim计算的总byte数 int Size(int& mf_dim) { return (Dim(mf_dim)) * 
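// A minimal standalone sketch of the V2 layout above: pass_id and unseen_days
// share the 32-bit storage of the first float slot. Helper names and the toy
// buffer are illustrative; the index convention (0 = pass_id, 1 = unseen_days)
// mirrors the PassId()/UnseenDays() accessors added in this header.
#include <cassert>
#include <cstdint>

inline uint16_t& PassIdOf(float* slot) {
  return reinterpret_cast<uint16_t*>(slot)[0];
}
inline uint16_t& UnseenDaysOf(float* slot) {
  return reinterpret_cast<uint16_t*>(slot)[1];
}

int main() {
  float value[8] = {0.0f};   // stand-in for a CtrDymfFeatureValue blob
  PassIdOf(value) = 12;      // what UpdatePassId() writes during a pull
  UnseenDaysOf(value) = 3;   // decay/shrink logic keeps using this half
  assert(PassIdOf(value) == 12 && UnseenDaysOf(value) == 3);
  return 0;
}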
sizeof(float); } - float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } + uint16_t& PassId(float* val) { + uint16_t* int16_val = + reinterpret_cast(val + UnseenDaysIndex()); + return int16_val[0]; + } + + uint16_t& UnseenDays(float* val) { + uint16_t* int16_val = + reinterpret_cast(val + UnseenDaysIndex()); + return int16_val[1]; + } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } float& Show(float* val) { return val[ShowIndex()]; } float& Click(float* val) { return val[ClickIndex()]; } @@ -217,6 +241,14 @@ class CtrDymfAccessor : public ValueAccessor { return 0.0; } + //根据pass_id和show_threashold阈值来判断cache到ssd + bool SaveMemCache(float* value, + int param, + double global_cache_threshold, + uint16_t pass_id); + //更新pass_id + void UpdatePassId(float* value, uint16_t pass_id); + private: // float ShowClickScore(float show, float click); diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h index eb3ff2e254f567..5b5a0057f37078 100644 --- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h +++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h @@ -16,8 +16,12 @@ #include #include #include +#include #include +#include +#include #include +#include #include #include @@ -27,6 +31,55 @@ namespace paddle { namespace distributed { +class Uint64Comparator : public rocksdb::Comparator { + int Compare(const rocksdb::Slice& a, const rocksdb::Slice& b) const { + uint64_t A = *((uint64_t*)const_cast(a.data())); + uint64_t B = *((uint64_t*)const_cast(b.data())); + if (A < B) { + return -1; + } + if (A > B) { + return 1; + } + return 0; + } + const char* Name() const { return "Uint64Comparator"; } + void FindShortestSeparator(std::string*, const rocksdb::Slice&) const {} + void FindShortSuccessor(std::string*) const {} +}; + +class RocksDBItem { + public: + RocksDBItem() {} + ~RocksDBItem() {} + void reset() { + batch_keys.clear(); + batch_index.clear(); + batch_values.clear(); + status.clear(); + } + std::vector batch_keys; + std::vector batch_index; + std::vector batch_values; + std::vector status; +}; + +class RocksDBCtx { + public: + RocksDBCtx() { + items[0].reset(); + items[1].reset(); + cur_index = 0; + } + ~RocksDBCtx() {} + RocksDBItem* switch_item() { + cur_index = (cur_index + 1) % 2; + return &items[cur_index]; + } + RocksDBItem items[2]; + int cur_index; +}; + class RocksDBHandler { public: RocksDBHandler() {} @@ -38,55 +91,69 @@ class RocksDBHandler { } int initialize(const std::string& db_path, const int colnum) { - VLOG(3) << "db path: " << db_path << " colnum: " << colnum; - rocksdb::Options options; - rocksdb::BlockBasedTableOptions bbto; - bbto.block_size = 4 * 1024; - bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); - bbto.cache_index_and_filter_blocks = false; - bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); - bbto.whole_key_filtering = true; - options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); - - options.keep_log_file_num = 100; - options.max_log_file_size = 50 * 1024 * 1024; // 50MB - options.create_if_missing = true; - options.use_direct_reads = true; - options.max_background_flushes = 5; - options.max_background_compactions = 5; - options.base_background_compactions = 10; - options.write_buffer_size = 256 * 1024 * 1024; // 256MB - options.max_write_buffer_number = 8; - options.max_bytes_for_level_base = - 
options.max_write_buffer_number * options.write_buffer_size; - options.min_write_buffer_number_to_merge = 1; - options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB - options.memtable_prefix_bloom_size_ratio = 0.02; - options.num_levels = 4; - options.max_open_files = -1; - - options.compression = rocksdb::kNoCompression; - options.level0_file_num_compaction_trigger = 8; - options.level0_slowdown_writes_trigger = - 1.8 * options.level0_file_num_compaction_trigger; - options.level0_stop_writes_trigger = - 3.6 * options.level0_file_num_compaction_trigger; - - if (!db_path.empty()) { - std::string rm_cmd = "rm -rf " + db_path; - system(rm_cmd.c_str()); - } - - rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); - assert(s.ok()); - _handles.resize(colnum); + VLOG(0) << "db path: " << db_path << " colnum: " << colnum; + _dbs.resize(colnum); for (int i = 0; i < colnum; i++) { - s = _db->CreateColumnFamily( - options, "shard_" + std::to_string(i), &_handles[i]); + rocksdb::Options options; + options.comparator = &_comparator; + rocksdb::BlockBasedTableOptions bbto; + // options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory(65536)); + // options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(2)); + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + // bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + // options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + // options.db_log_dir = "./log/rocksdb"; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + // options.threads = 8; + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 37; + options.max_background_compactions = 64; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + // options.verify_checksums_in_compaction = false; + // options.disable_auto_compactions = true; + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + // options.compaction_options_fifo = rocksdb::CompactionOptionsFIFO(); + // options.compaction_style = + // rocksdb::CompactionStyle::kCompactionStyleFIFO; + options.level0_file_num_compaction_trigger = 5; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + std::string shard_path = db_path + "_" + std::to_string(i); + if (!shard_path.empty()) { + std::string rm_cmd = "rm -rf " + shard_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, shard_path, &_dbs[i]); assert(s.ok()); } - LOG(INFO) << "DB initialize success, colnum:" << colnum; + VLOG(0) << "DB initialize success, colnum:" 
<< colnum; return 0; } @@ -94,10 +161,9 @@ class RocksDBHandler { int id, const char* key, int key_len, const char* value, int value_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = _db->Put(options, - _handles[id], - rocksdb::Slice(key, key_len), - rocksdb::Slice(value, value_len)); + rocksdb::Status s = _dbs[id]->Put(options, + rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); assert(s.ok()); return 0; } @@ -110,20 +176,17 @@ class RocksDBHandler { options.disableWAL = true; rocksdb::WriteBatch batch(n * 128); for (int i = 0; i < n; i++) { - batch.Put(_handles[id], - rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + batch.Put(rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); } - rocksdb::Status s = _db->Write(options, &batch); + rocksdb::Status s = _dbs[id]->Write(options, &batch); assert(s.ok()); return 0; } int get(int id, const char* key, int key_len, std::string& value) { - rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), - _handles[id], - rocksdb::Slice(key, key_len), - &value); + rocksdb::Status s = _dbs[id]->Get( + rocksdb::ReadOptions(), rocksdb::Slice(key, key_len), &value); if (s.IsNotFound()) { return 1; } @@ -131,33 +194,58 @@ class RocksDBHandler { return 0; } + void multi_get(int id, + const size_t num_keys, + const rocksdb::Slice* keys, + rocksdb::PinnableSlice* values, + rocksdb::Status* status, + const bool sorted_input = true) { + rocksdb::ColumnFamilyHandle* handle = _dbs[id]->DefaultColumnFamily(); + auto read_opt = rocksdb::ReadOptions(); + read_opt.fill_cache = false; + _dbs[id]->MultiGet( + read_opt, handle, num_keys, keys, values, status, sorted_input); + } + int del_data(int id, const char* key, int key_len) { rocksdb::WriteOptions options; options.disableWAL = true; - rocksdb::Status s = - _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + rocksdb::Status s = _dbs[id]->Delete(options, rocksdb::Slice(key, key_len)); assert(s.ok()); return 0; } int flush(int id) { - rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + rocksdb::Status s = _dbs[id]->Flush(rocksdb::FlushOptions()); assert(s.ok()); return 0; } rocksdb::Iterator* get_iterator(int id) { - return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + return _dbs[id]->NewIterator(rocksdb::ReadOptions()); } int get_estimate_key_num(uint64_t& num_keys) { - _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + // _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + Uint64Comparator* get_comparator() { return &_comparator; } + + int ingest_externel_file(int id, + const std::vector& sst_filelist) { + rocksdb::IngestExternalFileOptions ifo; + ifo.move_files = true; + rocksdb::Status s = _dbs[id]->IngestExternalFile(sst_filelist, ifo); + assert(s.ok()); return 0; } private: std::vector _handles; - rocksdb::DB* _db; + // rocksdb::DB* _db; + std::vector _dbs; + Uint64Comparator _comparator; }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index 9c384a9744b8a3..59e6a5f634c8ad 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -56,8 +56,12 @@ class Node { virtual int get_feature_ids(int slot_idx, std::vector *res) const { return 0; } + virtual int get_feature_ids(int slot_idx, std::vector & 
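// Usage note for the multi_get wrapper added to RocksDBHandler above, as a
// hypothetical caller. Each shard DB is opened with Uint64Comparator and
// sorted_input defaults to true, so the uint64 keys are expected to arrive
// numerically sorted. The helper name and inputs are illustrative assumptions.
#include <cstddef>
#include <cstdint>
#include <vector>

#include "paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h"

void LookupSortedBatch(paddle::distributed::RocksDBHandler* db,
                       int shard_id,
                       const std::vector<uint64_t>& sorted_keys) {
  std::vector<rocksdb::Slice> keys;
  keys.reserve(sorted_keys.size());
  for (const uint64_t& k : sorted_keys) {
    keys.emplace_back(reinterpret_cast<const char*>(&k), sizeof(uint64_t));
  }
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> status(keys.size());
  db->multi_get(shard_id, keys.size(), keys.data(), values.data(), status.data());
  for (size_t i = 0; i < status.size(); ++i) {
    if (status[i].IsNotFound()) {
      // Missing on SSD: the caller creates a fresh value, as PullSparsePtr does.
    }
  }
}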
feature_id, std::vector & slot_id) const { + return 0; + } virtual void set_feature(int idx, const std::string &str) {} virtual void set_feature_size(int size) {} + virtual void shrink_to_fit() {} virtual int get_feature_size() { return 0; } virtual size_t get_neighbor_size() { return 0; } @@ -155,6 +159,28 @@ class FeatureNode : public Node { return 0; } + virtual int get_feature_ids(int slot_idx, std::vector & feature_id, std::vector & slot_id) const { + errno = 0; + size_t num = 0; + if (slot_idx < (int)this->feature.size()) { + const std::string &s = this->feature[slot_idx]; + const uint64_t *feas = (const uint64_t *)(s.c_str()); + num = s.length() / sizeof(uint64_t); + CHECK((s.length() % sizeof(uint64_t)) == 0) + << "bad feature_item: [" << s << "]"; + for (size_t i = 0; i < num; ++i) { + feature_id.push_back(feas[i]); + slot_id.push_back(slot_idx); + } + } + PADDLE_ENFORCE_EQ( + errno, + 0, + paddle::platform::errors::InvalidArgument( + "get_feature_ids get errno should be 0, but got %d.", errno)); + return num; + } + virtual std::string *mutable_feature(int idx) { if (idx >= (int)this->feature.size()) { this->feature.resize(idx + 1); @@ -170,6 +196,12 @@ class FeatureNode : public Node { } virtual void set_feature_size(int size) { this->feature.resize(size); } virtual int get_feature_size() { return this->feature.size(); } + virtual void shrink_to_fit() { + feature.shrink_to_fit(); + for (auto & slot : feature) { + slot.shrink_to_fit(); + } + } template static std::string parse_value_to_bytes(std::vector feat_str) { diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index a46244265ef206..9a69433c6104a1 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -172,7 +172,6 @@ int32_t MemorySparseTable::Load(const std::string& path, value.resize(feature_value_size); int parse_size = _value_accesor->ParseFromString(++end, value.data()); value.resize(parse_size); - } read_channel->close(); if (err_no == -1) { @@ -725,7 +724,8 @@ int32_t MemorySparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -822,9 +822,11 @@ int32_t MemorySparseTable::PullSparse(float* pull_values, return 0; } -int32_t MemorySparseTable::PullSparsePtr(char** pull_values, +int32_t MemorySparseTable::PullSparsePtr(int shard_id, // fake num + char** pull_values, const uint64_t* keys, - size_t num) { + size_t num, + uint16_t pass_id) { CostTimer timer("pscore_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index 17018d5e5dfc3d..658446d770c713 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -90,7 +90,11 @@ class MemorySparseTable : public Table { std::pair PrintTableStat() override; int32_t PullSparse(float* values, const PullSparseValue& pull_value); - int32_t PullSparsePtr(char** pull_values, 
const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 05f7c5c5780ea6..b7af4172e3f9e2 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -24,8 +24,10 @@ DECLARE_bool(pserver_print_missed_key_num_every_push); DECLARE_bool(pserver_create_value_when_push); DECLARE_bool(pserver_enable_create_feasign_randomly); DEFINE_bool(pserver_open_strict_check, false, "pserver_open_strict_check"); -DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file"); DEFINE_int32(pserver_load_batch_size, 5000, "load batch size for ssd"); +PADDLE_DEFINE_EXPORTED_string(rocksdb_path, + "database", + "path of sparse table rocksdb file"); namespace paddle { namespace distributed { @@ -35,6 +37,8 @@ int32_t SSDSparseTable::Initialize() { _db = paddle::distributed::RocksDBHandler::GetInstance(); _db->initialize(FLAGS_rocksdb_path, _real_local_shard_num); VLOG(0) << "initalize SSDSparseTable succ"; + VLOG(0) << "SSD FLAGS_pserver_print_missed_key_num_every_push:" + << FLAGS_pserver_print_missed_key_num_every_push; return 0; } @@ -45,7 +49,8 @@ int32_t SSDSparseTable::Pull(TableContext& context) { if (context.use_ptr) { char** pull_values = context.pull_context.ptr_values; const uint64_t* keys = context.pull_context.keys; - return PullSparsePtr(pull_values, keys, context.num); + return PullSparsePtr( + context.shard_id, pull_values, keys, context.num, context.pass_id); } else { float* pull_values = context.pull_context.values; const PullSparseValue& pull_value = context.pull_context.pull_value; @@ -172,90 +177,139 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } -int32_t SSDSparseTable::PullSparsePtr(char** pull_values, - const uint64_t* keys, - size_t num) { +int32_t SSDSparseTable::PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* pull_keys, + size_t num, + uint16_t pass_id) { CostTimer timer("pserver_ssd_sparse_select_all"); size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); size_t mf_value_size = _value_accesor->GetAccessorInfo().mf_size / sizeof(float); { // 从table取值 or create - std::vector> tasks(_real_local_shard_num); - std::vector>> task_keys( - _real_local_shard_num); - for (size_t i = 0; i < num; ++i) { - int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; - task_keys[shard_id].push_back({keys[i], i}); - } + RocksDBCtx context; + std::vector> tasks; + RocksDBItem* cur_ctx = context.switch_item(); + cur_ctx->reset(); + FixedFeatureValue* ret = NULL; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; + float* data_buffer_ptr = data_buffer; - std::atomic missed_keys{0}; - for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { - tasks[shard_id] = + for (int i = 0; i < num; ++i) { + uint64_t key = pull_keys[i]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + cur_ctx->batch_index.push_back(i); + cur_ctx->batch_keys.push_back( + rocksdb::Slice((char*)&(pull_keys[i]), sizeof(uint64_t))); + if (cur_ctx->batch_keys.size() == 1024) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = + 
_shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + cur_ctx->batch_values.data(), + cur_ctx->status.data()); + return 0; + }); + cur_ctx = context.switch_item(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = *((uint64_t*)const_cast( + cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = + cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float( + cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + } + cur_ctx->reset(); + tasks.clear(); + tasks.push_back(std::move(fut)); + } + } else { + ret = itr.value_ptr(); + // int pull_data_idx = keys[i].second; + _value_accesor->UpdatePassId(ret->data(), pass_id); + pull_values[i] = (char*)ret; + } + } + if (cur_ctx->batch_keys.size() != 0) { + cur_ctx->batch_values.resize(cur_ctx->batch_keys.size()); + cur_ctx->status.resize(cur_ctx->batch_keys.size()); + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( - [this, - shard_id, - &task_keys, - value_size, - mf_value_size, - pull_values, - &missed_keys]() -> int { - auto& keys = task_keys[shard_id]; - auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; // NOLINT - float* data_buffer_ptr = data_buffer; - for (size_t i = 0; i < keys.size(); ++i) { - uint64_t key = keys[i].first; - auto itr = local_shard.find(key); - size_t data_size = value_size - mf_value_size; - FixedFeatureValue* ret = NULL; - if (itr == local_shard.end()) { - // pull rocksdb - std::string tmp_string(""); - if (_db->get(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t), - tmp_string) > 0) { - ++missed_keys; - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - float* data_ptr = - const_cast(feature_value.data()); - _value_accesor->Create(&data_buffer_ptr, 1); - memcpy( - data_ptr, data_buffer_ptr, data_size * sizeof(float)); - ret = &feature_value; - } else { - data_size = tmp_string.size() / sizeof(float); - memcpy(data_buffer_ptr, - paddle::string::str_to_float(tmp_string), - data_size * sizeof(float)); - // from rocksdb to mem - auto& feature_value = local_shard[key]; - feature_value.resize(data_size); - memcpy(const_cast(feature_value.data()), - data_buffer_ptr, - data_size * sizeof(float)); - _db->del_data(shard_id, - reinterpret_cast(&key), - sizeof(uint64_t)); - ret = &feature_value; - } - } else { - ret = itr.value_ptr(); - } - int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = reinterpret_cast(ret); - } + [this, shard_id, cur_ctx]() -> int { + _db->multi_get(shard_id, + cur_ctx->batch_keys.size(), + cur_ctx->batch_keys.data(), + 
cur_ctx->batch_values.data(), + cur_ctx->status.data()); return 0; }); + tasks.push_back(std::move(fut)); } - for (int i = 0; i < _real_local_shard_num; ++i) { - tasks[i].wait(); + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); } - if (FLAGS_pserver_print_missed_key_num_every_push) { - LOG(WARNING) << "total pull keys:" << num - << " missed_keys:" << missed_keys.load(); + for (size_t x = 0; x < 2; x++) { + cur_ctx = context.switch_item(); + for (size_t idx = 0; idx < cur_ctx->status.size(); idx++) { + uint64_t cur_key = + *((uint64_t*)const_cast(cur_ctx->batch_keys[idx].data())); + if (cur_ctx->status[idx].IsNotFound()) { + auto& feature_value = local_shard[cur_key]; + int init_size = value_size - mf_value_size; + feature_value.resize(init_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + init_size * sizeof(float)); + ret = &feature_value; + } else { + int data_size = cur_ctx->batch_values[idx].size() / sizeof(float); + // from rocksdb to mem + auto& feature_value = local_shard[cur_key]; + feature_value.resize(data_size); + memcpy( + const_cast(feature_value.data()), + paddle::string::str_to_float(cur_ctx->batch_values[idx].data()), + data_size * sizeof(float)); + _db->del_data(shard_id, (char*)&cur_key, sizeof(uint64_t)); + ret = &feature_value; + } + _value_accesor->UpdatePassId(ret->data(), pass_id); + int pull_data_idx = cur_ctx->batch_index[idx]; + pull_values[pull_data_idx] = (char*)ret; + } + cur_ctx->reset(); } } return 0; @@ -527,6 +581,7 @@ int64_t SSDSparseTable::LocalSize() { int32_t SSDSparseTable::Save(const std::string& path, const std::string& param) { + std::lock_guard guard(_table_mutex); if (_real_local_shard_num == 0) { _local_show_threshold = -1; return 0; @@ -537,15 +592,16 @@ int32_t SSDSparseTable::Save(const std::string& path, // } // LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "table cache rate is: " << _config.sparse_table_cache_rate(); - LOG(INFO) << "enable_sparse_table_cache: " - << _config.enable_sparse_table_cache(); - LOG(INFO) << "LocalSize: " << LocalSize(); + VLOG(0) << "table cache rate is: " << _config.sparse_table_cache_rate(); + VLOG(0) << "enable_sparse_table_cache: " + << _config.enable_sparse_table_cache(); + VLOG(0) << "LocalSize: " << LocalSize(); if (_config.enable_sparse_table_cache()) { - LOG(INFO) << "Enable sparse table cache, top n:" << _cache_tk_size; + VLOG(0) << "Enable sparse table cache, top n:" << _cache_tk_size; } _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); TopkCalculator tk(_real_local_shard_num, _cache_tk_size); + VLOG(0) << "TopkCalculator top n:" << _cache_tk_size; size_t file_start_idx = _avg_local_shard_num * _shard_idx; std::string table_path = TableDir(path); _afs_client.remove(paddle::string::format_string( @@ -560,138 +616,141 @@ int32_t SSDSparseTable::Save(const std::string& path, std::atomic feasign_size_all{0}; // feasign_size = 0; - omp_set_num_threads(thread_num); -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector< + paddle::framework::Channel>>> + fs_channel; + for (int i = 0; i < _real_local_shard_num; i++) { + fs_channel.push_back( + paddle::framework::MakeChannel>>( + 10240)); + } + std::vector threads; + threads.resize(_real_local_shard_num); + + auto save_func = [this, + &save_param, + &table_path, + &file_start_idx, + &fs_channel](int file_num) { + int err_no = 0; FsChannelConfig channel_config; if 
(_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string("%s/part-%03d-%05d.gz", table_path.c_str(), _shard_idx, - file_start_idx + i); + file_start_idx + file_num); } else { - channel_config.path = paddle::string::format_string("%s/part-%03d-%05d", - table_path.c_str(), - _shard_idx, - file_start_idx + i); + channel_config.path = + paddle::string::format_string("%s/part-%03d-%05d", + table_path.c_str(), + _shard_idx, + file_start_idx + file_num); } channel_config.converter = _value_accesor->Converter(save_param).converter; channel_config.deconverter = _value_accesor->Converter(save_param).deconverter; - int err_no = 0; - int retry_num = 0; - bool is_write_failed = false; + auto write_channel = + _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + paddle::framework::ChannelReader>> + reader(fs_channel[file_num].get()); + std::pair> out_str; + while (reader >> out_str) { + std::string format_value = _value_accesor->ParseToString( + out_str.second.data(), out_str.second.size()); + if (0 != write_channel->write_line(paddle::string::format_string( + "%lu %s", out_str.first, format_value.c_str()))) { + LOG(FATAL) << "SSDSparseTable save failed, retry it! path:" + << channel_config.path; + } + } + write_channel->close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(save_func, i); + } + + std::vector< + paddle::framework::ChannelWriter>>> + writers(_real_local_shard_num); + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { int feasign_size = 0; auto& shard = _local_shards[i]; - do { - err_no = 0; - feasign_size = 0; - is_write_failed = false; - auto write_channel = - _afs_client.open_w(channel_config, 1024 * 1024 * 40, &err_no); + auto& writer = writers[i]; + writer.Reset(fs_channel[i].get()); + { for (auto it = shard.begin(); it != shard.end(); ++it) { if (_config.enable_sparse_table_cache() && - (save_param == 1 || save_param == 2) && - _value_accesor->Save(it.value().data(), 4)) { - // tk.push(i, it.value().data()[2]); + (save_param == 1 || save_param == 2)) { + // get_field get right decayed show tk.push(i, _value_accesor->GetField(it.value().data(), "show")); } if (_value_accesor->Save(it.value().data(), save_param)) { - std::string format_value = _value_accesor->ParseToString( - it.value().data(), it.value().size()); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", it.key(), format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } + std::vector feature_value; + feature_value.resize(it.value().size()); + memcpy(const_cast(feature_value.data()), + it.value().data(), + it.value().size() * sizeof(float)); + writer << std::make_pair(it.key(), std::move(feature_value)); ++feasign_size; } } + } - if (err_no == -1 && !is_write_failed) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - continue; - } - - // delta and cache and revert is all in mem, base in rocksdb - if (save_param != 1) { - auto* it = _db->get_iterator(i); - for (it->SeekToFirst(); it->Valid(); it->Next()) { - bool need_save = _value_accesor->Save( - paddle::string::str_to_float(it->value().data()), save_param); - _value_accesor->UpdateStatAfterSave( - paddle::string::str_to_float(it->value().data()), save_param); - if (need_save) { - std::string format_value = _value_accesor->ParseToString( - paddle::string::str_to_float(it->value().data()), - it->value().size() / sizeof(float)); - if (0 != write_channel->write_line(paddle::string::format_string( - "%lu %s", - *((uint64_t*)const_cast(it->key().data())), - format_value.c_str()))) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed, retry it! path:" - << channel_config.path << ", retry_num=" << retry_num; - break; - } - if (save_param == 3) { - _db->put(i, - it->key().data(), - it->key().size(), - it->value().data(), - it->value().size()); - } - ++feasign_size; - } + if (save_param != 1) { + auto* it = _db->get_iterator(i); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + bool need_save = _value_accesor->Save( + paddle::string::str_to_float(it->value().data()), save_param); + _value_accesor->UpdateStatAfterSave( + paddle::string::str_to_float(it->value().data()), save_param); + if (need_save) { + std::vector feature_value; + feature_value.resize(it->value().size() / sizeof(float)); + memcpy(const_cast(feature_value.data()), + paddle::string::str_to_float(it->value().data()), + it->value().size()); + writer << std::make_pair( + *((uint64_t*)const_cast(it->key().data())), + std::move(feature_value)); + ++feasign_size; } - delete it; } + delete it; + } - write_channel->close(); - if (err_no == -1) { - ++retry_num; - is_write_failed = true; - LOG(ERROR) << "SSDSparseTable save failed after write, retry it! 
" - << "path:" << channel_config.path - << " , retry_num=" << retry_num; - } - if (is_write_failed) { - _afs_client.remove(channel_config.path); - } - } while (is_write_failed); + writer.Flush(); + fs_channel[i]->Close(); feasign_size_all += feasign_size; for (auto it = shard.begin(); it != shard.end(); ++it) { _value_accesor->UpdateStatAfterSave(it.value().data(), save_param); } } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < fs_channel.size(); i++) { + fs_channel[i].reset(); + } + fs_channel.clear(); + if (save_param == 3) { - UpdateTable(); + // UpdateTable(); _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); - LOG(INFO) << "SSDSparseTable update success."; - } - LOG(INFO) << "SSDSparseTable save success, path:" - << paddle::string::format_string("%s/%03d/part-%03d-", - path.c_str(), - _config.table_id(), - _shard_idx) - << " from " << file_start_idx << " to " - << file_start_idx + _real_local_shard_num - 1; - // return feasign_size_all; + VLOG(0) << "SSDSparseTable update success."; + } + VLOG(0) << "SSDSparseTable save success, feasign size:" << feasign_size_all + << ", path:" + << paddle::string::format_string("%s/%03d/part-%03d-", + path.c_str(), + _config.table_id(), + _shard_idx) + << " from " << file_start_idx << " to " + << file_start_idx + _real_local_shard_num - 1; _local_show_threshold = tk.top(); - LOG(INFO) << "local cache threshold: " << _local_show_threshold; - // int32 may overflow need to change return value + VLOG(0) << "local cache threshold: " << _local_show_threshold; return 0; } @@ -862,7 +921,167 @@ int32_t SSDSparseTable::SaveCache( int32_t SSDSparseTable::Load(const std::string& path, const std::string& param) { - return MemorySparseTable::Load(path, param); + VLOG(0) << "LOAD FLAGS_rocksdb_path:" << FLAGS_rocksdb_path; + std::string table_path = TableDir(path); + auto file_list = _afs_client.list(table_path); + + // std::sort(file_list.begin(), file_list.end()); + for (auto file : file_list) { + VLOG(1) << "SSDSparseTable::Load() file list: " << file; + } + + int load_param = atoi(param.c_str()); + size_t expect_shard_num = _sparse_table_shard_num; + if (file_list.size() != expect_shard_num) { + LOG(WARNING) << "SSDSparseTable file_size:" << file_list.size() + << " not equal to expect_shard_num:" << expect_shard_num; + return -1; + } + if (file_list.size() == 0) { + LOG(WARNING) << "SSDSparseTable load file is empty, path:" << path; + return -1; + } + + size_t file_start_idx = _shard_idx * _avg_local_shard_num; + + if (file_start_idx >= file_list.size()) { + return 0; + } + + size_t feature_value_size = + _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + +#ifdef PADDLE_WITH_HETERPS + int thread_num = _real_local_shard_num; +#else + int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; +#endif + + for (int i = 0; i < _real_local_shard_num; i++) { + _fs_channel.push_back(paddle::framework::MakeChannel(30000)); + } + + std::vector threads; + threads.resize(thread_num); + auto load_func = [this, &file_start_idx, &file_list, &load_param]( + int file_num) { + int err_no = 0; + FsChannelConfig channel_config; + channel_config.path = file_list[file_num + file_start_idx]; + VLOG(1) << "SSDSparseTable::load begin load " << channel_config.path + << " into local shard " << file_num; + channel_config.converter = _value_accesor->Converter(load_param).converter; + channel_config.deconverter = + _value_accesor->Converter(load_param).deconverter; + + std::string line_data; + auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); + paddle::framework::ChannelWriter writer( + _fs_channel[file_num].get()); + while (read_channel->read_line(line_data) == 0 && line_data.size() > 1) { + writer << line_data; + } + writer.Flush(); + read_channel->close(); + _fs_channel[file_num]->Close(); + }; + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(load_func, i); + } + + omp_set_num_threads(thread_num); +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < _real_local_shard_num; ++i) { + std::vector> ssd_keys; + std::vector> ssd_values; + std::vector tmp_key; + ssd_keys.reserve(FLAGS_pserver_load_batch_size); + ssd_values.reserve(FLAGS_pserver_load_batch_size); + tmp_key.reserve(FLAGS_pserver_load_batch_size); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + std::string line_data; + char* end = NULL; + int local_shard_id = i % _avg_local_shard_num; + auto& shard = _local_shards[local_shard_id]; + float data_buffer[FLAGS_pserver_load_batch_size * feature_value_size]; + float* data_buffer_ptr = data_buffer; + uint64_t mem_count = 0; + uint64_t ssd_count = 0; + uint64_t mem_mf_count = 0; + uint64_t ssd_mf_count = 0; + + paddle::framework::ChannelReader reader(_fs_channel[i].get()); + + while (reader >> line_data) { + uint64_t key = std::strtoul(line_data.data(), &end, 10); + if (FLAGS_pserver_open_strict_check) { + if (key % _sparse_table_shard_num != (i + file_start_idx)) { + LOG(WARNING) << "SSDSparseTable key:" << key << " not match shard," + << " file_idx:" << i + << " shard num:" << _sparse_table_shard_num; + continue; + } + } + size_t value_size = + _value_accesor->ParseFromString(++end, data_buffer_ptr); + // ssd or mem + if (_value_accesor->SaveSSD(data_buffer_ptr)) { + tmp_key.emplace_back(key); + ssd_keys.emplace_back( + std::make_pair((char*)&tmp_key.back(), sizeof(uint64_t))); + ssd_values.emplace_back( + std::make_pair((char*)data_buffer_ptr, value_size * sizeof(float))); + data_buffer_ptr += feature_value_size; + if (static_cast(ssd_keys.size()) == + FLAGS_pserver_load_batch_size) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + ssd_keys.clear(); + ssd_values.clear(); + tmp_key.clear(); + data_buffer_ptr = data_buffer; + } + ssd_count++; + if (value_size > feature_value_size - mf_value_size) { + ssd_mf_count++; + } + } else { + auto& value = shard[key]; + value.resize(value_size); + _value_accesor->ParseFromString(end, value.data()); + mem_count++; + if (value_size > feature_value_size - mf_value_size) { + mem_mf_count++; + } + } + } + // last batch + if (ssd_keys.size() > 0) { + _db->put_batch(local_shard_id, ssd_keys, ssd_values, ssd_keys.size()); + } + + _db->flush(local_shard_id); + VLOG(0) << "Table>> load done. 
ALL[" << mem_count + ssd_count << "] MEM[" + << mem_count << "] MEM_MF[" << mem_mf_count << "] SSD[" << ssd_count + << "] SSD_MF[" << ssd_mf_count << "]."; + } + for (int i = 0; i < threads.size(); i++) { + threads[i].join(); + } + for (int i = 0; i < _fs_channel.size(); i++) { + _fs_channel[i].reset(); + } + _fs_channel.clear(); + LOG(INFO) << "load num:" << LocalSize(); + LOG(INFO) << "SSDSparseTable load success, path from " + << file_list[file_start_idx] << " to " + << file_list[file_start_idx + _real_local_shard_num - 1]; + + _cache_tk_size = LocalSize() * _config.sparse_table_cache_rate(); + return 0; } //加载path目录下数据[start_idx, end_idx) @@ -882,7 +1101,11 @@ int32_t SSDSparseTable::Load(size_t start_idx, end_idx = static_cast(end_idx) < _sparse_table_shard_num ? end_idx : _sparse_table_shard_num; +#ifdef PADDLE_WITH_HETERPS + int thread_num = end_idx - start_idx; +#else int thread_num = (end_idx - start_idx) < 20 ? (end_idx - start_idx) : 20; +#endif omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) for (size_t i = start_idx; i < end_idx; ++i) { @@ -999,5 +1222,163 @@ int32_t SSDSparseTable::Load(size_t start_idx, return 0; } +std::pair SSDSparseTable::PrintTableStat() { + int64_t feasign_size = LocalSize(); + return {feasign_size, -1}; +} + +int32_t SSDSparseTable::CacheTable(uint16_t pass_id) { + std::lock_guard guard(_table_mutex); + VLOG(0) << "cache_table"; + std::atomic count{0}; + auto thread_num = _real_local_shard_num; + std::vector> tasks; + + double show_threshold = 10000000; + + //保证cache数据不被淘汰掉 + if (_config.enable_sparse_table_cache()) { + if (_local_show_threshold < show_threshold) { + show_threshold = _local_show_threshold; + } + } + + if (show_threshold < 500) { + show_threshold = 500; + } + VLOG(0) << " show_threshold:" << show_threshold + << " ; local_show_threshold:" << _local_show_threshold; + VLOG(0) << "Table>> origin mem feasign size:" << LocalSize(); + static int cache_table_count = 0; + ++cache_table_count; + for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + // from mem to ssd + auto fut = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [shard_id, this, &count, show_threshold, pass_id]() -> int { + rocksdb::Options options; + options.comparator = _db->get_comparator(); + rocksdb::BlockBasedTableOptions bbto; + bbto.format_version = 5; + bbto.use_delta_encoding = false; + bbto.block_size = 4 * 1024; + bbto.block_restart_interval = 6; + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(15, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + options.OptimizeLevelStyleCompaction(); + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.write_buffer_size = 64 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 4; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + + auto& shard = _local_shards[shard_id]; + if (1) { + using DataType = shard_type::map_type::iterator; + std::vector datas; + datas.reserve(shard.size() * 0.8); + size_t idx = 0; 
+ for (auto it = shard.begin(); it != shard.end(); ++it) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + datas.emplace_back(it.it); + } + } + count.fetch_add(datas.size(), std::memory_order_relaxed); + VLOG(0) << "datas size: " << datas.size(); + { + // sst文件写入必须有序 + uint64_t show_begin = butil::gettimeofday_ms(); + std::sort(datas.begin(), + datas.end(), + [](const DataType& a, const DataType& b) { + return a->first < b->first; + }); + VLOG(0) << "sort shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin + << " ms, num: " << datas.size(); + } + + //必须做空判断,否则sst_writer.Finish会core掉 + if (datas.size() != 0) { + rocksdb::SstFileWriter sst_writer(rocksdb::EnvOptions(), options); + std::string filename = + paddle::string::format_string("%s_%d/cache-%05d.sst", + FLAGS_rocksdb_path.c_str(), + shard_id, + cache_table_count); + rocksdb::Status status = sst_writer.Open(filename); + if (!status.ok()) { + VLOG(0) << "sst writer open " << filename << "failed" + << ", " << status.getState(); + abort(); + } + VLOG(0) << "sst writer open " << filename; + + uint64_t show_begin = butil::gettimeofday_ms(); + for (auto& data : datas) { + uint64_t tmp_key = data->first; + FixedFeatureValue& tmp_value = + *((FixedFeatureValue*)(void*)(data->second)); + status = sst_writer.Put( + rocksdb::Slice((char*)(&(tmp_key)), sizeof(uint64_t)), + rocksdb::Slice((char*)(tmp_value.data()), + tmp_value.size() * sizeof(float))); + if (!status.ok()) { + VLOG(0) << "fatal in Put file: " << filename << ", " + << status.getState(); + abort(); + } + } + status = sst_writer.Finish(); + if (!status.ok()) { + VLOG(0) << "fatal in finish file: " << filename << ", " + << status.getState(); + abort(); + } + VLOG(0) << "write sst_file shard " << shard_id << ": " + << butil::gettimeofday_ms() - show_begin << " ms"; + int ret = _db->ingest_externel_file(shard_id, {filename}); + if (ret) { + VLOG(0) << "ingest file failed" + << ", " << status.getState(); + abort(); + } + } + + for (auto it = shard.begin(); it != shard.end();) { + if (!_value_accesor->SaveMemCache( + it.value().data(), 0, show_threshold, pass_id)) { + it = shard.erase(it); + } else { + ++it; + } + } + } + return 0; + }); + tasks.push_back(std::move(fut)); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + tasks.clear(); + + VLOG(0) << "Table>> cache ssd count: " << count.load(); + VLOG(0) << "Table>> after update, mem feasign size:" << LocalSize(); + return 0; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 55a05bbab5ec24..8f281b5a4bffb0 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -38,7 +38,11 @@ class SSDSparseTable : public MemorySparseTable { int32_t Push(TableContext& context) override; int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); - int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(int shard_id, + char** pull_values, + const uint64_t* keys, + size_t num, + uint16_t pass_id); int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); @@ -77,10 +81,16 @@ class SSDSparseTable : public MemorySparseTable { const std::string& param); int64_t LocalSize(); + std::pair PrintTableStat() override; + + int32_t 
CacheTable(uint16_t pass_id) override; + private: RocksDBHandler* _db; int64_t _cache_tk_size; double _local_show_threshold{0.0}; + std::vector> _fs_channel; + std::mutex _table_mutex; }; } // namespace distributed diff --git a/paddle/fluid/distributed/ps/table/table.h b/paddle/fluid/distributed/ps/table/table.h index aee707712f6629..f07a3f2132217e 100644 --- a/paddle/fluid/distributed/ps/table/table.h +++ b/paddle/fluid/distributed/ps/table/table.h @@ -62,6 +62,8 @@ struct TableContext { size_t num; bool use_ptr = false; uint32_t trainer_id; // for GEO and global step + int shard_id; // for gpups + uint16_t pass_id; // for gpups ssd }; class Table { @@ -147,6 +149,7 @@ class Table { virtual void *GetShard(size_t shard_idx) = 0; virtual std::pair PrintTableStat() { return {0, 0}; } + virtual int32_t CacheTable(uint16_t pass_id) { return 0; } // for patch model virtual void Revert() {} diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index 5df74883f9247f..db0dcf0605dc70 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -748,6 +748,17 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { } } +void FleetWrapper::SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold) { + auto ret = worker_ptr_->SaveCacheTable(table_id, pass_id, threshold); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "save cache table stat failed"; + } +} + void FleetWrapper::ShrinkSparseTable(int table_id, int threshold) { auto ret = worker_ptr_->Shrink(table_id, std::to_string(threshold)); ret.wait(); diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h old mode 100755 new mode 100644 index 28347b3502707c..5065fb380a3464 --- a/paddle/fluid/distributed/ps/wrapper/fleet.h +++ b/paddle/fluid/distributed/ps/wrapper/fleet.h @@ -242,6 +242,9 @@ class FleetWrapper { void BarrierWithTable(uint32_t barrier_type); void PrintTableStat(const uint64_t table_id); + void SaveCacheTable(const uint64_t table_id, + uint16_t pass_id, + size_t threshold); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 47dc8abf402b9c..fe6c51b87228aa 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -353,7 +353,7 @@ class ChannelReader { } if (cursor_ >= buffer_.size()) { cursor_ = 0; - if (channel_->read(buffer_) == 0) { + if (channel_->Read(buffer_) == 0) { failed_ = true; return *this; } diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index c88c91f166112d..0181ea9eb062b4 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2118,6 +2118,14 @@ void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { } } +void SlotRecordInMemoryDataFeed::InitGraphResource() { + gpu_graph_data_generator_.AllocResource(thread_id_, feed_vec_); +} + +void SlotRecordInMemoryDataFeed::InitGraphTrainResource() { + gpu_graph_data_generator_.AllocTrainResource(thread_id_); +} + void SlotRecordInMemoryDataFeed::LoadIntoMemory() { VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; if (!so_parser_name_.empty()) { @@ -2654,7 +2662,7 @@ bool SlotRecordInMemoryDataFeed::Start() { pack_ = 
BatchGpuPackMgr().get(this->GetPlace(), used_slots_info_); #endif #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) - gpu_graph_data_generator_.AllocResource(this->place_, feed_vec_); + gpu_graph_data_generator_.SetFeedVec(feed_vec_); #endif return true; } @@ -2696,6 +2704,12 @@ int SlotRecordInMemoryDataFeed::Next() { #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) +void SlotRecordInMemoryDataFeed::DoWalk() { + gpu_graph_data_generator_.DoWalk(); +} +#endif + #if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS) void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { int offset_cols_size = (ins_num + 1); diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 94e478f8c58af9..a90e853e65deaf 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -19,20 +19,19 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include -#include #include #include #include #include "cub/cub.cuh" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" - -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/unique_kernel.h" -#include "paddle/phi/kernels/graph_reindex_kernel.h" -#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h" +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" DECLARE_bool(enable_opt_get_features); +DECLARE_int32(gpugraph_storage_mode); +DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { @@ -41,6 +40,11 @@ namespace framework { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) +#define DEBUG_STATE(state) \ + VLOG(2) << "left: " << state->left << " right: " << state->right \ + << " central_word: " << state->central_word \ + << " step: " << state->step << " cursor: " << state->cursor \ + << " len: " << state->len << " row_num: " << state->row_num; \ // CUDA: use 512 threads per block const int CUDA_NUM_THREADS = 512; // CUDA: number of blocks for threads. 
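A minimal usage sketch for the launch helpers above: the grid-stride CUDA_KERNEL_LOOP macro together with the conventional block-count helper that pairs with CUDA_NUM_THREADS. The GET_BLOCKS definition and the kernel below are assumptions shown only to make the launch pattern concrete; the patch's own kernels (GraphFillCVMKernel, GraphFillIdKernel, and so on) are launched the same way on an explicit stream.

// Assumed standard form of the block-count helper used with CUDA_NUM_THREADS.
inline int GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Illustrative kernel: each thread strides over the range via CUDA_KERNEL_LOOP.
__global__ void FillOnesKernel(int64_t* out, int n) {
  CUDA_KERNEL_LOOP(idx, n) { out[idx] = 1; }
}

// Launch on a given stream:
//   FillOnesKernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(d_out, n);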
@@ -85,26 +89,6 @@ __global__ void FillSlotValueOffsetKernel(const int ins_num, } } -__global__ void fill_actual_neighbors(int64_t* vals, - int64_t* actual_vals, - int64_t* actual_vals_dst, - int* actual_sample_size, - int* cumsum_actual_sample_size, - int sample_size, - int len, - int mod) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - int offset1 = cumsum_actual_sample_size[i]; - int offset2 = sample_size * i; - int dst_id = i % mod; - for (int j = 0; j < actual_sample_size[i]; j++) { - actual_vals[offset1 + j] = vals[offset2 + j]; - actual_vals_dst[offset1 + j] = dst_id; - } - } -} - void SlotRecordInMemoryDataFeed::FillSlotValueOffset( const int ins_num, const int used_slot_num, @@ -233,13 +217,13 @@ __global__ void CopyDuplicateKeys(int64_t *dist_tensor, int GraphDataGenerator::AcquireInstance(BufState *state) { // if (state->GetNextStep()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextCentrolWord()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } else if (state->GetNextBatch()) { - state->Debug(); + DEBUG_STATE(state); return state->len; } return 0; @@ -371,64 +355,157 @@ __global__ void GraphFillSlotLodKernel(int64_t *id_tensor, int len) { CUDA_KERNEL_LOOP(idx, len) { id_tensor[idx] = idx; } } -int GraphDataGenerator::FillInsBuf() { - if (ins_buf_pair_len_ >= batch_size_) { - return batch_size_; +int GraphDataGenerator::FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor) { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + show_tensor_ptr_ = + feed_vec_[1]->mutable_data({total_instance}, this->place_); + clk_tensor_ptr_ = + feed_vec_[2]->mutable_data({total_instance}, this->place_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + if (gpu_graph_training) { + uint64_t *ins_cursor, *ins_buf; + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + cudaMemcpyAsync(id_tensor_ptr_, + ins_cursor, + sizeof(uint64_t) * total_instance, + cudaMemcpyDeviceToDevice, + train_stream_); + } else { + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[cursor]->ptr()); + d_type_keys += infer_node_start_; + infer_node_start_ += total_instance / 2; + CopyDuplicateKeys<<>>( + id_tensor_ptr_, d_type_keys, total_instance / 2); } - int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); + GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); + GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); + return 0; +} - if (total_instance == 0) { - int res = FillWalkBuf(d_walk_); - if (!res) { - // graph iterate complete - return -1; - } else { - total_instance = buf_state_.len; - VLOG(2) << "total_ins: " << total_instance; - buf_state_.Debug(); - // if (total_instance == 0) { - // return -1; - //} +int GraphDataGenerator::FillGraphSlotFeature(int total_instance, + bool gpu_graph_training) { + int64_t *slot_tensor_ptr_[slot_num_]; + int64_t *slot_lod_tensor_ptr_[slot_num_]; + for (int i = 0; i < slot_num_; ++i) { + slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( + {total_instance * h_slot_feature_num_map_[i], 1}, this->place_); + slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( + {total_instance + 1}, this->place_); + } + uint64_t *ins_cursor, *ins_buf; + if (gpu_graph_training) { + ins_buf = reinterpret_cast(d_ins_buf_->ptr()); + ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; + 
} else { + id_tensor_ptr_ = + feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); + ins_cursor = (uint64_t *)id_tensor_ptr_; + } + + cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), + slot_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), + slot_lod_tensor_ptr_, + sizeof(uint64_t *) * slot_num_, + cudaMemcpyHostToDevice, + train_stream_); + uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); + FillFeatureBuf(ins_cursor, feature_buf, total_instance); + GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), + feature_buf, + total_instance * fea_num_per_node_, + total_instance, + slot_num_, + (int*)d_slot_feature_num_map_->ptr(), + fea_num_per_node_, + (int*)d_actual_slot_id_map_->ptr(), + (int*)d_fea_offset_map_->ptr()); + GraphFillSlotLodKernelOpt<<>>( + (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), + (total_instance + 1) * slot_num_, + total_instance + 1, + (int*)d_slot_feature_num_map_->ptr()); + if (debug_mode_) { + uint64_t h_walk[total_instance]; + cudaMemcpy(h_walk, + ins_cursor, + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + uint64_t h_feature[total_instance * slot_num_ * fea_num_per_node_]; + cudaMemcpy(h_feature, + feature_buf, + total_instance * fea_num_per_node_ * slot_num_ * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + for (int i = 0; i < total_instance; ++i) { + std::stringstream ss; + for (int j = 0; j < fea_num_per_node_; ++j) { + ss << h_feature[i * fea_num_per_node_ + j] << " "; + } + VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i + << "] = " << (uint64_t)h_walk[i] << " feature[" << i * fea_num_per_node_ + << ".." << (i + 1) * fea_num_per_node_ << "] = " << ss.str(); } - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - FillFeatureBuf(d_walk_, d_feature_); - if (debug_mode_) { - int len = buf_size_ > 5000 ? 5000 : buf_size_; - uint64_t h_walk[len]; - cudaMemcpy(h_walk, - d_walk_->ptr(), - len * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - uint64_t h_feature[len * slot_num_]; - cudaMemcpy(h_feature, - d_feature_->ptr(), - len * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < len; ++i) { - std::stringstream ss; - for (int j = 0; j < slot_num_; ++j) { - ss << h_feature[i * slot_num_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * slot_num_ << ".." << (i + 1) * slot_num_ - << "] = " << ss.str(); - } + uint64_t h_slot_tensor[fea_num_per_node_][total_instance]; + uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; + for (int i = 0; i < slot_num_; ++i) { + cudaMemcpy(h_slot_tensor[i], + slot_tensor_ptr_[i], + total_instance * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + int len = total_instance > 5000 ? 5000 : total_instance; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j + << "] = " << h_slot_tensor[i][j]; + } + + cudaMemcpy(h_slot_lod_tensor[i], + slot_lod_tensor_ptr_[i], + (total_instance + 1) * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + len = total_instance + 1 > 5000 ? 
5000 : total_instance + 1; + for (int j = 0; j < len; ++j) { + VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j + << "] = " << h_slot_lod_tensor[i][j]; } } } + return 0; +} +int GraphDataGenerator::MakeInsPair() { uint64_t *walk = reinterpret_cast(d_walk_->ptr()); uint64_t *ins_buf = reinterpret_cast(d_ins_buf_->ptr()); int *random_row = reinterpret_cast(d_random_row_->ptr()); int *d_pair_num = reinterpret_cast(d_pair_num_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); + cudaMemsetAsync(d_pair_num, 0, sizeof(int), train_stream_); int len = buf_state_.len; - GraphFillIdKernel<<>>( + // make pair + GraphFillIdKernel<<>>( ins_buf + ins_buf_pair_len_ * 2, d_pair_num, walk, @@ -438,29 +515,12 @@ int GraphDataGenerator::FillInsBuf() { len, walk_len_); int h_pair_num; - cudaMemcpyAsync( - &h_pair_num, d_pair_num, sizeof(int), cudaMemcpyDeviceToHost, stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - uint64_t *feature = reinterpret_cast(d_feature_->ptr()); - cudaMemsetAsync(d_pair_num, 0, sizeof(int), stream_); - int len = buf_state_.len; - VLOG(2) << "feature_buf start[" << ins_buf_pair_len_ * 2 * slot_num_ - << "] len[" << len << "]"; - GraphFillFeatureKernel<<>>( - feature_buf + ins_buf_pair_len_ * 2 * slot_num_, - d_pair_num, - walk, - feature, - random_row + buf_state_.cursor, - buf_state_.central_word, - window_step_[buf_state_.step], - len, - walk_len_, - slot_num_); - } - - cudaStreamSynchronize(stream_); + cudaMemcpyAsync(&h_pair_num, + d_pair_num, + sizeof(int), + cudaMemcpyDeviceToHost, + train_stream_); + cudaStreamSynchronize(train_stream_); ins_buf_pair_len_ += h_pair_num; if (debug_mode_) { @@ -474,357 +534,41 @@ int GraphDataGenerator::FillInsBuf() { for (int xx = 0; xx < 2 * ins_buf_pair_len_; xx++) { VLOG(2) << "h_ins_buf[" << xx << "]: " << h_ins_buf[xx]; } - delete[] h_ins_buf; - - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - uint64_t *feature_buf = - reinterpret_cast(d_feature_buf_->ptr()); - uint64_t h_feature_buf[(batch_size_ * 2 * 2) * slot_num_]; - cudaMemcpy(h_feature_buf, - feature_buf, - (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int xx = 0; xx < (batch_size_ * 2 * 2) * slot_num_; xx++) { - VLOG(2) << "h_feature_buf[" << xx << "]: " << h_feature_buf[xx]; - } - } } return ins_buf_pair_len_; } -std::vector> GraphDataGenerator::SampleNeighbors( - int64_t* uniq_nodes, int len, int sample_size, - std::vector& edges_split_num, int64_t* neighbor_len) { - - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - - auto sample_res = gpu_graph_ptr->graph_neighbor_sample_all_edge_type( - gpuid_, edge_to_id_len_, (uint64_t*)(uniq_nodes), sample_size, len, - edge_type_graph_); - - int* all_sample_count_ptr = - reinterpret_cast(sample_res.actual_sample_size_mem->ptr()); - - auto cumsum_actual_sample_size = - memory::Alloc(place_, (len * edge_to_id_len_ + 1) * sizeof(int)); - int* cumsum_actual_sample_size_ptr = - reinterpret_cast(cumsum_actual_sample_size->ptr()); - cudaMemsetAsync(cumsum_actual_sample_size_ptr, - 0, - (len * edge_to_id_len_ + 1) * sizeof(int), - stream_); - - size_t temp_storage_bytes = 0; - CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, - temp_storage_bytes, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); - 
CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), - temp_storage_bytes, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr + 1, - len * edge_to_id_len_, - stream_)); - cudaStreamSynchronize(stream_); - - edges_split_num.resize(edge_to_id_len_); - for (int i = 0; i < edge_to_id_len_; i++) { - cudaMemcpyAsync( - edges_split_num.data() + i, - cumsum_actual_sample_size_ptr + (i + 1) * len, - sizeof(int), - cudaMemcpyDeviceToHost, - stream_); +int GraphDataGenerator::FillInsBuf() { + if (ins_buf_pair_len_ >= batch_size_) { + return batch_size_; } - CUDA_CHECK(cudaStreamSynchronize(stream_)); - int all_sample_size = edges_split_num[edge_to_id_len_ - 1]; - auto final_sample_val = - memory::AllocShared(place_, all_sample_size * sizeof(int64_t)); - auto final_sample_val_dst = - memory::AllocShared(place_, all_sample_size * sizeof(int64_t)); - int64_t* final_sample_val_ptr = - reinterpret_cast(final_sample_val->ptr()); - int64_t* final_sample_val_dst_ptr = - reinterpret_cast(final_sample_val_dst->ptr()); - int64_t* all_sample_val_ptr = - reinterpret_cast(sample_res.val_mem->ptr()); - fill_actual_neighbors<<>>(all_sample_val_ptr, - final_sample_val_ptr, - final_sample_val_dst_ptr, - all_sample_count_ptr, - cumsum_actual_sample_size_ptr, - sample_size, - len * edge_to_id_len_, - len); - *neighbor_len = all_sample_size; - cudaStreamSynchronize(stream_); - - std::vector> sample_results; - sample_results.emplace_back(final_sample_val); - sample_results.emplace_back(final_sample_val_dst); - return sample_results; -} - -std::shared_ptr GraphDataGenerator::GetReindexResult( - int64_t* reindex_src_data, const int64_t* center_nodes, int* final_nodes_len, - int node_len, int64_t neighbor_len) { + int total_instance = AcquireInstance(&buf_state_); - VLOG(2) << gpuid_ << ": Enter GetReindexResult Function"; - const phi::GPUContext& dev_ctx_ = - *(static_cast( - platform::DeviceContextPool::Instance().Get(place_))); - - // Reset reindex table - int64_t* d_reindex_table_key_ptr = - reinterpret_cast(d_reindex_table_key_->ptr()); - int* d_reindex_table_value_ptr = - reinterpret_cast(d_reindex_table_value_->ptr()); - int* d_reindex_table_index_ptr = - reinterpret_cast(d_reindex_table_index_->ptr()); - - VLOG(2) << gpuid_ << ": ResetReindexTable With -1"; - // Fill table with -1. 
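The prefix-sum calls in SampleNeighbors above and in FillOneStep further below both follow CUB's two-phase pattern: query the temp-storage size with a null pointer, allocate that much scratch, then run the scan for real. A standalone sketch of that pattern, using raw cudaMalloc instead of Paddle's memory::Alloc; the buffer names are illustrative.

#include <cub/cub.cuh>
#include <cuda_runtime.h>

// d_counts: n per-row counts on the device; d_prefix: n+1 ints with d_prefix[0] already 0.
void InclusivePrefixSum(const int* d_counts, int* d_prefix, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Pass a null temp buffer first: CUB only reports the required temp storage size.
  cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_counts, d_prefix + 1, n, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_prefix + 1, n, stream);
  cudaStreamSynchronize(stream);  // after this, d_prefix[n] holds the total count
  cudaFree(d_temp);
}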
- cudaMemsetAsync(d_reindex_table_key_ptr, -1, - reindex_table_size_ * sizeof(int64_t), stream_); - cudaMemsetAsync(d_reindex_table_value_ptr, -1, - reindex_table_size_ * sizeof(int), stream_); - cudaMemsetAsync(d_reindex_table_index_ptr, -1, - reindex_table_size_ * sizeof(int), stream_); - - VLOG(2) << gpuid_ << ": Alloc all_nodes"; - auto all_nodes = - memory::AllocShared(place_, (node_len + neighbor_len) * sizeof(int64_t)); - int64_t* all_nodes_data = reinterpret_cast(all_nodes->ptr()); - - VLOG(2) << gpuid_ << ": cudaMemcpy all_nodes_data"; - cudaMemcpy(all_nodes_data, center_nodes, sizeof(int64_t) * node_len, - cudaMemcpyDeviceToDevice); - cudaMemcpy(all_nodes_data + node_len, reindex_src_data, sizeof(int64_t) * neighbor_len, - cudaMemcpyDeviceToDevice); - - cudaStreamSynchronize(stream_); - VLOG(2) << gpuid_ << ": Run phi::FillHashTable"; - auto final_nodes = - phi::FillHashTable(dev_ctx_, all_nodes_data, - node_len + neighbor_len, - reindex_table_size_, - d_reindex_table_key_ptr, - d_reindex_table_value_ptr, - d_reindex_table_index_ptr, - final_nodes_len); - - VLOG(2) << gpuid_ << ": Run phi::ReindexSrcOutput"; - phi::ReindexSrcOutput<<>>(reindex_src_data, neighbor_len, - reindex_table_size_, - d_reindex_table_key_ptr, - d_reindex_table_value_ptr); - return final_nodes; -} + VLOG(2) << "total_ins: " << total_instance; + buf_state_.Debug(); -std::shared_ptr GraphDataGenerator::GenerateSampleGraph( - uint64_t* node_ids, int len, int* final_len, phi::DenseTensor* inverse) { - - const phi::GPUContext& dev_ctx_ = - *(static_cast( - platform::DeviceContextPool::Instance().Get(place_))); - - VLOG(2) << "Get Unique Nodes"; - phi::DenseTensor in_x = phi::Empty(dev_ctx_, {len}); - cudaMemcpy(in_x.data(), node_ids, len * sizeof(uint64_t), - cudaMemcpyDeviceToDevice); - - phi::DenseTensor uniq_nodes, index; - std::vector axis; - phi::UniqueKernel(dev_ctx_, in_x, false, true, - false, axis, phi::DataType::INT32, &uniq_nodes, &index, inverse, &index); - - int64_t* uniq_nodes_data = uniq_nodes.data(); - int uniq_len = uniq_nodes.numel(); - int len_samples = samples_.size(); - - int *num_nodes_tensor_ptr_[len_samples]; - int *next_num_nodes_tensor_ptr_[len_samples]; - int64_t *edges_src_tensor_ptr_[len_samples]; - int64_t *edges_dst_tensor_ptr_[len_samples]; - int *edges_split_tensor_ptr_[len_samples]; - - VLOG(2) << "Sample Neighbors and Reindex"; - std::vector edges_split_num; - std::vector> final_nodes_vec; - std::vector final_nodes_len_vec; - - for (int i = 0; i < len_samples; i++) { - - edges_split_num.clear(); - std::shared_ptr neighbors, reindex_dst; - int64_t neighbors_len = 0; - if (i == 0) { - auto sample_results = - SampleNeighbors(uniq_nodes_data, uniq_len, samples_[i], edges_split_num, - &neighbors_len); - neighbors = sample_results[0]; - reindex_dst = sample_results[1]; - edges_split_num.push_back(uniq_len); - } else { - int64_t* final_nodes_data = - reinterpret_cast(final_nodes_vec[i - 1]->ptr()); - auto sample_results = - SampleNeighbors(final_nodes_data, final_nodes_len_vec[i - 1], - samples_[i], edges_split_num, &neighbors_len); - neighbors = sample_results[0]; - reindex_dst = sample_results[1]; - edges_split_num.push_back(final_nodes_len_vec[i - 1]); - } - - int64_t* reindex_src_data = reinterpret_cast(neighbors->ptr()); - int64_t* reindex_dst_data = reinterpret_cast(reindex_dst->ptr()); - int final_nodes_len = 0; - if (i == 0) { - auto tmp_final_nodes = - GetReindexResult(reindex_src_data, uniq_nodes_data, &final_nodes_len, - uniq_len, neighbors_len); - 
final_nodes_vec.emplace_back(tmp_final_nodes); - final_nodes_len_vec.emplace_back(final_nodes_len); - } else { - int64_t* final_nodes_data = - reinterpret_cast(final_nodes_vec[i - 1]->ptr()); - auto tmp_final_nodes = - GetReindexResult(reindex_src_data, final_nodes_data, &final_nodes_len, - final_nodes_len_vec[i - 1], neighbors_len); - final_nodes_vec.emplace_back(tmp_final_nodes); - final_nodes_len_vec.emplace_back(final_nodes_len); - } - - int offset = 3 + 2 * slot_num_ + 5 * i; - num_nodes_tensor_ptr_[i] = - feed_vec_[offset]->mutable_data({1}, this->place_); - next_num_nodes_tensor_ptr_[i] = - feed_vec_[offset + 1]->mutable_data({1}, this->place_); - edges_src_tensor_ptr_[i] = - feed_vec_[offset + 2]->mutable_data({neighbors_len, 1}, this->place_); - edges_dst_tensor_ptr_[i] = - feed_vec_[offset + 3]->mutable_data({neighbors_len, 1}, this->place_); - edges_split_tensor_ptr_[i] = - feed_vec_[offset + 4]->mutable_data({edge_to_id_len_}, this->place_); - - cudaMemcpyAsync(num_nodes_tensor_ptr_[i], final_nodes_len_vec.data() + i, - sizeof(int), cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(next_num_nodes_tensor_ptr_[i], edges_split_num.data() + edge_to_id_len_, - sizeof(int), cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(edges_split_tensor_ptr_[i], edges_split_num.data(), - sizeof(int) * edge_to_id_len_, cudaMemcpyHostToDevice, stream_); - cudaMemcpyAsync(edges_src_tensor_ptr_[i], reindex_src_data, - sizeof(int64_t) * neighbors_len, cudaMemcpyDeviceToDevice, stream_); - cudaMemcpyAsync(edges_dst_tensor_ptr_[i], reindex_dst_data, - sizeof(int64_t) * neighbors_len, cudaMemcpyDeviceToDevice, stream_); - - cudaStreamSynchronize(stream_); + if (total_instance == 0) { + return -1; } - - *final_len = final_nodes_len_vec[len_samples - 1]; - return final_nodes_vec[len_samples - 1]; + return MakeInsPair(); } int GraphDataGenerator::GenerateBatch() { int total_instance = 0; platform::CUDADeviceGuard guard(gpuid_); int res = 0; - - std::shared_ptr final_sage_nodes; + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); if (!gpu_graph_training_) { - while (cursor_ < h_device_keys_.size()) { - size_t device_key_size = h_device_keys_[cursor_]->size(); - if (infer_node_type_start_[cursor_] >= device_key_size) { - cursor_++; - continue; - } - total_instance = - (infer_node_type_start_[cursor_] + batch_size_ <= device_key_size) - ? 
batch_size_ - : device_key_size - infer_node_type_start_[cursor_]; - uint64_t *d_type_keys = - reinterpret_cast(d_device_keys_[cursor_]->ptr()); - d_type_keys += infer_node_type_start_[cursor_]; - infer_node_type_start_[cursor_] += total_instance; - VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ - << " instance = " << total_instance; - total_instance *= 2; - if (!sage_mode_) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - CopyDuplicateKeys<<>>( - id_tensor_ptr_, d_type_keys, total_instance / 2); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - auto node_buf = memory::AllocShared( - place_, total_instance * sizeof(uint64_t)); - int64_t* node_buf_ptr = reinterpret_cast(node_buf->ptr()); - VLOG(1) << "copy center keys"; - CopyDuplicateKeys<<>>( - node_buf_ptr, d_type_keys, total_instance / 2); - phi::DenseTensor inverse_; - VLOG(1) << "generate sample graph"; - uint64_t* node_buf_ptr_ = reinterpret_cast(node_buf->ptr()); - final_sage_nodes = - GenerateSampleGraph(node_buf_ptr_, total_instance, &uniq_instance_, - &inverse_); - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({uniq_instance_, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({uniq_instance_}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({uniq_instance_}, this->place_); - int index_offset = 3 + slot_num_ * 2 + 5 * samples_.size(); - index_tensor_ptr_ = - feed_vec_[index_offset]->mutable_data({total_instance}, this->place_); - - VLOG(1) << "copy id and index"; - cudaMemcpy(id_tensor_ptr_, final_sage_nodes->ptr(), - sizeof(int64_t) * uniq_instance_, - cudaMemcpyDeviceToDevice); - cudaMemcpy(index_tensor_ptr_, inverse_.data(), sizeof(int) * total_instance, - cudaMemcpyDeviceToDevice); - GraphFillCVMKernel<<>>( - show_tensor_ptr_, uniq_instance_); - GraphFillCVMKernel<<>>( - clk_tensor_ptr_, uniq_instance_); - } - break; - } + total_instance = (infer_node_start_ + batch_size_ <= infer_node_end_) + ? batch_size_ + : infer_node_end_ - infer_node_start_; + VLOG(1) << "in graph_data generator:batch_size = " << batch_size_ + << " instance = " << total_instance; + total_instance *= 2; if (total_instance == 0) { return 0; } + FillIdShowClkTensor(total_instance, gpu_graph_training_, cursor_); } else { while (ins_buf_pair_len_ < batch_size_) { res = FillInsBuf(); @@ -839,192 +583,17 @@ int GraphDataGenerator::GenerateBatch() { total_instance = ins_buf_pair_len_ < batch_size_ ? 
ins_buf_pair_len_ : batch_size_; total_instance *= 2; - } - - uint64_t *ins_cursor, *ins_buf; - phi::DenseTensor inverse; - if (gpu_graph_training_) { VLOG(2) << "total_instance: " << total_instance << ", ins_buf_pair_len = " << ins_buf_pair_len_; - ins_buf = reinterpret_cast(d_ins_buf_->ptr()); - ins_cursor = ins_buf + ins_buf_pair_len_ * 2 - total_instance; - if (!sage_mode_) { - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({total_instance, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({total_instance}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({total_instance}, this->place_); - cudaMemcpyAsync(id_tensor_ptr_, - ins_cursor, - sizeof(uint64_t) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - GraphFillCVMKernel<<>>(show_tensor_ptr_, total_instance); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, total_instance); - } else { - VLOG(2) << gpuid_ << " " << "Ready to enter GenerateSampleGraph"; - final_sage_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_, - &inverse); - VLOG(2) << "Copy Final Results"; - id_tensor_ptr_ = - feed_vec_[0]->mutable_data({uniq_instance_, 1}, this->place_); - show_tensor_ptr_ = - feed_vec_[1]->mutable_data({uniq_instance_}, this->place_); - clk_tensor_ptr_ = - feed_vec_[2]->mutable_data({uniq_instance_}, this->place_); - int index_offset = 3 + slot_num_ * 2 + 5 * samples_.size(); - index_tensor_ptr_ = - feed_vec_[index_offset]->mutable_data({total_instance}, this->place_); - - cudaMemcpyAsync(id_tensor_ptr_, - final_sage_nodes->ptr(), - sizeof(int64_t) * uniq_instance_, - cudaMemcpyDeviceToDevice, - stream_); - cudaMemcpyAsync(index_tensor_ptr_, - inverse.data(), - sizeof(int) * total_instance, - cudaMemcpyDeviceToDevice, - stream_); - GraphFillCVMKernel<<>>(show_tensor_ptr_, uniq_instance_); - GraphFillCVMKernel<<>>(clk_tensor_ptr_, uniq_instance_); - - } - } else { - ins_cursor = (uint64_t *)id_tensor_ptr_; + FillIdShowClkTensor(total_instance, gpu_graph_training_); } - int64_t *slot_tensor_ptr_[slot_num_]; - int64_t *slot_lod_tensor_ptr_[slot_num_]; if (slot_num_ > 0) { - int slot_instance = sage_mode_ == true ? uniq_instance_ : total_instance; - for (int i = 0; i < slot_num_; ++i) { - slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data( - {slot_instance * h_slot_feature_num_map_[i], 1}, this->place_); - slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data( - {slot_instance + 1}, this->place_); - } - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(), - slot_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(), - slot_lod_tensor_ptr_, - sizeof(uint64_t *) * slot_num_, - cudaMemcpyHostToDevice, - stream_); - } - if (sage_mode_) { - size_t temp_storage_bytes = slot_instance * fea_num_per_node_ * sizeof(uint64_t); - // No need to allocate a new d_feature_buf_ if the old one is enough. 
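The branch above reuses a cached device buffer and reallocates only when the requested size outgrows it, avoiding per-batch allocation churn. A generic version of that pattern with raw CUDA allocation; the struct and method names are made up for illustration, and the patch keeps the buffer as a shared allocation and checks ->size() instead.

#include <cuda_runtime.h>
#include <cstddef>

// Grow-only device scratch buffer: keep the allocation across batches and
// only replace it when a larger size is requested.
struct DeviceScratch {
  void* ptr = nullptr;
  size_t capacity = 0;

  void* Require(size_t bytes) {
    if (ptr == nullptr || capacity < bytes) {
      if (ptr != nullptr) cudaFree(ptr);
      cudaMalloc(&ptr, bytes);
      capacity = bytes;
    }
    return ptr;
  }

  ~DeviceScratch() {
    if (ptr != nullptr) cudaFree(ptr);
  }
};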
- if (d_feature_buf_ == NULL || d_feature_buf_->size() < temp_storage_bytes) { - d_feature_buf_ = memory::AllocShared(place_, temp_storage_bytes); - } - } - uint64_t *feature_buf = reinterpret_cast(d_feature_buf_->ptr()); - if (FLAGS_enable_opt_get_features || !gpu_graph_training_) { - if (!sage_mode_) { - FillFeatureBuf(ins_cursor, feature_buf, slot_instance); - } else { - uint64_t* sage_nodes_ptr = reinterpret_cast(final_sage_nodes->ptr()); - FillFeatureBuf(sage_nodes_ptr, feature_buf, slot_instance); - } - if (debug_mode_) { - uint64_t h_walk[slot_instance]; - if (!sage_mode_) { - cudaMemcpy(h_walk, - ins_cursor, - slot_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - } else { - uint64_t* sage_nodes_ptr = reinterpret_cast(final_sage_nodes->ptr()); - cudaMemcpy(h_walk, - sage_nodes_ptr, - slot_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - } - uint64_t h_feature[slot_instance * fea_num_per_node_]; - cudaMemcpy(h_feature, - feature_buf, - slot_instance * fea_num_per_node_ * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - for (int i = 0; i < slot_instance; ++i) { - std::stringstream ss; - for (int j = 0; j < fea_num_per_node_; ++j) { - ss << h_feature[i * fea_num_per_node_ + j] << " "; - } - VLOG(2) << "aft FillFeatureBuf, gpu[" << gpuid_ << "] walk[" << i - << "] = " << (uint64_t)h_walk[i] << " feature[" - << i * fea_num_per_node_ << ".." << (i + 1) * fea_num_per_node_ - << "] = " << ss.str(); - } - } - GraphFillSlotKernel<<>>((uint64_t *)d_slot_tensor_ptr_->ptr(), - feature_buf, - slot_instance * fea_num_per_node_, - slot_instance, - slot_num_, - (int*)d_slot_feature_num_map_->ptr(), - fea_num_per_node_, - (int*)d_actual_slot_id_map_->ptr(), - (int*)d_fea_offset_map_->ptr()); - GraphFillSlotLodKernelOpt<<>>( - (uint64_t *)d_slot_lod_tensor_ptr_->ptr(), - (slot_instance + 1) * slot_num_, - slot_instance + 1, - (int*)d_slot_feature_num_map_->ptr()); - } else { - for (int i = 0; i < slot_num_; ++i) { - int feature_buf_offset = - (ins_buf_pair_len_ * 2 - total_instance) * slot_num_ + i * 2; - for (int j = 0; j < total_instance; j += 2) { - VLOG(2) << "slot_tensor[" << i << "][" << j << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ << "]"; - VLOG(2) << "slot_tensor[" << i << "][" << j + 1 << "] <- feature_buf[" - << feature_buf_offset + j * slot_num_ + 1 << "]"; - cudaMemcpyAsync(slot_tensor_ptr_[i] + j, - &feature_buf[feature_buf_offset + j * slot_num_], - sizeof(uint64_t) * 2, - cudaMemcpyDeviceToDevice, - stream_); - } - GraphFillSlotLodKernel<<>>(slot_lod_tensor_ptr_[i], - total_instance + 1); - } - } + FillGraphSlotFeature(total_instance, gpu_graph_training_); } - offset_.clear(); offset_.push_back(0); - if (!sage_mode_) { - offset_.push_back(total_instance); - } else { - offset_.push_back(uniq_instance_); - } + offset_.push_back(total_instance); LoD lod{offset_}; feed_vec_[0]->set_lod(lod); if (slot_num_ > 0) { @@ -1033,35 +602,9 @@ int GraphDataGenerator::GenerateBatch() { } } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(train_stream_); if (!gpu_graph_training_) return 1; ins_buf_pair_len_ -= total_instance / 2; - if (debug_mode_) { - uint64_t h_slot_tensor[fea_num_per_node_][total_instance]; - uint64_t h_slot_lod_tensor[slot_num_][total_instance + 1]; - for (int i = 0; i < slot_num_; ++i) { - cudaMemcpy(h_slot_tensor[i], - slot_tensor_ptr_[i], - total_instance * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - int len = total_instance > 5000 ? 
5000 : total_instance; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_tensor[" << i << "][" << j - << "] = " << h_slot_tensor[i][j]; - } - - cudaMemcpy(h_slot_lod_tensor[i], - slot_lod_tensor_ptr_[i], - (total_instance + 1) * sizeof(uint64_t), - cudaMemcpyDeviceToHost); - len = total_instance + 1 > 5000 ? 5000 : total_instance + 1; - for (int j = 0; j < len; ++j) { - VLOG(2) << "gpu[" << gpuid_ << "] slot_lod_tensor[" << i << "][" << j - << "] = " << h_slot_lod_tensor[i][j]; - } - } - } - return 1; } @@ -1128,6 +671,66 @@ __global__ void GraphFillFirstStepKernel(int *prefix_sum, } } +__global__ void GetUniqueFeaNum(uint64_t *d_in, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + atomicAdd(&local_num, 1); + } + } + if (i == len - 1) { + atomicAdd(&local_num, 1); + } + + __syncthreads(); + if (threadIdx.x == 0) { + atomicAdd(unique_num, local_num); + } +} + +__global__ void UniqueFeature(uint64_t *d_in, + uint64_t *d_out, + uint64_t *unique_num, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ uint64_t local_key[CUDA_NUM_THREADS]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len - 1) { + if (d_in[i] != d_in[i + 1]) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + } + if (i == len - 1) { + size_t dst = atomicAdd(&local_num, 1); + local_key[dst] = d_in[i]; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(unique_num, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} // Fill sample_res to the stepth column of walk void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, uint64_t *walk, @@ -1151,45 +754,50 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); - auto d_temp_storage = memory::Alloc(place_, temp_storage_bytes); + sample_stream_)); + auto d_temp_storage = memory::Alloc( + place_, + temp_storage_bytes, + phi::Stream(reinterpret_cast(sample_stream_))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, d_actual_sample_size, d_prefix_sum + 1, len, - stream_)); + sample_stream_)); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); if (step == 1) { - GraphFillFirstStepKernel<<>>( - d_prefix_sum, - d_tmp_sampleidx2row, - walk, - d_start_ids, - len, - walk_degree_, - walk_len_, - d_actual_sample_size, - d_neighbors, - d_sample_keys); + GraphFillFirstStepKernel<<>>(d_prefix_sum, + d_tmp_sampleidx2row, + walk, + d_start_ids, + len, + walk_degree_, + walk_len_, + d_actual_sample_size, + d_neighbors, + d_sample_keys); } else { GraphFillSampleKeysKernel<<>>(d_neighbors, - d_sample_keys, - d_prefix_sum, - d_sampleidx2row, - d_tmp_sampleidx2row, - d_actual_sample_size, - cur_degree, - len); - - GraphDoWalkKernel<<>>( + sample_stream_>>>(d_neighbors, + d_sample_keys, + d_prefix_sum, + d_sampleidx2row, + d_tmp_sampleidx2row, + d_actual_sample_size, + cur_degree, + len); + + GraphDoWalkKernel<<>>( d_neighbors, walk, d_prefix_sum, @@ -1206,7 +814,6 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, int *h_prefix_sum = new int[len + 1]; int *h_actual_size = new 
int[len]; int *h_offset2idx = new int[once_max_sample_keynum]; - uint64_t h_sample_keys[once_max_sample_keynum]; cudaMemcpy(h_offset2idx, d_tmp_sampleidx2row, once_max_sample_keynum * sizeof(int), @@ -1225,9 +832,8 @@ void GraphDataGenerator::FillOneStep(uint64_t *d_start_ids, delete[] h_prefix_sum; delete[] h_actual_size; delete[] h_offset2idx; - delete[] h_sample_keys; } - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); cur_sampleidx2row_ = 1 - cur_sampleidx2row_; } @@ -1259,7 +865,105 @@ int GraphDataGenerator::FillFeatureBuf( return ret; } -int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { +// Try to insert keys into the table; returns 0 on success +int GraphDataGenerator::InsertTable( + const unsigned long *d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num) { + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num_ptr = + reinterpret_cast(d_uniq_node_num->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num_ptr, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + // enough nodes have been produced; stop sampling + VLOG(2) << "table capacity: " << train_table_cap_ << ", " << h_uniq_node_num + << " used"; + if (h_uniq_node_num + len >= train_table_cap_) { + return 1; + } + table_->insert(d_keys, len, d_uniq_node_num_ptr, sample_stream_); + CUDA_CHECK(cudaStreamSynchronize(sample_stream_)); + return 0; +} + +void GraphDataGenerator::DoWalk() { + int device_id = place_.GetDeviceId(); + debug_gpu_memory_info(device_id, "DoWalk start"); + if (gpu_graph_training_) { + FillWalkBuf(); + } else { + FillInferBuf(); + } + debug_gpu_memory_info(device_id, "DoWalk end"); +} + +void GraphDataGenerator::clear_gpu_mem() { + d_len_per_row_.reset(); + d_sample_keys_.reset(); + d_prefix_sum_.reset(); + for (size_t i = 0; i < d_sampleidx2rows_.size(); i++) { + d_sampleidx2rows_[i].reset(); + } + delete table_; +} + +int GraphDataGenerator::FillInferBuf() { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &global_infer_node_type_start = + gpu_graph_ptr->global_infer_node_type_start_[gpuid_]; + auto &infer_cursor = gpu_graph_ptr->infer_cursor_[thread_id_]; + total_row_ = 0; + if (infer_cursor < h_device_keys_len_.size()) { + if (global_infer_node_type_start[infer_cursor] >= + h_device_keys_len_[infer_cursor]) { + infer_cursor++; + if (infer_cursor >= h_device_keys_len_.size()) { + return 0; + } + } + size_t device_key_size = h_device_keys_len_[infer_cursor]; + total_row_ = + (global_infer_node_type_start[infer_cursor] + infer_table_cap_ <= + device_key_size) + ?
infer_table_cap_ + : device_key_size - global_infer_node_type_start[infer_cursor]; + + host_vec_.resize(total_row_); + uint64_t *d_type_keys = + reinterpret_cast(d_device_keys_[infer_cursor]->ptr()); + cudaMemcpyAsync(host_vec_.data(), + d_type_keys + global_infer_node_type_start[infer_cursor], + sizeof(uint64_t) * total_row_, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(1) << "cursor: " << infer_cursor + << " start: " << global_infer_node_type_start[infer_cursor] + << " num: " << total_row_; + infer_node_start_ = global_infer_node_type_start[infer_cursor]; + global_infer_node_type_start[infer_cursor] += total_row_; + infer_node_end_ = global_infer_node_type_start[infer_cursor]; + cursor_ = infer_cursor; + } + return 0; +} + +void GraphDataGenerator::ClearSampleState() { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + finish_node_type.clear(); + for (auto iter = node_type_start.begin(); iter != node_type_start.end(); iter++) { + iter->second = 0; + } +} + +int GraphDataGenerator::FillWalkBuf() { platform::CUDADeviceGuard guard(gpuid_); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; //////// @@ -1277,30 +981,42 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { } /////// auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - uint64_t *walk = reinterpret_cast(d_walk->ptr()); + uint64_t *walk = reinterpret_cast(d_walk_->ptr()); int *len_per_row = reinterpret_cast(d_len_per_row_->ptr()); uint64_t *d_sample_keys = reinterpret_cast(d_sample_keys_->ptr()); - cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), stream_); - cudaMemsetAsync( - len_per_row, 0, once_max_sample_keynum * sizeof(int), stream_); + cudaMemsetAsync(walk, 0, buf_size_ * sizeof(uint64_t), sample_stream_); + // cudaMemsetAsync( + // len_per_row, 0, once_max_sample_keynum * sizeof(int), sample_stream_); + int sample_times = 0; int i = 0; - int total_row = 0; - size_t node_type_len = first_node_type_.size(); + total_row_ = 0; + + // fetch the global sampling state + auto &first_node_type = gpu_graph_ptr->first_node_type_; + auto &meta_path = gpu_graph_ptr->meta_path_; + auto &node_type_start = gpu_graph_ptr->node_type_start_[gpuid_]; + auto &finish_node_type = gpu_graph_ptr->finish_node_type_[gpuid_]; + auto &type_to_index = gpu_graph_ptr->get_graph_type_to_index(); + auto &cursor = gpu_graph_ptr->cursor_[thread_id_]; + size_t node_type_len = first_node_type.size(); int remain_size = buf_size_ - walk_degree_ * once_sample_startid_len_ * walk_len_; + int total_samples = 0; while (i <= remain_size) { - int cur_node_idx = cursor_ % node_type_len; - int node_type = first_node_type_[cur_node_idx]; - auto &path = meta_path_[cur_node_idx]; - size_t start = node_type_start_[node_type]; + int cur_node_idx = cursor % node_type_len; + int node_type = first_node_type[cur_node_idx]; + auto &path = meta_path[cur_node_idx]; + size_t start = node_type_start[node_type]; + VLOG(2) << "cur_node_idx = " << cur_node_idx + << " meta_path.size = " << meta_path.size(); // auto node_query_result = gpu_graph_ptr->query_node_list( - // gpuid_, node_type, start, once_sample_startid_len_); + // gpuid_, node_type, start, once_sample_startid_len_); // int tmp_len = node_query_result.actual_sample_size; VLOG(2) << "choose start type: " << node_type; - int type_index = type_to_index_[node_type]; - size_t device_key_size =
h_device_keys_[type_index]->size(); + int type_index = type_to_index[node_type]; + size_t device_key_size = h_device_keys_len_[type_index]; VLOG(2) << "type: " << node_type << " size: " << device_key_size << " start: " << start; uint64_t *d_type_keys = @@ -1308,21 +1024,19 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int tmp_len = start + once_sample_startid_len_ > device_key_size ? device_key_size - start : once_sample_startid_len_; - node_type_start_[node_type] = tmp_len + start; + bool update = true; if (tmp_len == 0) { - finish_node_type_.insert(node_type); - if (finish_node_type_.size() == node_type_start_.size()) { + finish_node_type.insert(node_type); + if (finish_node_type.size() == node_type_start.size()) { + cursor = 0; + epoch_finish_ = true; break; } - cursor_ += 1; + cursor += 1; continue; } - // if (tmp_len == 0) { - // break; - //} - VLOG(2) << "i = " << i << " buf_size_ = " << buf_size_ - << " tmp_len = " << tmp_len << " cursor = " << cursor_ - << " once_max_sample_keynum = " << once_max_sample_keynum; + + VLOG(2) << "gpuid = " << gpuid_ << " path[0] = " << path[0]; uint64_t *cur_walk = walk + i; NeighborSampleQuery q; @@ -1336,6 +1050,30 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { int step = 1; VLOG(2) << "sample edge type: " << path[0] << " step: " << 1; jump_rows_ = sample_res.total_sample_size; + total_samples += sample_res.total_sample_size; + VLOG(2) << "i = " << i << " start = " << start << " tmp_len = " << tmp_len + << " cursor = " << node_type << " cur_node_idx = " << cur_node_idx + << " jump row: " << jump_rows_; + VLOG(2) << "jump_row: " << jump_rows_; + if (jump_rows_ == 0) { + node_type_start[node_type] = tmp_len + start; + cursor += 1; + continue; + } + + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(d_type_keys + start, tmp_len, d_uniq_node_num_) != 0) { + VLOG(2) << "in step 0, insert key stage, table is full"; + update = false; + break; + } + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step 0, insert sample res stage, table is full"; + update = false; + break; + } + } FillOneStep(d_type_keys + start, cur_walk, tmp_len, @@ -1343,7 +1081,6 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { walk_degree_, step, len_per_row); - VLOG(2) << "jump_row: " << jump_rows_; ///////// if (debug_mode_) { cudaMemcpy( @@ -1352,11 +1089,16 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << tmp_len + << " sample_res_len=" << sample_res.total_sample_size; + ///////// step++; size_t path_len = path.size(); for (; step < walk_len_; step++) { if (sample_res.total_sample_size == 0) { + VLOG(2) << "sample finish, step=" << step; break; } auto sample_key_mem = sample_res.actual_val_mem; @@ -1369,9 +1111,17 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { (uint64_t)sample_keys_ptr, 1, sample_res.total_sample_size); - int sample_key_len = sample_res.total_sample_size; + int sample_key_len = sample_res.total_sample_size; sample_res = gpu_graph_ptr->graph_neighbor_sample_v3(q, false, true); - + total_samples += sample_res.total_sample_size; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + if (InsertTable(sample_res.actual_val, sample_res.total_sample_size, d_uniq_node_num_) != + 0) { + VLOG(2) << "in step: " << step << ", table is full"; + update = false; + 
break; + } + } FillOneStep(d_type_keys + start, cur_walk, sample_key_len, @@ -1386,34 +1136,44 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { VLOG(2) << "h_walk[" << xx << "]: " << h_walk[xx]; } } + + VLOG(2) << "sample, step=" << step << " sample_keys=" << sample_key_len + << " sample_res_len=" << sample_res.total_sample_size; + } + // now update the global sampling state for this chunk + if (update == true) { + node_type_start[node_type] = tmp_len + start; + i += jump_rows_ * walk_len_; + total_row_ += jump_rows_; + cursor += 1; + sample_times++; + } else { + VLOG(2) << "table is full, not update stat!"; + break; } - // cursor_ += tmp_len; - i += jump_rows_ * walk_len_; - total_row += jump_rows_; - cursor_ += 1; } - buf_state_.Reset(total_row); + buf_state_.Reset(total_row_); int *d_random_row = reinterpret_cast(d_random_row_->ptr()); thrust::random::default_random_engine engine(shuffle_seed_); - const auto &exec_policy = thrust::cuda::par.on(stream_); + const auto &exec_policy = thrust::cuda::par.on(sample_stream_); thrust::counting_iterator cnt_iter(0); thrust::shuffle_copy(exec_policy, cnt_iter, - cnt_iter + total_row, + cnt_iter + total_row_, thrust::device_pointer_cast(d_random_row), engine); - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(sample_stream_); shuffle_seed_ = engine(); if (debug_mode_) { - int *h_random_row = new int[total_row + 10]; + int *h_random_row = new int[total_row_ + 10]; cudaMemcpy(h_random_row, d_random_row, - total_row * sizeof(int), + total_row_ * sizeof(int), cudaMemcpyDeviceToHost); - for (int xx = 0; xx < total_row; xx++) { + for (int xx = 0; xx < total_row_; xx++) { VLOG(2) << "h_random_row[" << xx << "]: " << h_random_row[xx]; } delete[] h_random_row; @@ -1423,101 +1183,162 @@ int GraphDataGenerator::FillWalkBuf(std::shared_ptr d_walk) { delete[] h_len_per_row; delete[] h_prefix_sum; } - return total_row != 0; + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + // table_->prefetch(cudaCpuDeviceId, sample_stream_); + // thrust::pair *kv = table_->data(); + // size_t size = table_->size(); + // uint64_t unused_key = std::numeric_limits::max(); + // for (size_t i = 0; i < size; i++) { + // if (kv[i].first == unused_key) { + // continue; + // } + // host_vec_.push_back(kv[i].first); + // } + + uint64_t h_uniq_node_num = 0; + uint64_t *d_uniq_node_num = + reinterpret_cast(d_uniq_node_num_->ptr()); + cudaMemcpyAsync(&h_uniq_node_num, + d_uniq_node_num, + sizeof(uint64_t), + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + VLOG(2) << "h_uniq_node_num: " << h_uniq_node_num; + // temporary device memory that holds the de-duplicated node ids + auto d_uniq_node = memory::AllocShared( + place_, + h_uniq_node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + uint64_t *d_uniq_node_ptr = + reinterpret_cast(d_uniq_node->ptr()); + + auto d_node_cursor = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + + uint64_t *d_node_cursor_ptr = + reinterpret_cast(d_node_cursor->ptr()); + cudaMemsetAsync(d_node_cursor_ptr, 0, sizeof(uint64_t), sample_stream_); + // uint64_t unused_key = std::numeric_limits::max(); + table_->get_keys(d_uniq_node_ptr, d_node_cursor_ptr, sample_stream_); + + cudaStreamSynchronize(sample_stream_); + + host_vec_.resize(h_uniq_node_num); + cudaMemcpyAsync(host_vec_.data(), + d_uniq_node_ptr, + sizeof(uint64_t) * h_uniq_node_num, + cudaMemcpyDeviceToHost, + sample_stream_); + cudaStreamSynchronize(sample_stream_); + + VLOG(0) << "sample_times:" << sample_times + << ",
d_walk_size:" << buf_size_ + << ", d_walk_offset:" << i + << ", total_rows:" << total_row_ + << ", total_samples:" << total_samples + << ", h_uniq_node_num:" << h_uniq_node_num; + } + return total_row_ != 0; } -void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, - std::vector feed_vec) { - place_ = place; - gpuid_ = place_.GetDeviceId(); - VLOG(3) << "gpuid " << gpuid_; - stream_ = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); +void GraphDataGenerator::SetFeedVec(std::vector feed_vec) { feed_vec_ = feed_vec; - if (!sage_mode_) { - slot_num_ = (feed_vec_.size() - 3) / 2; - } else { - slot_num_ = (feed_vec_.size() - 4 - samples_.size() * 5) / 2; - } - +} +void GraphDataGenerator::AllocResource(int thread_id, + std::vector feed_vec) { auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - h_slot_feature_num_map_ = gpu_graph_ptr->slot_feature_num_map(); - fea_num_per_node_ = 0; - for (int i = 0; i < slot_num_; ++i) { - fea_num_per_node_ += h_slot_feature_num_map_[i]; + gpuid_ = gpu_graph_ptr->device_id_mapping[thread_id]; + thread_id_ = thread_id; + place_ = platform::CUDAPlace(gpuid_); + debug_gpu_memory_info(gpuid_, "AllocResource start"); + + platform::CUDADeviceGuard guard(gpuid_); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + table_ = new HashTable( + train_table_cap_ / FLAGS_gpugraph_hbm_table_load_factor); } - std::vector h_actual_slot_id_map, h_fea_offset_map; - h_actual_slot_id_map.resize(fea_num_per_node_); - h_fea_offset_map.resize(fea_num_per_node_); - for (int slot_id = 0, fea_idx = 0; slot_id < slot_num_; ++slot_id) { - for (int j = 0; j < h_slot_feature_num_map_[slot_id]; ++j, ++fea_idx) { - h_actual_slot_id_map[fea_idx] = slot_id; - h_fea_offset_map[fea_idx] = j; - } + VLOG(1) << "AllocResource gpuid " << gpuid_ + << " feed_vec.size: " << feed_vec.size() + << " table cap: " << train_table_cap_; + sample_stream_ = gpu_graph_ptr->get_local_stream(gpuid_); + train_stream_ = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + // feed_vec_ = feed_vec; + slot_num_ = (feed_vec.size() - 3) / 2; + + // infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); + // for (size_t i = 0; i < h_device_keys_.size(); i++) { + // for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { + // VLOG(3) << "h_device_keys_[" << i << "][" << j + // << "] = " << (*(h_device_keys_[i]))[j]; + // } + // auto buf = memory::AllocShared( + // place_, h_device_keys_[i]->size() * sizeof(uint64_t)); + // d_device_keys_.push_back(buf); + // CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), + // h_device_keys_[i]->data(), + // h_device_keys_[i]->size() * sizeof(uint64_t), + // cudaMemcpyHostToDevice, + // stream_)); + // } + auto &d_graph_all_type_keys = gpu_graph_ptr->d_graph_all_type_total_keys_; + auto &h_graph_all_type_keys_len = gpu_graph_ptr->h_graph_all_type_keys_len_; + + for (size_t i = 0; i < d_graph_all_type_keys.size(); i++) { + d_device_keys_.push_back(d_graph_all_type_keys[i][thread_id]); + h_device_keys_len_.push_back(h_graph_all_type_keys_len[i][thread_id]); } + VLOG(2) << "h_device_keys size: " << h_device_keys_len_.size(); + - d_slot_feature_num_map_ = memory::Alloc(place, slot_num_ * sizeof(int)); - cudaMemcpy(d_slot_feature_num_map_->ptr(), h_slot_feature_num_map_.data(), - sizeof(int) * slot_num_, cudaMemcpyHostToDevice); - d_actual_slot_id_map_ = memory::Alloc(place, fea_num_per_node_ * sizeof(int)); - cudaMemcpy(d_actual_slot_id_map_->ptr(), h_actual_slot_id_map.data(), - 
sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); - d_fea_offset_map_ = memory::Alloc(place, fea_num_per_node_ * sizeof(int)); - cudaMemcpy(d_fea_offset_map_->ptr(), h_fea_offset_map.data(), - sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); - // d_device_keys_.resize(h_device_keys_.size()); - VLOG(2) << "h_device_keys size: " << h_device_keys_.size(); - infer_node_type_start_ = std::vector(h_device_keys_.size(), 0); - for (size_t i = 0; i < h_device_keys_.size(); i++) { - for (size_t j = 0; j < h_device_keys_[i]->size(); j++) { - VLOG(3) << "h_device_keys_[" << i << "][" << j - << "] = " << (*(h_device_keys_[i]))[j]; - } - auto buf = memory::AllocShared( - place_, h_device_keys_[i]->size() * sizeof(uint64_t)); - d_device_keys_.push_back(buf); - CUDA_CHECK(cudaMemcpyAsync(buf->ptr(), - h_device_keys_[i]->data(), - h_device_keys_[i]->size() * sizeof(uint64_t), - cudaMemcpyHostToDevice, - stream_)); - } - // h_device_keys_ = h_device_keys; - // device_key_size_ = h_device_keys_->size(); - // d_device_keys_ = - // memory::AllocShared(place_, device_key_size_ * sizeof(int64_t)); - // CUDA_CHECK(cudaMemcpyAsync(d_device_keys_->ptr(), h_device_keys_->data(), - // device_key_size_ * sizeof(int64_t), - // cudaMemcpyHostToDevice, stream_)); size_t once_max_sample_keynum = walk_degree_ * once_sample_startid_len_; - d_prefix_sum_ = - memory::AllocShared(place_, (once_max_sample_keynum + 1) * sizeof(int)); + d_prefix_sum_ = memory::AllocShared( + place_, + (once_max_sample_keynum + 1) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); int *d_prefix_sum_ptr = reinterpret_cast(d_prefix_sum_->ptr()); - cudaMemsetAsync( - d_prefix_sum_ptr, 0, (once_max_sample_keynum + 1) * sizeof(int), stream_); + cudaMemsetAsync(d_prefix_sum_ptr, + 0, + (once_max_sample_keynum + 1) * sizeof(int), + sample_stream_); cursor_ = 0; jump_rows_ = 0; - d_walk_ = memory::AllocShared(place_, buf_size_ * sizeof(uint64_t)); - cudaMemsetAsync(d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - if (!FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_feature_ = - memory::AllocShared(place_, buf_size_ * slot_num_ * sizeof(uint64_t)); - cudaMemsetAsync( - d_feature_->ptr(), 0, buf_size_ * sizeof(uint64_t), stream_); - } - d_sample_keys_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(uint64_t)); + d_uniq_node_num_ = memory::AllocShared( + place_, + sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync(d_uniq_node_num_->ptr(), 0, sizeof(uint64_t), sample_stream_); + + d_walk_ = memory::AllocShared( + place_, + buf_size_ * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); + cudaMemsetAsync( + d_walk_->ptr(), 0, buf_size_ * sizeof(uint64_t), sample_stream_); + d_sample_keys_ = memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(uint64_t), + phi::Stream(reinterpret_cast(sample_stream_))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); - d_sampleidx2rows_.push_back( - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int))); + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); + d_sampleidx2rows_.push_back(memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_)))); cur_sampleidx2row_ = 0; - d_len_per_row_ = - memory::AllocShared(place_, once_max_sample_keynum * sizeof(int)); + d_len_per_row_ = 
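// The buffers created in this function now follow a stream-bound allocation pattern:
// each one is tied to sample_stream_ through phi::Stream, so the allocator can reuse and
// free it in stream order instead of synchronising the default compute stream. A minimal
// sketch of that call (the phi::StreamId cast target is an assumption based on how this
// idiom is normally spelled out in Paddle, since the cast type is abbreviated in this hunk):
//
//   auto buf = memory::AllocShared(
//       place_,
//       num_bytes,
//       phi::Stream(reinterpret_cast<phi::StreamId>(sample_stream_)));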
memory::AllocShared( + place_, + once_max_sample_keynum * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); for (int i = -window_; i < 0; i++) { window_step_.push_back(i); } @@ -1527,52 +1348,58 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place, buf_state_.Init(batch_size_, walk_len_, &window_step_); d_random_row_ = memory::AllocShared( place_, - (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int)); + (once_sample_startid_len_ * walk_degree_ * repeat_time_) * sizeof(int), + phi::Stream(reinterpret_cast(sample_stream_))); shuffle_seed_ = 0; ins_buf_pair_len_ = 0; d_ins_buf_ = memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t)); - if (slot_num_ > 0) { - if (!sage_mode_) { - d_feature_buf_ = memory::AllocShared( - place_, (batch_size_ * 2 * 2) * fea_num_per_node_ * sizeof(uint64_t)); - } else { - d_feature_buf_ = NULL; - } - } d_pair_num_ = memory::AllocShared(place_, sizeof(int)); - if (FLAGS_enable_opt_get_features && slot_num_ > 0) { - d_slot_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - d_slot_lod_tensor_ptr_ = - memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); - } - if (sage_mode_) { - reindex_table_size_ = batch_size_ * 2; - // get hashtable size - for (int i = 0; i < samples_.size(); i++) { - reindex_table_size_ *= (samples_[i] * edge_to_id_len_ + 1); - } - int64_t next_pow2 = - 1 << static_cast(1 + std::log2(reindex_table_size_ >> 1)); - reindex_table_size_ = next_pow2 << 1; - - d_reindex_table_key_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int64_t)); - d_reindex_table_value_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int)); - d_reindex_table_index_ = - memory::AllocShared(place_, reindex_table_size_ * sizeof(int)); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - edge_type_graph_ = - gpu_graph_ptr->get_edge_type_graph(gpuid_, edge_to_id_len_); - } + d_slot_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + d_slot_lod_tensor_ptr_ = + memory::AllocShared(place_, slot_num_ * sizeof(uint64_t *)); + + cudaStreamSynchronize(sample_stream_); - cudaStreamSynchronize(stream_); + debug_gpu_memory_info(gpuid_, "AllocResource end"); } +void GraphDataGenerator::AllocTrainResource(int thread_id) { + if (slot_num_ > 0) { + platform::CUDADeviceGuard guard(gpuid_); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + h_slot_feature_num_map_ = gpu_graph_ptr->slot_feature_num_map(); + fea_num_per_node_ = 0; + for (int i = 0; i < slot_num_; ++i) { + fea_num_per_node_ += h_slot_feature_num_map_[i]; + } + std::vector h_actual_slot_id_map, h_fea_offset_map; + h_actual_slot_id_map.resize(fea_num_per_node_); + h_fea_offset_map.resize(fea_num_per_node_); + for (int slot_id = 0, fea_idx = 0; slot_id < slot_num_; ++slot_id) { + for (int j = 0; j < h_slot_feature_num_map_[slot_id]; ++j, ++fea_idx) { + h_actual_slot_id_map[fea_idx] = slot_id; + h_fea_offset_map[fea_idx] = j; + } + } + + d_slot_feature_num_map_ = memory::Alloc(place_, slot_num_ * sizeof(int)); + cudaMemcpy(d_slot_feature_num_map_->ptr(), h_slot_feature_num_map_.data(), + sizeof(int) * slot_num_, cudaMemcpyHostToDevice); + d_actual_slot_id_map_ = memory::Alloc(place_, fea_num_per_node_ * sizeof(int)); + cudaMemcpy(d_actual_slot_id_map_->ptr(), h_actual_slot_id_map.data(), + sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); + d_fea_offset_map_ = memory::Alloc(place_, fea_num_per_node_ * sizeof(int)); + cudaMemcpy(d_fea_offset_map_->ptr(), 
h_fea_offset_map.data(), + sizeof(int) * fea_num_per_node_, cudaMemcpyHostToDevice); + d_feature_buf_ = memory::AllocShared( + place_, (batch_size_ * 2 * 2) * fea_num_per_node_ * sizeof(uint64_t)); + } +} + void GraphDataGenerator::SetConfig( const paddle::framework::DataFeedDesc &data_feed_desc) { auto graph_config = data_feed_desc.graph_config(); @@ -1590,58 +1417,22 @@ void GraphDataGenerator::SetConfig( repeat_time_ = graph_config.sample_times_one_chunk(); buf_size_ = once_sample_startid_len_ * walk_len_ * walk_degree_ * repeat_time_; - VLOG(2) << "Confirm GraphConfig, walk_degree : " << walk_degree_ + train_table_cap_ = graph_config.train_table_cap(); + infer_table_cap_ = graph_config.infer_table_cap(); + epoch_finish_ = false; + VLOG(0) << "Confirm GraphConfig, walk_degree : " << walk_degree_ << ", walk_len : " << walk_len_ << ", window : " << window_ << ", once_sample_startid_len : " << once_sample_startid_len_ << ", sample_times_one_chunk : " << repeat_time_ - << ", batch_size: " << batch_size_; + << ", batch_size: " << batch_size_ + << ", train_table_cap: " << train_table_cap_ + << ", infer_table_cap: " << infer_table_cap_; std::string first_node_type = graph_config.first_node_type(); std::string meta_path = graph_config.meta_path(); - sage_mode_ = graph_config.sage_mode(); - std::string str_samples = graph_config.samples(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto edge_to_id = gpu_graph_ptr->edge_to_id; - edge_to_id_len_ = edge_to_id.size(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - // parse first_node_type - auto node_types = - paddle::string::split_string(first_node_type, ";"); - VLOG(2) << "node_types: " << first_node_type; - finish_node_type_.clear(); - node_type_start_.clear(); - for (auto &type : node_types) { - auto iter = node_to_id.find(type); - PADDLE_ENFORCE_NE( - iter, - node_to_id.end(), - platform::errors::NotFound("(%s) is not found in node_to_id.", type)); - VLOG(2) << "node_to_id[" << type << "] = " << iter->second; - first_node_type_.push_back(iter->second); - node_type_start_[iter->second] = 0; - } - meta_path_.resize(first_node_type_.size()); - auto meta_paths = paddle::string::split_string(meta_path, ";"); - - for (size_t i = 0; i < meta_paths.size(); i++) { - auto path = meta_paths[i]; - auto nodes = paddle::string::split_string(path, "-"); - for (auto &node : nodes) { - auto iter = edge_to_id.find(node); - PADDLE_ENFORCE_NE( - iter, - edge_to_id.end(), - platform::errors::NotFound("(%s) is not found in edge_to_id.", node)); - VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; - meta_path_[i].push_back(iter->second); - } - } - - auto samples = paddle::string::split_string(str_samples, ";"); - for (size_t i = 0; i < samples.size(); i++) { - int sample_size = std::stoi(samples[i]); - samples_.emplace_back(sample_size); - } + debug_gpu_memory_info("init_conf start"); + gpu_graph_ptr->init_conf(first_node_type, meta_path); + debug_gpu_memory_info("init_conf end"); }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 4598788c0a7d85..dff08972a3f684 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
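For reference, the GraphConfig consumed by SetConfig() above gains two capacity fields later in this patch (train_table_cap and infer_table_cap in data_feed.proto, both defaulting to 80000). A text-format graph_config block might look roughly like the sketch below; the field names are inferred from SetConfig() and the proto hunk, while every value is purely illustrative:

  graph_config {
    walk_degree: 1
    walk_len: 24
    window: 3
    once_sample_startid_len: 8000
    sample_times_one_chunk: 10
    first_node_type: "u"
    meta_path: "u2u-u2u"
    gpu_graph_training: true
    train_table_cap: 80000000
    infer_table_cap: 80000000
  }

With these illustrative numbers, the walk buffer sized in SetConfig() holds once_sample_startid_len * walk_len * walk_degree * sample_times_one_chunk = 8000 * 24 * 1 * 10 = 1,920,000 uint64 node ids per generator, and a training pass stops sampling early once roughly train_table_cap distinct node ids have been inserted into the per-GPU hash table (see InsertTable() above).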
@@ -60,6 +60,8 @@ class Scope; class Variable; class NeighborSampleResult; class NodeQueryResult; +template +class HashTable; } // namespace framework } // namespace paddle @@ -878,6 +880,9 @@ struct BufState { int GetNextBatch() { cursor += len; + if (row_num - cursor < 0) { + return 0; + } int tmp_len = cursor + batch_size > row_num ? row_num - cursor : batch_size; if (tmp_len == 0) { return 0; @@ -895,11 +900,14 @@ class GraphDataGenerator { GraphDataGenerator(){}; virtual ~GraphDataGenerator(){}; void SetConfig(const paddle::framework::DataFeedDesc& data_feed_desc); - void AllocResource(const paddle::platform::Place& place, - std::vector feed_vec); + void AllocResource(int thread_id, std::vector feed_vec); + void AllocTrainResource(int thread_id); + void SetFeedVec(std::vector feed_vec); int AcquireInstance(BufState* state); int GenerateBatch(); - int FillWalkBuf(std::shared_ptr d_walk); + int FillWalkBuf(); + int FillInferBuf(); + void DoWalk(); int FillFeatureBuf(uint64_t* d_walk, uint64_t* d_feature, size_t key_num); int FillFeatureBuf(std::shared_ptr d_walk, std::shared_ptr d_feature); @@ -911,34 +919,44 @@ class GraphDataGenerator { int step, int* len_per_row); int FillInsBuf(); + int FillIdShowClkTensor(int total_instance, + bool gpu_graph_training, + size_t cursor = 0); + int FillGraphSlotFeature(int total_instance, bool gpu_graph_training); + int MakeInsPair(); + int GetPathNum() { return total_row_; } + void ResetPathNum() {total_row_ = 0; } + void ResetEpochFinish() {epoch_finish_ = false; } + void ClearSampleState(); void SetDeviceKeys(std::vector* device_keys, int type) { - type_to_index_[type] = h_device_keys_.size(); - h_device_keys_.push_back(device_keys); + // type_to_index_[type] = h_device_keys_.size(); + // h_device_keys_.push_back(device_keys); } - std::vector> SampleNeighbors( - int64_t* uniq_nodes, int len, int sample_size, - std::vector& edges_split_num, int64_t* neighbor_len); + std::vector> SampleNeighbors( + int64_t* uniq_nodes, int len, int sample_size, + std::vector& edges_split_num, int64_t* neighbor_len); std::shared_ptr GetReindexResult( - int64_t* reindex_src_data, const int64_t* center_nodes, - int* final_nodes_len, int node_len, int64_t neighbor_len); - + int64_t* reindex_src_data, const int64_t* center_nodes, + int* final_nodes_len, int node_len, int64_t neighbor_len); std::shared_ptr GenerateSampleGraph( - uint64_t* node_ids, int len, int* uniq_len, phi::DenseTensor* inverse); + uint64_t* node_ids, int len, int* uniq_len, phi::DenseTensor* inverse); + int InsertTable(const unsigned long* d_keys, + unsigned long len, + std::shared_ptr d_uniq_node_num); + std::vector& GetHostVec() { return host_vec_; } + bool get_epoch_finish() {return epoch_finish_; } + void clear_gpu_mem(); protected: + HashTable* table_; int walk_degree_; int walk_len_; int window_; int once_sample_startid_len_; int gpuid_; - // start ids - // int64_t* device_keys_; - // size_t device_key_size_; - std::vector*> h_device_keys_; - std::unordered_map type_to_index_; - // point to device_keys_ size_t cursor_; + int thread_id_; size_t jump_rows_; int edge_to_id_len_; int uniq_instance_; @@ -947,7 +965,8 @@ class GraphDataGenerator { int64_t* show_tensor_ptr_; int64_t* clk_tensor_ptr_; - cudaStream_t stream_; + cudaStream_t train_stream_; + cudaStream_t sample_stream_; paddle::platform::Place place_; std::vector feed_vec_; std::vector offset_; @@ -955,23 +974,21 @@ class GraphDataGenerator { std::vector> d_device_keys_; std::shared_ptr d_walk_; + std::shared_ptr d_feature_list_; 
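  // Rough lifecycle of this generator as wired up elsewhere in the patch (one instance per
  // reader thread; the ordering is a sketch of how data_set.cc drives it, not a contract
  // declared in this header):
  //
  //   SetConfig(data_feed_desc);            // parse GraphConfig, including the table caps
  //   AllocResource(thread_id, feed_vec);   // bind gpuid_, the streams and the hash table
  //   DoWalk();                             // FillWalkBuf() when training, FillInferBuf() otherwise
  //   GetHostVec();                         // de-duplicated node ids copied back to the host
  //   get_epoch_finish();                   // true once every start node type is exhausted
  //   clear_gpu_mem();                      // drop sampler buffers outside WHOLE_HBM storage mode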
std::shared_ptr d_feature_; std::shared_ptr d_len_per_row_; std::shared_ptr d_random_row_; + std::shared_ptr d_uniq_node_num_; std::shared_ptr d_slot_feature_num_map_; std::shared_ptr d_actual_slot_id_map_; std::shared_ptr d_fea_offset_map_; - // + std::vector> d_sampleidx2rows_; int cur_sampleidx2row_; // record the keys to call graph_neighbor_sample std::shared_ptr d_sample_keys_; int sample_keys_len_; - std::set finish_node_type_; - std::unordered_map node_type_start_; - std::vector infer_node_type_start_; - std::shared_ptr d_ins_buf_; std::shared_ptr d_feature_buf_; std::shared_ptr d_pair_num_; @@ -994,11 +1011,17 @@ class GraphDataGenerator { int fea_num_per_node_; int shuffle_seed_; int debug_mode_; - std::vector first_node_type_; - std::vector> meta_path_; bool gpu_graph_training_; bool sage_mode_; std::vector samples_; + bool epoch_finish_; + std::vector host_vec_; + std::vector h_device_keys_len_; + uint64_t train_table_cap_; + uint64_t infer_table_cap_; + int total_row_; + size_t infer_node_start_; + size_t infer_node_end_; }; class DataFeed { @@ -1063,11 +1086,30 @@ class DataFeed { virtual void SetParseLogKey(bool parse_logkey) {} virtual void SetEnablePvMerge(bool enable_pv_merge) {} virtual void SetCurrentPhase(int current_phase) {} + virtual void InitGraphResource() {} + virtual void InitGraphTrainResource() {} virtual void SetDeviceKeys(std::vector* device_keys, int type) { #if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) gpu_graph_data_generator_.SetDeviceKeys(device_keys, type); #endif } +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + virtual const std::vector& GetHostVec() { + return gpu_graph_data_generator_.GetHostVec(); + } +#endif + + virtual void clear_gpu_mem() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.clear_gpu_mem(); +#endif + } + virtual bool get_epoch_finish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.get_epoch_finish(); +#endif + } + virtual void SetGpuGraphMode(int gpu_graph_mode) { gpu_graph_mode_ = gpu_graph_mode; } @@ -1084,11 +1126,40 @@ class DataFeed { return ins_content_vec_; } virtual int GetCurBatchSize() { return batch_size_; } + virtual int GetGraphPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + return gpu_graph_data_generator_.GetPathNum(); +#else + return 0; +#endif + } + virtual void ResetPathNum() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetPathNum(); +#endif + } + + virtual void ClearSampleState() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ClearSampleState(); +#endif + } + + virtual void ResetEpochFinish() { +#if defined(PADDLE_WITH_GPU_GRAPH) && defined(PADDLE_WITH_HETERPS) + gpu_graph_data_generator_.ResetEpochFinish(); +#endif +} + virtual bool IsTrainMode() { return train_mode_; } virtual void LoadIntoMemory() { PADDLE_THROW(platform::errors::Unimplemented( "This function(LoadIntoMemory) is not implemented.")); } + virtual void DoWalk() { + PADDLE_THROW(platform::errors::Unimplemented( + "This function(DoWalk) is not implemented.")); + } virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; } @@ -1663,6 +1734,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { // CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) {} + virtual void InitGraphResource(void); + virtual void 
InitGraphTrainResource(void); virtual void LoadIntoMemoryByCommand(void); virtual void LoadIntoMemoryByLib(void); virtual void LoadIntoMemoryByLine(void); @@ -1697,6 +1770,8 @@ class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { const int float_slot_size, const UsedSlotGpuType* used_slots); #endif + virtual void DoWalk(); + float sample_rate_ = 1.0f; int use_slot_size_ = 0; int float_use_slot_size_ = 0; diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 9726880d64ab5c..25610eea237813 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -40,6 +40,8 @@ message GraphConfig { optional bool gpu_graph_training = 10 [ default = true ]; optional bool sage_mode = 11 [ default = false ]; optional string samples = 12; + optional int64 train_table_cap = 13 [ default = 80000 ]; + optional int64 infer_table_cap = 14 [ default = 80000 ]; } message DataFeedDesc { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1d70ef6a1c78b0..e22136fdaf7079 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -36,7 +36,9 @@ #endif USE_INT_STAT(STAT_total_feasign_num_in_mem); +USE_INT_STAT(STAT_epoch_finish); DECLARE_bool(graph_get_neighbor_id); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -446,18 +448,6 @@ void MultiSlotDataset::PrepareTrain() { return; } -template -void DatasetImpl::SetGraphDeviceKeys( - const std::vector& h_device_keys) { - // for (size_t i = 0; i < gpu_graph_device_keys_.size(); i++) { - // gpu_graph_device_keys_[i].clear(); - // } - // size_t device_num = gpu_graph_device_keys_.size(); - // for (size_t i = 0; i < h_device_keys.size(); i++) { - // int shard = h_device_keys[i] % device_num; - // gpu_graph_device_keys_[shard].push_back(h_device_keys[i]); - // } -} // load data into memory, Dataset hold this memory, // which will later be fed into readers' channel template @@ -469,63 +459,54 @@ void DatasetImpl::LoadIntoMemory() { if (gpu_graph_mode_) { VLOG(0) << "in gpu_graph_mode"; #ifdef PADDLE_WITH_HETERPS - graph_all_type_total_keys_.clear(); - auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); - auto node_to_id = gpu_graph_ptr->feature_to_id; - auto edge_to_id = gpu_graph_ptr->edge_to_id; - graph_all_type_total_keys_.resize(node_to_id.size()); - int cnt = 0; - // set sample start node - for (auto& iter : node_to_id) { - int node_idx = iter.second; - std::vector> gpu_graph_device_keys; - gpu_graph_ptr->get_all_id( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - auto& type_total_key = graph_all_type_total_keys_[cnt]; - type_total_key.resize(thread_num_); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "node type: " << node_idx << ", gpu_graph_device_keys[" << i - << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - type_total_key[i].push_back(gpu_graph_device_keys[i][j]); - } - } - + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + } + + if (STAT_GET(STAT_epoch_finish) == 1) { + VLOG(0) << "get epoch finish true"; + STAT_RESET(STAT_epoch_finish, 0); for (size_t i = 0; i < readers_.size(); i++) { - readers_[i]->SetDeviceKeys(&type_total_key[i], node_idx); - readers_[i]->SetGpuGraphMode(gpu_graph_mode_); + readers_[i]->ResetPathNum(); + readers_[i]->ResetEpochFinish(); } - cnt++; + return; } - // add node embedding id - std::vector> 
gpu_graph_device_keys; - gpu_graph_ptr->get_node_embedding_ids(thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back( + std::thread(&paddle::framework::DataFeed::DoWalk, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + uint64_t node_num = 0; + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + node_num += host_vec.size(); + } + gpu_graph_total_keys_.reserve(node_num); + for (int i = 0; i < thread_num_; i++) { + auto& host_vec = readers_[i]->GetHostVec(); + for (size_t j = 0; j < host_vec.size(); j++) { + gpu_graph_total_keys_.push_back(host_vec[j]); } } - // add feature embedding id - VLOG(2) << "begin add feature_id into gpu_graph_total_keys_ size[" - << gpu_graph_total_keys_.size() << "]"; - for (auto& iter : node_to_id) { - std::vector> gpu_graph_device_keys; - int node_idx = iter.second; - gpu_graph_ptr->get_all_feature_ids( - 1, node_idx, thread_num_, &gpu_graph_device_keys); - for (size_t i = 0; i < gpu_graph_device_keys.size(); i++) { - VLOG(2) << "begin node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); - for (size_t j = 0; j < gpu_graph_device_keys[i].size(); j++) { - gpu_graph_total_keys_.push_back(gpu_graph_device_keys[i][j]); - } - VLOG(2) << "end node type: " << node_idx << ", gpu_graph_device_keys[" - << i << "] = " << gpu_graph_device_keys[i].size(); + if (GetEpochFinish() == true) { + VLOG(0) << "epoch finish, set stat and clear sample stat!"; + STAT_RESET(STAT_epoch_finish, 1); + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->ClearSampleState(); } } - VLOG(2) << "end add feature_id into gpu_graph_total_keys_ size[" + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + for (size_t i = 0; i < readers_.size(); i++) { + readers_[i]->clear_gpu_mem(); + } + } + + VLOG(2) << "end add edge into gpu_graph_total_keys_ size[" << gpu_graph_total_keys_.size() << "]"; #endif } else { @@ -1123,7 +1104,26 @@ void DatasetImpl::DestroyPreLoadReaders() { template int64_t DatasetImpl::GetMemoryDataSize() { - return input_channel_->Size(); + if (gpu_graph_mode_) { + int64_t total_path_num = 0; + for (int i = 0; i < thread_num_; i++) { + total_path_num += readers_[i]->GetGraphPathNum(); + } + return total_path_num; + } else { + return input_channel_->Size(); + } +} + +template +bool DatasetImpl::GetEpochFinish() { + bool is_epoch_finish = true; + if (gpu_graph_mode_) { + for (int i = 0; i < thread_num_; i++) { + is_epoch_finish = is_epoch_finish && readers_[i]->get_epoch_finish(); + } + } + return is_epoch_finish; } template @@ -1780,6 +1780,7 @@ void SlotRecordDataset::CreateReaders() { readers_[i]->SetParseLogKey(parse_logkey_); readers_[i]->SetEnablePvMerge(enable_pv_merge_); readers_[i]->SetCurrentPhase(current_phase_); + readers_[i]->InitGraphResource(); if (input_channel_ != nullptr) { readers_[i]->SetInputChannel(input_channel_.get()); } diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 0489c2ece64e8f..9e1998a35fd649 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -169,6 +169,10 @@ class Dataset { virtual void SetGpuGraphMode(int is_graph_mode) = 0; virtual int GetGpuGraphMode() = 0; + virtual bool 
GetEpochFinish() = 0; + + virtual void SetPassId(uint32_t pass_id) = 0; + virtual uint32_t GetPassID() = 0; protected: virtual int ReceiveFromClient(int msg_type, @@ -253,7 +257,7 @@ class DatasetImpl : public Dataset { int read_thread_num, int consume_thread_num, int shard_num) {} - virtual void SetGraphDeviceKeys(const std::vector& h_device_keys); + virtual void SetGraphDeviceKeys(const std::vector& h_device_keys) {} virtual void ClearLocalTables() {} virtual void CreatePreLoadReaders(); virtual void DestroyPreLoadReaders(); @@ -263,11 +267,7 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - /* for enable_heterps_ - virtual void EnableHeterps(bool enable_heterps) { - enable_heterps_ = enable_heterps; - } - */ + virtual bool GetEpochFinish(); std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -280,10 +280,13 @@ class DatasetImpl : public Dataset { return multi_consume_channel_; } } + Channel& GetInputChannelRef() { return input_channel_; } std::vector& GetGpuGraphTotalKeys() { return gpu_graph_total_keys_; } - Channel& GetInputChannelRef() { return input_channel_; } + + virtual void SetPassId(uint32_t pass_id) { pass_id_ = pass_id; } + virtual uint32_t GetPassID() { return pass_id_; } protected: virtual int ReceiveFromClient(int msg_type, @@ -344,9 +347,9 @@ class DatasetImpl : public Dataset { std::vector use_slots_; bool enable_heterps_ = false; int gpu_graph_mode_ = 0; - // std::vector> gpu_graph_device_keys_; - std::vector>> graph_all_type_total_keys_; + std::vector>> gpu_graph_type_keys_; std::vector gpu_graph_total_keys_; + uint32_t pass_id_ = 0; }; // use std::vector or Record as data type diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 4cf3ab8dc1a67d..bacb096f751b1d 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_HETERPS) nv_library( ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + DEPS heter_ps gloo_wrapper ps_framework_proto graph_gpu_wrapper ${BRPC_DEPS}) else() nv_library( ps_gpu_wrapper diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index ef2e73d6dd5b56..948021582275d4 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -85,7 +85,9 @@ class HeterContext { std::vector> dim_mutex_; int multi_mf_dim_ = 0; + void * sub_graph_feas = NULL; uint32_t shard_num_ = 37; + uint16_t pass_id_ = 0; uint64_t size() { uint64_t total_size = 0; for (auto& keys : feature_keys_) { diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 85bf6bb553b220..160de2646d7d96 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -524,6 +524,7 @@ class concurrent_unordered_map : public managed { __forceinline__ __device__ iterator insert(const value_type& x, aggregation_type op, + uint64_t* local_count = NULL, comparison_type keys_equal = key_equal(), bool precomputed_hash = false, hash_value_type precomputed_hash_value = 0) { @@ -580,6 +581,10 @@ class concurrent_unordered_map : public managed { if 
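// When the new local_count argument is non-NULL, insert() increments it once for every key
// that claims a previously unused slot, i.e. once per distinct key newly added by this call;
// a caller can therefore pass a device-side uint64_t counter here to keep a running count of
// unique inserted keys (presumably how the sampler's d_uniq_node_num is maintained, though
// that wiring sits outside this hunk).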
(m_enable_collision_stat) { atomicAdd(&m_insert_times, 1); } + + if (local_count != NULL && keys_equal(unused_key, old_key)) { + atomicAdd(local_count, 1); + } break; } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index f05fe6c95de0a5..52a02cbfb2d8b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -60,11 +60,11 @@ __global__ void PullDedupCopy(const size_t N, const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, - const int hidden, + const size_t hidden, const int* key2slot, const uint32_t* restore_idx, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; @@ -158,7 +158,7 @@ __global__ void PushMergeCopyAtomic(const size_t N, const uint32_t* d_restore_idx, size_t grad_value_size, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; // filter 0 keys @@ -224,7 +224,7 @@ __global__ void PushMergeCopy(const size_t N, const uint32_t* d_sort_cnt, size_t grad_value_size, TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { + CUDA_KERNEL_LOOP_TYPE(idx, N, size_t) { int i = idx / hidden; int off = idx % hidden; // filter 0 keys diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index ed62292c6b1dc1..10ffa04485ab19 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -171,6 +171,7 @@ struct NeighborSampleResult { uint64_t *actual_val; int *actual_sample_size, sample_size, key_size; int total_sample_size; + cudaStream_t stream=0; std::shared_ptr val_mem, actual_sample_size_mem; std::shared_ptr actual_val_mem; uint64_t *get_val() { return val; } @@ -180,18 +181,30 @@ struct NeighborSampleResult { int get_key_size() { return key_size; } void set_total_sample_size(int s) { total_sample_size = s; } int get_len() { return total_sample_size; } + void set_stream(cudaStream_t stream_t) { + stream = stream_t; + } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; platform::CUDADeviceGuard guard(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id); - val_mem = - memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + if (stream != 0) { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t), phi::Stream(reinterpret_cast(stream))); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int), phi::Stream(reinterpret_cast(stream))); + } + else { + val_mem = + memory::AllocShared(place, _sample_size * _key_size * sizeof(uint64_t)); + actual_sample_size_mem = + memory::AllocShared(place, _key_size * sizeof(int)); + } val = (uint64_t *)val_mem->ptr(); - actual_sample_size_mem = - memory::AllocShared(place, _key_size * sizeof(int)); actual_sample_size = (int *)actual_sample_size_mem->ptr(); } + void display() { VLOG(0) << "in node sample result display ------------------"; int64_t *res = new int64_t[sample_size * key_size]; @@ -364,7 +377,7 @@ struct GpuPsCommGraphFea { uint8_t *slot_id_list; // locate on both side GpuPsFeaInfo *fea_info_list; // only locate on host side, the list of fea_info - uint64_t feature_size, node_size; + uint64_t feature_size, node_size, feature_capacity; // the 
size of feature array and graph_node_list array GpuPsCommGraphFea() : node_list(NULL), @@ -372,7 +385,8 @@ struct GpuPsCommGraphFea { slot_id_list(NULL), fea_info_list(NULL), feature_size(0), - node_size(0) {} + node_size(0), + feature_capacity(0){} GpuPsCommGraphFea(uint64_t *node_list_, uint64_t *feature_list_, uint8_t *slot_id_list_, diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h index 39734cae33fca1..69e743cca977a3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h @@ -13,6 +13,10 @@ // limitations under the License. #pragma once +#include +#include +#include +#include #include #include #include @@ -22,6 +26,38 @@ namespace paddle { namespace framework { +/** + * @brief wrapper of the std::default_random_engine each construction will have different seeds. + */ +struct random_engine_wrapper_t { + std::default_random_engine engine; + random_engine_wrapper_t() { + timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + static std::atomic x(static_cast(1)); + std::seed_seq sseq = {x++, x++, x++, + (unsigned long)(tp.tv_sec * 1e9 + tp.tv_nsec)}; + engine.seed(sseq); + } +}; + +/** + * @brief Get a n-size vector, but its element has unique shuffled int value (from 0 to n-1). + * @param n vector size + * @return the shuffled vector. + */ +inline std::vector shuffle_int_vector(int n) { + random_engine_wrapper_t random_engine_wrapper; + std::vector ret(n); + int i = 0; + + for (auto & e : ret) { + e = i++; + } + std::shuffle(ret.begin(), ret.end(), random_engine_wrapper.engine); + return std::move(ret); +} + #define CUDA_CHECK(cmd) \ do { \ cudaError_t e = cmd; \ diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 4c0ebd11996212..03d7a505302e9f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -113,12 +113,22 @@ class GpuPsGraphTable } } } + device_mutex_.resize(gpu_num); + for (int i = 0; i < gpu_num; i++) { + device_mutex_[i] = new std::mutex(); + } + } + ~GpuPsGraphTable() { + for (size_t i = 0; i < device_mutex_.size(); ++i) { + delete device_mutex_[i]; + } + device_mutex_.clear(); } - ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); void clear_graph_info(int index); + void reset_feature_info(int gpu_id, size_t capacity, size_t feature_size); void clear_feature_info(int gpu_id, int index); void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, @@ -169,7 +179,10 @@ class GpuPsGraphTable int* actual_sample_size, int edge_type_len, int len); - int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int init_cpu_table(const paddle::distributed::GraphParameter &graph, int gpu_num = 8); + gpuStream_t get_local_stream(int gpu_id) { + return resource_->local_stream(gpu_id, 0); + } int gpu_num; int graph_table_num_, feature_table_num_; @@ -181,6 +194,7 @@ class GpuPsGraphTable std::shared_ptr cpu_graph_table_; std::shared_ptr rw_lock; mutable std::mutex mutex_; + std::vector device_mutex_; std::condition_variable cv_; int cpu_table_status; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu 
b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu index 4ce99f159322c7..f3e286358c1df9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu @@ -15,8 +15,8 @@ #include #include #include - #include +#include "cub/cub.cuh" #pragma once #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" @@ -244,9 +244,10 @@ __global__ void neighbor_sample_kernel_all_edge_type(GpuPsCommGraph* graphs, } int GpuPsGraphTable::init_cpu_table( - const paddle::distributed::GraphParameter& graph) { + const paddle::distributed::GraphParameter& graph, int gpu_num) { cpu_graph_table_.reset(new paddle::distributed::GraphTable); cpu_table_status = cpu_graph_table_->Initialize(graph); + cpu_graph_table_->init_worker_poll(gpu_num); // if (cpu_table_status != 0) return cpu_table_status; // std::function&)> callback = // [this](std::vector& res) { @@ -521,6 +522,39 @@ void GpuPsGraphTable::clear_feature_info(int gpu_id) { cudaFree(graph.slot_id_list); graph.slot_id_list = NULL; } + graph.feature_capacity = 0; +} + +void GpuPsGraphTable::reset_feature_info(int gpu_id, size_t capacity, size_t feature_size) { + int idx = 0; + if (idx >= feature_table_num_) return; + int offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, idx); + if (offset < tables_.size()) { + delete tables_[offset]; + tables_[offset] = new Table(capacity); + } + int graph_fea_idx = gpu_id * feature_table_num_ + idx; + if (graph_fea_idx >= gpu_graph_fea_list_.size()) { + return; + } + auto& graph = gpu_graph_fea_list_[graph_fea_idx]; + graph.node_list = NULL; + if (graph.feature_list == NULL) { + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, feature_size * sizeof(uint8_t))); + graph.feature_capacity = feature_size; + } + else if (graph.feature_capacity < feature_size) { + cudaFree(graph.feature_list); + cudaFree(graph.slot_id_list); + CUDA_CHECK(cudaMalloc((void**)&graph.feature_list, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&graph.slot_id_list, feature_size * sizeof(uint8_t))); + graph.feature_capacity = feature_size; + } + else { + CUDA_CHECK(cudaMemset(graph.feature_list, 0, feature_size * sizeof(uint64_t))); + CUDA_CHECK(cudaMemset(graph.slot_id_list, 0, feature_size * sizeof(uint8_t))); + } } void GpuPsGraphTable::clear_graph_info(int gpu_id, int idx) { @@ -552,20 +586,14 @@ In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, - int gpu_id) { - clear_feature_info(gpu_id); + int gpu_id) { + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; + reset_feature_info(gpu_id, capacity, g.feature_size); int ntype_id = 0; - - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - int offset = gpu_id * feature_table_num_ + ntype_id; - gpu_graph_fea_list_[offset] = GpuPsCommGraphFea(); - int table_offset = get_table_offset(gpu_id, GraphTableType::FEATURE_TABLE, ntype_id); - - size_t capacity = std::max((uint64_t)1, g.node_size) / load_factor_; - tables_[table_offset] = new Table(capacity); if (g.node_size > 0) { build_ps(gpu_id, g.node_list, @@ -574,51 +602,23 @@ void GpuPsGraphTable::build_graph_fea_on_single_gpu(const GpuPsCommGraphFea& g, 1024, 8, 
table_offset); - gpu_graph_fea_list_[offset].node_list = NULL; gpu_graph_fea_list_[offset].node_size = g.node_size; } else { build_ps(gpu_id, NULL, NULL, 0, 1024, 8, table_offset); - gpu_graph_fea_list_[offset].node_list = NULL; gpu_graph_fea_list_[offset].node_size = 0; } if (g.feature_size) { - // TODO - cudaError_t cudaStatus = - cudaMalloc((void**)&gpu_graph_fea_list_[offset].feature_list, - g.feature_size * sizeof(uint64_t)); - PADDLE_ENFORCE_EQ( - cudaStatus, - cudaSuccess, - platform::errors::InvalidArgument( - "ailed to allocate memory for graph-feature on gpu ")); - VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint64_t) - << " bytes of memory for graph-feature on gpu " - << resource_->dev_id(gpu_id); CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].feature_list, g.feature_list, g.feature_size * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - // TODO - cudaStatus = cudaMalloc((void**)&gpu_graph_fea_list_[offset].slot_id_list, - g.feature_size * sizeof(uint8_t)); - PADDLE_ENFORCE_EQ( - cudaStatus, - cudaSuccess, - platform::errors::InvalidArgument( - "ailed to allocate memory for graph-feature on gpu ")); - VLOG(0) << "sucessfully allocate " << g.feature_size * sizeof(uint8_t) - << " bytes of memory for graph-feature on gpu " - << resource_->dev_id(gpu_id); - cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, + CUDA_CHECK(cudaMemcpy(gpu_graph_fea_list_[offset].slot_id_list, g.slot_id_list, g.feature_size * sizeof(uint8_t), - cudaMemcpyHostToDevice); + cudaMemcpyHostToDevice)); gpu_graph_fea_list_[offset].feature_size = g.feature_size; } else { - gpu_graph_fea_list_[offset].feature_list = NULL; - gpu_graph_fea_list_[offset].slot_id_list = NULL; gpu_graph_fea_list_[offset].feature_size = 0; } VLOG(0) << "gpu node_feature info card :" << gpu_id << " ,node_size is " @@ -870,6 +870,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( bool compress) { NeighborSampleResult result; + auto stream = resource_->local_stream(gpu_id, 0); + result.set_stream(stream); result.initialize(sample_size, len, resource_->dev_id(gpu_id)); if (len == 0) { @@ -882,15 +884,20 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( int* actual_sample_size = result.actual_sample_size; uint64_t* val = result.val; int total_gpu = resource_->total_device(); - auto stream = resource_->local_stream(gpu_id, 0); int grid_size = (len - 1) / block_size_ + 1; int h_left[total_gpu]; // NOLINT int h_right[total_gpu]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); int default_value = 0; @@ -901,15 +908,26 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, len * sizeof(int)); + auto d_idx = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, len * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + len * sizeof(uint64_t), + 
phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, sample_size * len * sizeof(uint64_t)); + memory::Alloc(place, + sample_size * len * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + auto d_shard_actual_sample_size = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_sample_size_ptr = reinterpret_cast(d_shard_actual_sample_size->ptr()); @@ -921,10 +939,18 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( CUDA_CHECK(cudaStreamSynchronize(stream)); - CUDA_CHECK(cudaMemcpy( - h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy( - h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_left, + d_left_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaMemcpyAsync(h_right, + d_right_ptr, + total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, + stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -995,6 +1021,16 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( h_right, d_shard_vals_ptr, d_shard_actual_sample_size_ptr); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + device_mutex_[gpu_id]->unlock(); + fill_dvalues<<>>( d_shard_vals_ptr, val, @@ -1100,52 +1136,57 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( } if (compress) { + CUDA_CHECK(cudaStreamSynchronize(stream)); + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); size_t temp_storage_bytes = 0; int total_sample_size = 0; - auto cumsum_actual_sample_size = memory::Alloc(place, (len + 1) * sizeof(int)); - int* cumsum_actual_sample_size_ptr = + auto cumsum_actual_sample_size = + memory::Alloc(place, + (len + 1) * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + int* cumsum_actual_sample_size_p = reinterpret_cast(cumsum_actual_sample_size->ptr()); - CUDA_CHECK(cudaMemsetAsync(cumsum_actual_sample_size_ptr, 0, sizeof(int), stream)); + CUDA_CHECK( + cudaMemsetAsync(cumsum_actual_sample_size_p, 0, sizeof(int), stream)); CUDA_CHECK(cub::DeviceScan::InclusiveSum(NULL, - temp_storage_bytes, - actual_sample_size, - cumsum_actual_sample_size_ptr + 1, - len, - stream)); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + temp_storage_bytes, + actual_sample_size, + cumsum_actual_sample_size_p + 1, + len, + stream)); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), temp_storage_bytes, actual_sample_size, - cumsum_actual_sample_size_ptr + 1, + cumsum_actual_sample_size_p + 1, len, stream)); CUDA_CHECK(cudaMemcpyAsync(&total_sample_size, - cumsum_actual_sample_size_ptr + len, + cumsum_actual_sample_size_p + len, sizeof(int), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + result.actual_val_mem = 
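The compress branch above sizes the compacted neighbor buffer by running cub::DeviceScan::InclusiveSum over actual_sample_size into cumsum_actual_sample_size_p + 1 (slot 0 is zeroed), so cumsum[len] yields the total sample count before actual_val is allocated. A self-contained sketch of that size-then-run CUB idiom; the counts here are made up.

// Sketch: two-call cub::DeviceScan::InclusiveSum, as used above to turn
// per-key sample counts into offsets plus a total size.
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const int len = 5;
  int h_counts[len] = {3, 0, 2, 5, 1};

  int *d_counts, *d_offsets;  // d_offsets has len + 1 slots, offsets[0] = 0
  cudaMalloc((void**)&d_counts, len * sizeof(int));
  cudaMalloc((void**)&d_offsets, (len + 1) * sizeof(int));
  cudaMemcpy(d_counts, h_counts, len * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(d_offsets, 0, sizeof(int));

  // First call only reports how much temporary storage the scan needs.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets + 1, len);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_counts, d_offsets + 1, len);

  int total = 0;  // offsets[len] is the total sample count
  cudaMemcpy(&total, d_offsets + len, sizeof(int), cudaMemcpyDeviceToHost);
  printf("total sampled neighbors: %d\n", total);  // prints 11

  cudaFree(d_temp);
  cudaFree(d_counts);
  cudaFree(d_offsets);
  return 0;
}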
memory::AllocShared( + place, + total_sample_size * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); + result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); result.set_total_sample_size(total_sample_size); - result.actual_val_mem = - memory::AllocShared(place, total_sample_size * sizeof(uint64_t)); - result.actual_val = (uint64_t*)(result.actual_val_mem)->ptr(); fill_actual_vals<<>>( val, result.actual_val, actual_sample_size, - cumsum_actual_sample_size_ptr, + cumsum_actual_sample_size_p, sample_size, len); } - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } cudaStreamSynchronize(stream); return result; } @@ -1207,6 +1248,7 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type( CUDA_CHECK(cudaMemcpy( h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -1302,6 +1344,7 @@ NeighborSampleResultV2 GpuPsGraphTable::graph_neighbor_sample_all_edge_type( } destroy_storage(gpu_id, i); } + device_mutex_[gpu_id]->unlock(); return result; } @@ -1370,23 +1413,40 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int total_gpu = resource_->total_device(); auto stream = resource_->local_stream(gpu_id, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); + auto d_right = + memory::Alloc(place, + total_gpu * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); CUDA_CHECK(cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream)); CUDA_CHECK(cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream)); // - auto d_idx = memory::Alloc(place, node_num * sizeof(int)); + auto d_idx = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_shard_keys = memory::Alloc(place, node_num * sizeof(uint64_t)); + auto d_shard_keys = + memory::Alloc(place, + node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = - memory::Alloc(place, fea_num_per_node * node_num * sizeof(uint64_t)); + memory::Alloc(place, + fea_num_per_node * node_num * sizeof(uint64_t), + phi::Stream(reinterpret_cast(stream))); uint64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - auto d_shard_actual_size = memory::Alloc(place, node_num * sizeof(int)); + auto d_shard_actual_size = + memory::Alloc(place, + node_num * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_actual_size_ptr = reinterpret_cast(d_shard_actual_size->ptr()); @@ -1403,6 +1463,7 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, int h_right[total_gpu]; // NOLINT CUDA_CHECK(cudaMemcpy( h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost)); + device_mutex_[gpu_id]->lock(); for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -1457,9 +1518,9 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, d_slot_feature_num_map, slot_num, shard_len, - fea_num_per_node); + fea_num_per_node); } - + for (int i = 0; i < total_gpu; ++i) { if (h_left[i] == -1) { continue; @@ -1474,6 +1535,14 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, h_right, d_shard_vals_ptr, d_shard_actual_size_ptr); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + device_mutex_[gpu_id]->unlock(); int grid_size = (node_num - 1) / block_size_ + 1; fill_dvalues<<>>(d_shard_vals_ptr, @@ -1483,18 +1552,11 @@ int GpuPsGraphTable::get_feature_of_nodes(int gpu_id, fea_num_per_node, node_num); - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; - if (shard_len == 0) { - continue; - } - destroy_storage(gpu_id, i); - } - CUDA_CHECK(cudaStreamSynchronize(stream)); return 0; } -} // namespace framework + +}; // namespace framework }; // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index 10a5f38e7313cd..81d3de9ee2f2de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS @@ -28,6 +29,121 @@ void GraphGpuWrapper::set_device(std::vector ids) { } } +void GraphGpuWrapper::init_conf(const std::string &first_node_type, + const std::string &meta_path) { + static std::mutex mutex; + { + std::lock_guard lock(mutex); + if (conf_initialized_) { + return; + } + VLOG(2) << "init path config"; + conf_initialized_ = true; + auto node_types = + paddle::string::split_string(first_node_type, ";"); + VLOG(2) << "node_types: " << first_node_type; + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + PADDLE_ENFORCE_NE(iter, + feature_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in feature_to_id.", type)); + VLOG(2) << "feature_to_id[" << type << "] = " << iter->second; + first_node_type_.push_back(iter->second); + } + meta_path_.resize(first_node_type_.size()); + auto meta_paths = paddle::string::split_string(meta_path, ";"); + + for (size_t i = 0; i < meta_paths.size(); i++) { + auto path = meta_paths[i]; + auto nodes = paddle::string::split_string(path, "-"); + for (auto &node : nodes) { + auto iter = edge_to_id.find(node); + PADDLE_ENFORCE_NE(iter, + edge_to_id.end(), + platform::errors::NotFound( + "(%s) is not found in edge_to_id.", node)); + VLOG(2) << "edge_to_id[" << node << "] = " << iter->second; + meta_path_[i].push_back(iter->second); + } + } + int max_dev_id = 0; + for (size_t i = 0; i < device_id_mapping.size(); i++) { + if (device_id_mapping[i] > max_dev_id) { + max_dev_id = device_id_mapping[i]; + } + } + finish_node_type_.resize(max_dev_id + 1); + node_type_start_.resize(max_dev_id + 1); + global_infer_node_type_start_.resize(max_dev_id + 1); + for (size_t i = 0; i < device_id_mapping.size(); i++) { + int dev_id = device_id_mapping[i]; + auto &node_type_start = 
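init_conf above expects first_node_type to be a ";"-separated list of node types and meta_path to be a ";"-separated list of paths whose hops are "-"-separated edge types, with every name resolving through feature_to_id / edge_to_id. A small standalone illustration of that string convention (plain std::getline splitting instead of paddle::string::split_string; the type and edge names are hypothetical):

// Sketch of the expected init_conf inputs, e.g.
//   first_node_type = "user;item"
//   meta_path       = "user2item-item2user;item2user-user2item"
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split(const std::string& s, char sep) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string tok;
  while (std::getline(ss, tok, sep)) out.push_back(tok);
  return out;
}

int main() {
  std::string first_node_type = "user;item";  // hypothetical node types
  std::string meta_path = "user2item-item2user;item2user-user2item";

  for (const auto& type : split(first_node_type, ';'))
    std::cout << "start node type: " << type << "\n";

  for (const auto& path : split(meta_path, ';')) {
    std::cout << "meta path:";
    for (const auto& edge : split(path, '-'))  // each hop is an edge type
      std::cout << " " << edge;
    std::cout << "\n";
  }
  return 0;
}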
node_type_start_[i]; + auto &infer_node_type_start = global_infer_node_type_start_[i]; + auto &finish_node_type = finish_node_type_[i]; + finish_node_type.clear(); + + for (size_t idx = 0; idx < feature_to_id.size(); idx++) { + infer_node_type_start[idx] = 0; + } + for (auto &type : node_types) { + auto iter = feature_to_id.find(type); + node_type_start[iter->second] = 0; + infer_node_type_start[iter->second] = 0; + } + infer_cursor_.push_back(0); + cursor_.push_back(0); + } + init_type_keys(); + } +} + +void GraphGpuWrapper::init_type_keys() { + size_t thread_num = device_id_mapping.size(); + int cnt = 0; + + auto &graph_all_type_total_keys = get_graph_type_keys(); + auto &type_to_index = get_graph_type_to_index(); + std::vector> tmp_keys; + tmp_keys.resize(thread_num); + d_graph_all_type_total_keys_.resize(graph_all_type_total_keys.size()); + h_graph_all_type_keys_len_.resize(graph_all_type_total_keys.size()); + for (size_t f_idx = 0; f_idx < graph_all_type_total_keys.size(); f_idx++) { + for (size_t j = 0; j < tmp_keys.size(); j++) { + tmp_keys[j].clear(); + } + d_graph_all_type_total_keys_[f_idx].resize(thread_num); + auto &type_total_key = graph_all_type_total_keys[f_idx]; + for (size_t j = 0; j < type_total_key.size(); j++) { + uint64_t shard = type_total_key[j] % thread_num; + tmp_keys[shard].push_back(type_total_key[j]); + } + for (size_t j = 0; j < thread_num; j++) { + h_graph_all_type_keys_len_[f_idx].push_back(tmp_keys[j].size()); + VLOG(1) << "node type: " << type_to_index[f_idx] + << ", gpu_graph_device_keys[" << j + << "] = " << tmp_keys[j].size(); + } + for (size_t j = 0; j < thread_num; j++) { + auto stream = get_local_stream(j); + int gpuid = device_id_mapping[j]; + auto place = platform::CUDAPlace(gpuid); + platform::CUDADeviceGuard guard(gpuid); + d_graph_all_type_total_keys_[f_idx][j] = + memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t)); + cudaMemcpyAsync(d_graph_all_type_total_keys_[f_idx][j]->ptr(), + tmp_keys[j].data(), + sizeof(uint64_t) * tmp_keys[j].size(), + cudaMemcpyHostToDevice, + stream); + } + } + for (int i = 0; i < thread_num; i++) { + auto stream = get_local_stream(i); + cudaStreamSynchronize(stream); + } +} + int GraphGpuWrapper::get_all_id(int type, int slice_num, std::vector> *output) { @@ -152,6 +268,15 @@ void GraphGpuWrapper::load_edge_file(std::string name, } } +void GraphGpuWrapper::load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_edge_and_load( + etype2files, graph_data_local_path, part_num, reverse); +} + void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows @@ -163,14 +288,22 @@ void GraphGpuWrapper::load_node_file(std::string name, std::string filepath) { } } +void GraphGpuWrapper::load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->parse_node_and_load( + ntype2files, graph_data_local_path, part_num); +} + void GraphGpuWrapper::load_node_and_edge(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, int part_num, bool reverse) { - ((GpuPsGraphTable *)graph_table) - ->cpu_graph_table_->load_node_and_edge_file( - etype2files, ntype2files, graph_data_local_path, part_num, reverse); + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table_->load_node_and_edge_file( + etype2files, ntype2files, 
graph_data_local_path, part_num, reverse); } void GraphGpuWrapper::add_table_feat_conf(std::string table_name, @@ -203,8 +336,12 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } void GraphGpuWrapper::init_search_level(int level) { search_level = level; } +gpuStream_t GraphGpuWrapper::get_local_stream(int gpuid) { + return ((GpuPsGraphTable *)graph_table)->get_local_stream(gpuid); +} + void GraphGpuWrapper::init_service() { - table_proto.set_task_pool_size(24); + table_proto.set_task_pool_size(64); table_proto.set_shard_num(1000); table_proto.set_build_sampler_on_cpu(false); table_proto.set_search_level(search_level); @@ -226,11 +363,14 @@ void GraphGpuWrapper::init_service() { std::make_shared(device_id_mapping); resource->enable_p2p(); GpuPsGraphTable *g = new GpuPsGraphTable(resource, 1, id_to_edge.size()); - g->init_cpu_table(table_proto); + size_t gpu_num = device_id_mapping.size(); + g->init_cpu_table(table_proto, gpu_num); g->cpu_graph_table_->set_feature_separator(feature_separator_); g->cpu_graph_table_->set_slot_feature_separator(slot_feature_separator_); graph_table = (char *)g; + upload_num = gpu_num; upload_task_pool.reset(new ::ThreadPool(upload_num)); + } void GraphGpuWrapper::finalize() { @@ -267,6 +407,10 @@ void GraphGpuWrapper::upload_batch(int type, // feature table void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { + if (type == 1 && (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH)) { + return ; + } std::vector> node_ids; ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->get_all_id(type, slice_num, &node_ids); @@ -278,7 +422,7 @@ void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { VLOG(0) << "begin make_gpu_ps_graph_fea, node_ids[" << i << "]_size[" << node_ids[i].size() << "]"; GpuPsCommGraphFea sub_graph = - g->cpu_graph_table_->make_gpu_ps_graph_fea(node_ids[i], slot_num); + g->cpu_graph_table_->make_gpu_ps_graph_fea(i, node_ids[i], slot_num); // sub_graph.display_on_cpu(); VLOG(0) << "begin build_graph_fea_on_single_gpu, node_ids[" << i << "]_size[" << node_ids[i].size() << "]"; @@ -293,6 +437,32 @@ void GraphGpuWrapper::upload_batch(int type, int slice_num, int slot_num) { debug_gpu_memory_info("upload_batch feature end"); } +//get sub_graph_fea +std::vector GraphGpuWrapper::get_sub_graph_fea(std::vector> &node_ids, int slot_num) { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + std::vector> tasks; + std::vector sub_graph_feas(node_ids.size()); + for (int i = 0; i < node_ids.size(); i++) { + tasks.push_back(upload_task_pool->enqueue([&, i, this]() -> int { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + sub_graph_feas[i] = + g->cpu_graph_table_->make_gpu_ps_graph_fea(i, node_ids[i], slot_num); + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return sub_graph_feas; +} + +//build_gpu_graph_fea +void GraphGpuWrapper::build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, int i) { + GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; + g->build_graph_fea_on_single_gpu(sub_graph_fea, i); + sub_graph_fea.release_on_cpu(); + VLOG(0) << "sub graph fea on gpu " << i << " is built"; + return ; +} + NeighborSampleResult GraphGpuWrapper::graph_neighbor_sample_v3( NeighborSampleQuery q, bool cpu_switch, bool compress = true) { return ((GpuPsGraphTable *)graph_table) @@ -382,9 +552,6 @@ 
std::vector GraphGpuWrapper::graph_neighbor_sample( res.push_back(cpu_key[i * sample_size + j]); } } - /* for(int i = 0;i < res.size();i ++) { */ - /* VLOG(0) << i << " " << res[i]; */ - /* } */ delete[] actual_sample_size; cudaFree(cuda_key); return res; @@ -416,6 +583,31 @@ void GraphGpuWrapper::export_partition_files(int idx, std::string file_path) { return ((GpuPsGraphTable *)graph_table) ->cpu_graph_table_->export_partition_files(idx, file_path); } + +void GraphGpuWrapper::release_graph() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph(); +} + +void GraphGpuWrapper::release_graph_edge() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_edge(); +} + +void GraphGpuWrapper::release_graph_node() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->release_graph_node(); +} + +std::vector &GraphGpuWrapper::get_graph_total_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_total_keys_; +} + +std::vector> &GraphGpuWrapper::get_graph_type_keys() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->graph_type_keys_; +} + +std::unordered_map &GraphGpuWrapper::get_graph_type_to_index() { + return ((GpuPsGraphTable *)graph_table)->cpu_graph_table_->type_to_index_; +} + #endif } // namespace framework }; // namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 644fb0792cd495..a3cad68f3bb885 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,6 +22,14 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_HETERPS + +enum GpuGraphStorageMode { + WHOLE_HBM = 1, + MEM_EMB_AND_GPU_GRAPH, + MEM_EMB_FEATURE_AND_GPU_GRAPH, + SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH +}; + class GraphGpuWrapper { public: static std::shared_ptr GetInstance() { @@ -31,6 +39,8 @@ class GraphGpuWrapper { return s_instance_; } static std::shared_ptr s_instance_; + void init_conf(const std::string& first_node_type, + const std::string& meta_path); void initialize(); void finalize(); void set_device(std::vector ids); @@ -42,12 +52,22 @@ class GraphGpuWrapper { int slice_num, const std::string& edge_type); void upload_batch(int type, int slice_num, int slot_num); + std::vector get_sub_graph_fea(std::vector> &node_ids, int slot_num); + void build_gpu_graph_fea(GpuPsCommGraphFea &sub_graph_fea, int i); void add_table_feat_conf(std::string table_name, std::string feat_name, std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_edge_file(std::string etype2files, + std::string graph_data_local_path, + int part_num, + bool reverse); + void load_node_file(std::string name, std::string filepath); + void load_node_file(std::string ntype2files, + std::string graph_data_local_path, + int part_num); void load_node_and_edge(std::string etype2files, std::string ntype2files, std::string graph_data_local_path, @@ -96,6 +116,7 @@ class GraphGpuWrapper { NeighborSampleResultV2 graph_neighbor_sample_all_edge_type( int gpu_id, int edge_type_len, uint64_t* key, int sample_size, int len, std::vector> edge_type_graphs); + gpuStream_t get_local_stream(int gpuid); std::vector graph_neighbor_sample(int gpu_id, int idx, std::vector& key, @@ -112,6 +133,13 @@ class GraphGpuWrapper { int slot_num, int* d_slot_feature_num_map, int fea_num_per_node); + void release_graph(); + void release_graph_edge(); + 
void release_graph_node(); + void init_type_keys(); + std::vector& get_graph_total_keys(); + std::vector>& get_graph_type_keys(); + std::unordered_map& get_graph_type_to_index(); std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; @@ -126,6 +154,18 @@ class GraphGpuWrapper { int upload_num = 8; std::shared_ptr<::ThreadPool> upload_task_pool; std::string feature_separator_ = std::string(" "); + bool conf_initialized_ = false; + std::vector first_node_type_; + std::vector> meta_path_; + + std::vector> finish_node_type_; + std::vector> node_type_start_; + std::vector> global_infer_node_type_start_; + std::vector infer_cursor_; + std::vector cursor_; + std::vector>> + d_graph_all_type_total_keys_; + std::vector> h_graph_all_type_keys_len_; std::string slot_feature_separator_ = std::string(" "); }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 18fb2eca5b752e..05c254b2739f22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -124,6 +124,12 @@ class HashTable { size_t len, StreamType stream); + template + void insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream); + template void insert(const KeyType* d_keys, size_t len, @@ -153,6 +159,9 @@ class HashTable { template void dump_to_cpu(int devid, StreamType stream); + template + void get_keys(KeyType* d_out, uint64_t* global_cursor, StreamType stream); + #if defined(PADDLE_WITH_CUDA) template @@ -185,7 +194,7 @@ class HashTable { #endif int size() { return container_->size(); } - + thrust::pair* data() { return container_->data(); } void set_feature_value_size(size_t pull_feature_value_size, size_t push_grad_value_size) { pull_feature_value_size_ = pull_feature_value_size; @@ -194,6 +203,12 @@ class HashTable { << " push value size: " << push_grad_value_size_; } + int prefetch(const int dev_id, cudaStream_t stream = 0) { + return container_->prefetch(dev_id, stream); + } + + void clear(cudaStream_t stream = 0) { container_->clear_async(stream); } + void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 1fda5a586a2e81..33b50f789a49cb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -31,6 +31,35 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + size_t len, + uint64_t* global_num) { + ReplaceOp op; + thrust::pair kv; + + __shared__ uint64_t local_num; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + + if (i < len) { + kv.first = keys[i]; + kv.second = 1; // fake value + auto it = table->insert(kv, op, &local_num); + assert(it != table->end() && "error: insert fails: table is full"); + } + __syncthreads(); + + if (threadIdx.x == 0) { + atomicAdd(global_num, local_num); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -38,7 +67,6 @@ __global__ void insert_kernel(Table* table, size_t len) { ReplaceOp op; thrust::pair kv; - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { kv.first = keys[i]; @@ -139,6 +167,41 @@ __global__ void 
dy_mf_update_kernel(Table* table, } } +template +__global__ void get_keys_kernel(Table* table, + typename Table::key_type* d_out, + uint64_t* global_cursor, + uint64_t unused_key) { + extern __shared__ typename Table::key_type local_key[]; + __shared__ uint64_t local_num; + __shared__ uint64_t global_num; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIdx.x == 0) { + local_num = 0; + } + __syncthreads(); + uint64_t len = table->size(); + if (idx < len) { + typename Table::value_type val = *(table->data() + idx); + if (val.first != unused_key) { + uint64_t dst = atomicAdd(&local_num, 1); + local_key[dst] = val.first; + } + } + + __syncthreads(); + + if (threadIdx.x == 0) { + global_num = atomicAdd(global_cursor, local_num); + } + __syncthreads(); + + if (threadIdx.x < local_num) { + d_out[global_num + threadIdx.x] = local_key[threadIdx.x]; + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -211,6 +274,20 @@ void HashTable::get(const KeyType* d_keys, container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } +template +template +void HashTable::insert(const KeyType* d_keys, + size_t len, + uint64_t* global_num, + StreamType stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>( + container_, d_keys, len, global_num); +} + template template void HashTable::insert(const KeyType* d_keys, @@ -225,6 +302,20 @@ void HashTable::insert(const KeyType* d_keys, container_, d_keys, d_vals, len); } +template +template +void HashTable::get_keys(KeyType* d_out, + uint64_t* global_cursor, + StreamType stream) { + size_t len = container_->size(); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + KeyType unuse_key = std::numeric_limits::max(); + size_t shared_mem_size = sizeof(KeyType) * BLOCK_SIZE_; + get_keys_kernel<<>>( + container_, d_out, global_cursor, unuse_key); +} + + template template void HashTable::insert(const KeyType* d_keys, @@ -436,6 +527,17 @@ template void HashTable::insert( size_t len, cudaStream_t stream); +template void HashTable::get_keys( + unsigned long* d_out, + unsigned long* global_cursor, + cudaStream_t stream); + +template void HashTable::insert( + const unsigned long* d_keys, + unsigned long len, + uint64_t* global_num, + cudaStream_t stream); + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 82532836b8e229..cf6c4eaf8b99ac 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -1127,13 +1127,22 @@ void HeterComm::split_input_to_shard( AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_idx_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + auto d_shard_index_tmp = + memory::Alloc(place, + len * sizeof(int), + phi::Stream(reinterpret_cast(stream))); int* d_shard_index_tmp_ptr = 
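The new counting insert_kernel overload and get_keys_kernel above both accumulate into a __shared__ per-block counter and publish it with a single atomicAdd on the global counter, so global-atomic traffic scales with the number of blocks rather than the number of keys. A free-standing CUDA sketch of that aggregation pattern, counting used slots in a key array (no concurrent hash table involved):

// Sketch: per-block aggregation with a shared counter and one global
// atomicAdd per block, mirroring insert_kernel / get_keys_kernel above.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

__global__ void count_used_slots(const uint64_t* keys, size_t len,
                                 uint64_t unused_key,
                                 unsigned long long* global_num) {
  __shared__ unsigned long long local_num;
  if (threadIdx.x == 0) local_num = 0;
  __syncthreads();

  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len && keys[i] != unused_key) {
    atomicAdd(&local_num, 1ULL);  // cheap: shared-memory atomic
  }
  __syncthreads();

  if (threadIdx.x == 0) {
    atomicAdd(global_num, local_num);  // one global atomic per block
  }
}

int main() {
  const size_t len = 1 << 16;
  const uint64_t unused = UINT64_MAX;
  uint64_t* d_keys;
  unsigned long long* d_num;
  cudaMalloc((void**)&d_keys, len * sizeof(uint64_t));
  cudaMalloc((void**)&d_num, sizeof(unsigned long long));
  cudaMemset(d_keys, 0, len * sizeof(uint64_t));  // all keys "used" (== 0)
  cudaMemset(d_num, 0, sizeof(unsigned long long));

  count_used_slots<<<(len + 255) / 256, 256>>>(d_keys, len, unused, d_num);

  unsigned long long h_num = 0;
  cudaMemcpy(&h_num, d_num, sizeof(h_num), cudaMemcpyDeviceToHost);
  printf("used slots: %llu\n", h_num);  // expect len
  cudaFree(d_keys);
  cudaFree(d_num);
  return 0;
}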
reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); @@ -1153,7 +1162,10 @@ void HeterComm::split_input_to_shard( num_bits, stream); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, + temp_storage_bytes, + phi::Stream(reinterpret_cast(stream))); heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 4696a7cc91b5ae..22d6199584f58c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -96,6 +96,53 @@ class HBMMemoryPool : public managed { size_t block_size_; }; +class HBMMemoryPoolFix : public managed { + public: + HBMMemoryPoolFix() { + capacity_ = 0; + size_ = 0 ; + block_size_ = 0; + max_byte_capacity_ = 0; + } + + ~HBMMemoryPoolFix() { + VLOG(3) << "delete hbm memory pool"; + cudaFree(mem_); + } + + size_t block_size() { return block_size_; } + + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity, size_t block_size) { + if (max_byte_capacity_ < capacity * block_size) { + if (mem_ != NULL) { + cudaFree(mem_); + } + max_byte_capacity_ = (block_size * capacity / 8 + 1) * 8; + CUDA_CHECK(cudaMalloc(&mem_, max_byte_capacity_)); + } + size_ = capacity; + block_size_ = block_size; + capacity_ = max_byte_capacity_ / block_size; + } + + char* mem() { return mem_; } + + size_t capacity() { return capacity_; } + size_t size() { return size_; } + __forceinline__ __device__ void* mem_address(const uint32_t& idx) { + return (void*)&mem_[(idx)*block_size_]; + } + + private: + char* mem_ = NULL; + size_t capacity_; + size_t size_; + size_t block_size_; + size_t max_byte_capacity_; +}; + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 9ccd724d533019..0b603133efabbd 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -31,16 +31,18 @@ limitations under the License. 
*/ #include #include +#include #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif DECLARE_int32(gpugraph_dedup_pull_push_mode); -DECLARE_int32(gpugraph_sparse_table_storage_mode); +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -112,6 +114,105 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, use_afs_api_ = 1; } #endif + +void PSGPUWrapper::add_key_to_local(const std::vector& vec_data) { + size_t total_len = vec_data.size(); + size_t len_per_thread = total_len / thread_keys_thread_num_; + size_t begin = 0; + std::vector threads; + + int remain = total_len % thread_keys_thread_num_; + auto gen_graph_data_func = [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + }; + auto gen_graph_dynamic_mf_func = [this]( + const std::vector& total_data, + int begin_index, + int end_index, + int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; + iter++) { + uint64_t cur_key = *iter; + int shard_id = cur_key % thread_keys_shard_num_; + // TODO: feasign <-> slot <-> multi_dim + this->thread_dim_keys_[i][shard_id][0].insert(cur_key); + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } else { + threads.push_back( + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); + } + begin += len_per_thread + (i < remain ? 
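add_key_to_local above carves the flat key vector into per-thread ranges and routes each key into a thread-local shard bucket chosen by key % thread_keys_shard_num_, so the later merge and unique steps can proceed shard-by-shard without locking. A compact sketch of that partitioning scheme (std::unordered_set and the sizes below are stand-ins for whatever container and configuration PSGPUWrapper actually uses):

// Sketch: shard keys per thread by key % shard_num, then merge shard-by-shard.
#include <cstdint>
#include <iostream>
#include <thread>
#include <unordered_set>
#include <vector>

int main() {
  const int thread_num = 4, shard_num = 8;  // illustrative sizes
  std::vector<uint64_t> keys(100000);
  for (size_t i = 0; i < keys.size(); ++i) keys[i] = i * 2654435761ULL;

  // thread_keys[t][s] collects the keys of shard s seen by thread t.
  std::vector<std::vector<std::unordered_set<uint64_t>>> thread_keys(
      thread_num, std::vector<std::unordered_set<uint64_t>>(shard_num));

  std::vector<std::thread> workers;
  size_t per_thread = keys.size() / thread_num;
  for (int t = 0; t < thread_num; ++t) {
    size_t begin = t * per_thread;
    size_t end = (t == thread_num - 1) ? keys.size() : begin + per_thread;
    workers.emplace_back([&, t, begin, end] {
      for (size_t i = begin; i < end; ++i)
        thread_keys[t][keys[i] % shard_num].insert(keys[i]);
    });
  }
  for (auto& w : workers) w.join();

  // Merging shard s only touches thread_keys[*][s], so shards merge in parallel.
  size_t total = 0;
  for (int s = 0; s < shard_num; ++s)
    for (int t = 0; t < thread_num; ++t) total += thread_keys[t][s].size();
  std::cout << "collected " << total << " keys\n";
  return 0;
}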
1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } +} + +void PSGPUWrapper::add_key_to_gputask(std::shared_ptr gpu_task) { + std::vector threads; + platform::Timer timeline; + timeline.Start(); + // merge thread_keys to shard_keys + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + for (int i = 0; i < thread_keys_shard_num_; ++i) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } + } + for (auto& t : threads) { + t.join(); + } + timeline.Pause(); + + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; + timeline.Start(); + gpu_task->UniqueKeys(); + timeline.Pause(); + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; +} + +void PSGPUWrapper::resize_gputask(std::shared_ptr gpu_task) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + if (i == 0 && j == multi_mf_dim_ - 1) { + gpu_task->feature_dim_keys_[i][j].push_back(0); + } + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } +} + void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; @@ -238,106 +339,298 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "PreBuild in GpuGraph mode"; SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); const std::vector& vec_data = dataset->GetGpuGraphTotalKeys(); + VLOG(0) << "GpuGraphTotalKeys: " << vec_data.size(); + timeline.Start(); + add_key_to_local(vec_data); + timeline.Pause(); + VLOG(0) << "add_key_to_local cost " << timeline.ElapsedSec() << " seconds."; + } - total_len = vec_data.size(); - len_per_thread = total_len / thread_keys_thread_num_; - VLOG(0) << "GpuGraphTotalKeys: " << total_len; - remain = total_len % thread_keys_thread_num_; - auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); - } + add_key_to_gputask(gpu_task); +} + +void PSGPUWrapper::add_slot_feature(std::shared_ptr gpu_task) { + platform::Timer timeline; + platform::Timer time_stage; + timeline.Start(); + // shard the data across the 8 GPU cards + size_t device_num = heter_devices_.size(); + std::vector threads; + size_t slot_num = slot_vector_.size() - 1;//node slot 9008 in slot_vector + auto& local_dim_keys = gpu_task->feature_dim_keys_; + double divide_nodeid_cost = 0; + double get_feature_id_cost = 0; + double add_feature_to_set_cost = 0; + double add_feature_to_key_cost = 0; + + std::vector> node_ids(device_num); + size_t node_num = 0; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + node_num += local_dim_keys[i][j].size(); + } + } + for (auto &node_id_vector : node_ids){ + node_id_vector.reserve(node_num * 1.2 / device_num); + } + + auto& device_dim_mutex = gpu_task->dim_mutex_; + + auto divide_nodeid_to_device = [this, + device_num, + &local_dim_keys, + &node_ids, + &device_dim_mutex](int i, int j) { + std::vector> task_keys(device_num); + size_t
batch = 10000; + for (size_t k = 0; k < device_num; k++) { + task_keys[k].reserve(batch * 1.2 / device_num); + } + std::vector shuffle_device = shuffle_int_vector(device_num); + size_t start = 0; + while (start < local_dim_keys[i][j].size()) { + if (batch + start > local_dim_keys[i][j].size()) { + batch = local_dim_keys[i][j].size() - start; + } + for (size_t k = start; k < (start + batch); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + } + // allocate local keys to devices + for (auto dev : shuffle_device) { + device_dim_mutex[dev][0]->lock(); + int len = task_keys[dev].size(); + for (int k = 0; k < len; ++k) { + node_ids[dev].push_back(task_keys[dev][k]); + } + device_dim_mutex[dev][0]->unlock(); + task_keys[dev].clear(); + } + start += batch; + } }; - auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, - int begin_index, - int end_index, - int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; - iter++) { - uint64_t cur_key = *iter; - int shard_id = cur_key % thread_keys_shard_num_; - // TODO: feasign <-> slot <-> multi_dim - this->thread_dim_keys_[i][shard_id][0].insert(cur_key); - } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(1) << "psgpu graph wrapper genfunc"; - threads.push_back( - std::thread(gen_graph_data_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - } else { - VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_graph_dynamic_mf_func, - std::ref(vec_data), - begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + time_stage.Start(); + + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(divide_nodeid_to_device, i, j); } - begin += len_per_thread + (i < remain ? 
1 : 0); } for (std::thread& t : threads) { t.join(); } - } + threads.clear(); + time_stage.Pause(); + divide_nodeid_cost = time_stage.ElapsedSec(); + gpu_task->sub_graph_feas = (void *) (new std::vector); + std::vector &sub_graph_feas = *((std::vector *) gpu_task->sub_graph_feas); + std::vector> feature_ids(device_num); + std::vector feature_list(device_num); + std::vector feature_list_size(device_num); + size_t batch = 40000; + + time_stage.Start(); + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_AND_GPU_GRAPH) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto h_slot_feature_num_map = gpu_graph_ptr->slot_feature_num_map(); + int fea_num_per_node = 0; + for (size_t i = 0; i < slot_num; ++i) { + fea_num_per_node += h_slot_feature_num_map[i]; + } - timeline.Start(); + auto get_feature_id = [this, slot_num, batch, fea_num_per_node, &h_slot_feature_num_map, &node_ids, &feature_ids](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + int * d_slot_feature_num_map; + uint64_t * d_node_list_ptr; + uint64_t * d_feature_list_ptr; + CUDA_CHECK(cudaMalloc((void**)&d_slot_feature_num_map, slot_num * sizeof(int))); + CUDA_CHECK(cudaMemcpy(d_slot_feature_num_map, h_slot_feature_num_map.data(), + sizeof(int) * slot_num, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_node_list_ptr, batch * sizeof(uint64_t))); + CUDA_CHECK(cudaMalloc((void**)&d_feature_list_ptr, batch * fea_num_per_node * sizeof(uint64_t))); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + uint64_t pos = 0; + size_t real_batch = 0; + feature_ids[i].resize(node_ids[i].size() * fea_num_per_node); + while (pos < node_ids[i].size()) { + real_batch = (pos + batch) <= node_ids[i].size() ? batch : node_ids[i].size() - pos; + CUDA_CHECK(cudaMemcpy(d_node_list_ptr, + node_ids[i].data() + pos, + real_batch * sizeof(uint64_t), + cudaMemcpyHostToDevice)); + int ret = gpu_graph_ptr->get_feature_of_nodes(i, + d_node_list_ptr, + d_feature_list_ptr, + real_batch, + slot_num, + d_slot_feature_num_map, + fea_num_per_node); + PADDLE_ENFORCE_EQ( + ret, + 0, + platform::errors::PreconditionNotMet( + "get_feature_of_nodes error")); + + CUDA_CHECK(cudaMemcpy(feature_ids[i].data() + pos * fea_num_per_node, + d_feature_list_ptr, + real_batch * fea_num_per_node * sizeof(uint64_t), + cudaMemcpyDeviceToHost)); + pos += real_batch; + } + cudaFree(d_slot_feature_num_map); + cudaFree(d_node_list_ptr); + cudaFree(d_feature_list_ptr); + }; - threads.clear(); - // merge thread_keys to shard_keys - auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys( - shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); - thread_dim_keys_[i][shard_num][dim_id].clear(); + threads.resize(device_num); + for (size_t i = 0; i < device_num; i++) { + threads[i] = std::thread(get_feature_id, i); + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + for (size_t i = 0; i < device_num; i++) { + feature_list[i] = feature_ids[i].data(); + feature_list_size[i] = feature_ids[i].size(); + } } - }; - for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + else if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + 
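In the MEM_EMB_AND_GPU_GRAPH branch above, features are pulled through fixed-size device staging buffers in batches (batch = 40000 node ids per round trip), so staging memory stays bounded no matter how many nodes a card owns. A generic sketch of that batching loop; fake_feature_lookup below is only a stand-in for gpu_graph_ptr->get_feature_of_nodes.

// Sketch: stream node ids through fixed-size device buffers in batches.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the real feature lookup: fea = node_id * 10 + slot.
__global__ void fake_feature_lookup(const uint64_t* nodes, uint64_t* feas,
                                    size_t n, int fea_per_node) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    for (int k = 0; k < fea_per_node; ++k)
      feas[i * fea_per_node + k] = nodes[i] * 10 + k;
}

int main() {
  const size_t node_num = 100000, batch = 4096;
  const int fea_per_node = 3;
  std::vector<uint64_t> nodes(node_num), feas(node_num * fea_per_node);
  for (size_t i = 0; i < node_num; ++i) nodes[i] = i + 1;

  uint64_t *d_nodes, *d_feas;  // sized for one batch only
  cudaMalloc((void**)&d_nodes, batch * sizeof(uint64_t));
  cudaMalloc((void**)&d_feas, batch * fea_per_node * sizeof(uint64_t));

  for (size_t pos = 0; pos < node_num;) {
    size_t real_batch = (pos + batch <= node_num) ? batch : node_num - pos;
    cudaMemcpy(d_nodes, nodes.data() + pos, real_batch * sizeof(uint64_t),
               cudaMemcpyHostToDevice);
    fake_feature_lookup<<<(real_batch + 255) / 256, 256>>>(
        d_nodes, d_feas, real_batch, fea_per_node);
    cudaMemcpy(feas.data() + pos * fea_per_node, d_feas,
               real_batch * fea_per_node * sizeof(uint64_t),
               cudaMemcpyDeviceToHost);
    pos += real_batch;
  }
  printf("last feature: %llu\n", (unsigned long long)feas.back());
  cudaFree(d_nodes);
  cudaFree(d_feas);
  return 0;
}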
auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + sub_graph_feas = gpu_graph_ptr->get_sub_graph_fea(node_ids, slot_num); + for (size_t i = 0; i < device_num; i++) { + feature_list[i] = sub_graph_feas[i].feature_list; + feature_list_size[i] = sub_graph_feas[i].feature_size; + } } - } - for (auto& t : threads) { - t.join(); - } - timeline.Pause(); + else { + PADDLE_ENFORCE_EQ( + 1, + 0, + " FLAGS_gpugraph_storage_mode is not adaptived "); + } + time_stage.Pause(); + get_feature_id_cost = time_stage.ElapsedSec(); + size_t feature_num = 0; + for (size_t i = 0; i < device_num; i++) { + feature_num += feature_list_size[i]; + } + VLOG(0) << "feature_num is " << feature_num << " node_num num is " << node_num; - VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() - << " seconds."; - timeline.Start(); - gpu_task->UniqueKeys(); - timeline.Pause(); + size_t set_num = device_num * 8; + std::vector> feature_id_set(set_num); + std::vector set_mutex(set_num); - VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); + auto add_feature_to_set = [this, set_num, &feature_list, &feature_id_set, &set_mutex] (int dev, size_t start, size_t end) { + size_t batch = 10000 * set_num; + std::vector> feature_list_tmp(set_num); + for (size_t i = 0; i < set_num; i++) { + feature_list_tmp[i].reserve((batch * 1.2) /set_num); + } + std::vector shuffle_set_index = shuffle_int_vector(set_num); + size_t pos = start; + size_t real_batch = 0; + while (pos < end) { + real_batch = (pos + batch <= end) ? batch : end - pos; + for (size_t i = pos; i < pos + real_batch; i++) { + if (feature_list[dev][i] == 0) { + continue; + } + int shard_num = feature_list[dev][i] % set_num; + feature_list_tmp[shard_num].push_back(feature_list[dev][i]); + } + // uniq in local + for (size_t i = 0; i < set_num; i++) { + std::sort(feature_list_tmp[i].begin(), feature_list_tmp[i].end()); + size_t idx = 0; + size_t total = feature_list_tmp[i].size(); + for (size_t j = 0; j < total; j++) { + auto &k = feature_list_tmp[i][j]; + if (idx > 0 && feature_list_tmp[i][idx - 1] == k) { + continue; + } + feature_list_tmp[i][idx] = k; + ++idx; + } + feature_list_tmp[i].resize(idx); + } + // uniq in global + for (auto set_index : shuffle_set_index) { + set_mutex[set_index].lock(); + for (auto feature_id : feature_list_tmp[set_index]) { + feature_id_set[set_index].insert(feature_id); + } + set_mutex[set_index].unlock(); + feature_list_tmp[set_index].clear(); + } + pos += real_batch; + } + }; + size_t device_thread_num = 8; + threads.resize(device_num * device_thread_num); + time_stage.Start(); + for (size_t i = 0; i < device_num; i++) { + size_t start = 0; + for (size_t j = 0; j < device_thread_num; j++) { + size_t batch = feature_list_size[i] / device_thread_num; + if (j < feature_list_size[i] % device_thread_num) { + batch += 1; + } + threads[i * device_thread_num + j] = std::thread(add_feature_to_set, i, start, start + batch); + start += batch; } - VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] - << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); - gpu_task->value_dim_ptr_[i][j].resize( - gpu_task->feature_dim_keys_[i][j].size()); } - } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + time_stage.Pause(); + add_feature_to_set_cost = time_stage.ElapsedSec(); + auto add_feature_to_key = 
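add_feature_to_set above deduplicates feature ids in two stages: each worker sorts and uniques its per-shard batch locally, then merges it into one of set_num global sets while holding only that set's mutex (and visits the sets in a shuffled order to spread contention). A simplified standalone sketch of the same two-stage dedup:

// Sketch: local sort+unique per shard, then merge into sharded global sets
// under per-set mutexes (contention is limited to one set at a time).
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_set>
#include <vector>

int main() {
  const int set_num = 8, worker_num = 4;
  std::vector<std::unordered_set<uint64_t>> feature_id_set(set_num);
  std::vector<std::mutex> set_mutex(set_num);

  std::vector<uint64_t> features(200000);
  for (size_t i = 0; i < features.size(); ++i) features[i] = i % 5000;  // many dups

  auto worker = [&](size_t begin, size_t end) {
    std::vector<std::vector<uint64_t>> local(set_num);
    for (size_t i = begin; i < end; ++i)
      if (features[i] != 0)  // 0 is treated as a padding id
        local[features[i] % set_num].push_back(features[i]);
    for (int s = 0; s < set_num; ++s) {
      auto& v = local[s];
      std::sort(v.begin(), v.end());
      v.erase(std::unique(v.begin(), v.end()), v.end());  // local dedup first
      std::lock_guard<std::mutex> guard(set_mutex[s]);
      feature_id_set[s].insert(v.begin(), v.end());       // global dedup
    }
  };

  std::vector<std::thread> workers;
  size_t per = features.size() / worker_num;
  for (int w = 0; w < worker_num; ++w)
    workers.emplace_back(worker, w * per,
                         w == worker_num - 1 ? features.size() : (w + 1) * per);
  for (auto& t : workers) t.join();

  size_t uniq = 0;
  for (auto& s : feature_id_set) uniq += s.size();
  std::cout << "unique feature ids: " << uniq << "\n";  // 4999 (id 0 skipped)
  return 0;
}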
[this, device_num, &feature_id_set, &local_dim_keys, set_num](int dev) { + // set_num = device_num * 8, a % set_num = b , a = set_num * m + b , a % device_num = b % device_num + size_t key_num = 0; + for (size_t i = dev; i < set_num; i += device_num) { + key_num += feature_id_set[i].size(); + } + VLOG(0) << " feature_num is " << key_num << " for device: " << dev; + local_dim_keys[dev][0].reserve(local_dim_keys[dev][0].size() + key_num); + for (size_t i = dev; i < set_num; i += device_num) { + for (auto it = feature_id_set[i].begin(); it != feature_id_set[i].end(); it++) { + local_dim_keys[dev][0].push_back(*it); + } + feature_id_set[i].clear(); + } + }; + time_stage.Start(); + threads.resize(device_num); + for (size_t i = 0; i < device_num; i++) { + threads[i] = std::thread(add_feature_to_key, i); + } + for (std::thread& t : threads) { + t.join(); + } + time_stage.Pause(); + add_feature_to_key_cost = time_stage.ElapsedSec(); + threads.clear(); + timeline.Pause(); + VLOG(0) << " add_slot_feature costs: " << timeline.ElapsedSec() << " s." + << " divide_nodeid_cost " << divide_nodeid_cost + << " get_feature_id_cost " << get_feature_id_cost + << " add_feature_to_set_cost " << add_feature_to_set_cost + << " add_feature_to_key_cost " << add_feature_to_key_cost; } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { platform::Timer timeline; + size_t slot_num = slot_vector_.size() - 1; //node slot 9008 in slot_vector + if (slot_num > 0 && FLAGS_gpugraph_storage_mode != paddle::framework::GpuGraphStorageMode::WHOLE_HBM) { + add_slot_feature(gpu_task); + } + resize_gputask(gpu_task); auto& local_dim_keys = gpu_task->feature_dim_keys_; auto& local_dim_ptr = gpu_task->value_dim_ptr_; @@ -374,7 +667,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); auto ptl_dynamic_mf_func = - [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr, &gpu_task](int i, + int j) { size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; @@ -415,10 +709,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #ifdef PADDLE_WITH_PSCORE while (true) { auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, local_dim_keys[i][j].data(), - key_size); + key_size, + gpu_task->pass_id_); bool flag = true; tt.wait(); @@ -455,7 +751,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } }; - //fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); threads.resize(thread_keys_shard_num_ * multi_mf_dim_); std::vector> task_futures; @@ -468,7 +763,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { for (auto& f : task_futures) { f.wait(); } - //fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); task_futures.clear(); timeline.Pause(); VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() @@ -481,187 +775,205 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } gloo_wrapper->Barrier(); } - } -void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { - platform::Timer timeline; - int device_num = heter_devices_.size(); - std::vector threads; - std::vector> task_futures; - auto& local_keys = gpu_task->feature_keys_; - auto& local_ptr = gpu_task->value_ptr_; - auto& local_dim_keys = gpu_task->feature_dim_keys_; - auto& local_dim_ptr = gpu_task->value_dim_ptr_; +void PSGPUWrapper::divide_to_device(std::shared_ptr gpu_task) { + platform::Timer timeline; + 
int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; - auto& device_dim_keys = gpu_task->device_dim_keys_; - auto& device_dim_ptr = gpu_task->device_dim_ptr_; - auto& device_dim_mutex = gpu_task->dim_mutex_; - //auto& device_mutex = gpu_task->mutex_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = gpu_task->dim_mutex_; + // auto& device_mutex = gpu_task->mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + } + timeline.Start(); + auto build_pull_dynamic_mf_func = [this, + device_num, + &local_dim_keys, + &local_dim_ptr, + &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { + std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB + std::vector> task_ptrs( + device_num); +#endif - timeline.Start(); - std::vector>> pass_values; - - bool record_status = false; - auto& device_task_keys = gpu_task->device_task_keys_; - auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_pull_dynamic_mf_func = [this, - device_num, - &local_dim_keys, - &local_dim_ptr, - &device_dim_keys, - &device_dim_ptr, - &device_dim_mutex](int i, int j) { - std::vector> task_keys(device_num); - #ifdef PADDLE_WITH_PSLIB - std::vector> task_ptrs( - device_num); - #endif - - #ifdef PADDLE_WITH_PSCORE - std::vector> task_ptrs( - device_num); - #endif - for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { - int shard = local_dim_keys[i][j][k] % device_num; - task_keys[shard].push_back(local_dim_keys[i][j][k]); - task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif + for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); + } + // allocate local keys to devices + std::vector shuffle_device = shuffle_int_vector(device_num); + for (auto dev : shuffle_device) { + device_dim_mutex[dev][j]->lock(); + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; } - // allocate local keys to devices - for (int dev = 0; dev < device_num; dev++) { - device_dim_mutex[dev][j]->lock(); - int len = task_keys[dev].size(); - int cur = device_dim_keys[dev][j].size(); - device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); - device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); - for (int k = 0; k < len; ++k) { - device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; - device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; - } - device_dim_mutex[dev][j]->unlock(); + device_dim_mutex[dev][j]->unlock(); + } + }; + + if 
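build_pull_dynamic_mf_func above buckets a shard's keys and value pointers by key % device_num and appends each bucket to the per-device arrays while holding that device's mutex, visiting devices in the order produced by shuffle_int_vector so concurrent shard threads do not all queue on the same lock. A small sketch of that shuffled-append idea using std::shuffle in place of the Paddle helper:

// Sketch: producers append their buckets to per-device vectors, visiting the
// devices in a per-producer shuffled order to spread lock contention.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <numeric>
#include <random>
#include <thread>
#include <vector>

int main() {
  const int device_num = 8, producer_num = 16;
  std::vector<std::vector<uint64_t>> device_keys(device_num);
  std::vector<std::mutex> device_mutex(device_num);

  auto producer = [&](int pid) {
    // Bucket this producer's keys by key % device_num.
    std::vector<std::vector<uint64_t>> task_keys(device_num);
    for (uint64_t k = pid * 10000; k < (pid + 1) * 10000ULL; ++k)
      task_keys[k % device_num].push_back(k);

    // Visit devices in a shuffled order (the role of shuffle_int_vector).
    std::vector<int> order(device_num);
    std::iota(order.begin(), order.end(), 0);
    std::mt19937 rng(pid);
    std::shuffle(order.begin(), order.end(), rng);

    for (int dev : order) {
      std::lock_guard<std::mutex> guard(device_mutex[dev]);
      auto& dst = device_keys[dev];
      dst.insert(dst.end(), task_keys[dev].begin(), task_keys[dev].end());
    }
  };

  std::vector<std::thread> threads;
  for (int p = 0; p < producer_num; ++p) threads.emplace_back(producer, p);
  for (auto& t : threads) t.join();

  size_t total = 0;
  for (auto& v : device_keys) total += v.size();
  std::cout << "total keys distributed: " << total << "\n";  // 160000
  return 0;
}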
(multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); } - }; - auto build_func = [device_num, - record_status, - &pass_values, - &local_keys, - &local_ptr, - &device_task_keys, - &device_task_ptrs](int i) { - auto& task_keys = device_task_keys[i]; - #ifdef PADDLE_WITH_PSLIB - auto& task_ptrs = device_task_ptrs[i]; - #endif + } + for (std::thread& t : threads) { + t.join(); + } + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} - #ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[i]; - #endif +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + std::vector threads; + std::vector> task_futures; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; - for (size_t j = 0; j < local_keys[i].size(); j++) { - int shard = local_keys[i][j] % device_num; - task_keys[shard].push_back(local_keys[i][j]); - task_ptrs[shard].push_back(local_ptr[i][j]); - } - #ifdef PADDLE_WITH_PSLIB - if (record_status) { - size_t local_keys_size = local_keys.size(); - size_t pass_values_size = pass_values.size(); - for (size_t j = 0; j < pass_values_size; j += local_keys_size) { - auto& shard_values = pass_values[j]; - for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); - pair_idx++) { - auto& cur_pair = shard_values[pair_idx]; - int shard = cur_pair.first % device_num; - task_keys[shard].push_back(cur_pair.first); - task_ptrs[shard].push_back( - (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); - } + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + // auto& device_mutex = gpu_task->mutex_; + + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + + auto build_func = [device_num, + record_status, + &pass_values, + &local_keys, + &local_ptr, + &device_task_keys, + &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[i]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[i]; +#endif + + for (size_t j = 0; j < local_keys[i].size(); j++) { + int shard = local_keys[i][j] % device_num; + task_keys[shard].push_back(local_keys[i][j]); + task_ptrs[shard].push_back(local_ptr[i][j]); + } +#ifdef PADDLE_WITH_PSLIB + if (record_status) { + size_t local_keys_size = local_keys.size(); + size_t pass_values_size = pass_values.size(); + for (size_t j = 0; j < pass_values_size; j += local_keys_size) { + auto& shard_values = pass_values[j]; + for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); + pair_idx++) { + auto& cur_pair = shard_values[pair_idx]; + int shard = cur_pair.first % device_num; + task_keys[shard].push_back(cur_pair.first); + task_ptrs[shard].push_back( + (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); } } - #endif - }; - if (!multi_mf_dim_) { - for (int i = 0; i < thread_keys_shard_num_; i++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); - VLOG(0) << "GpuPs build hbmps done"; } - std::vector> prefix_sum; - 
prefix_sum.resize(device_num); +#endif + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; + } + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; + } + auto calc_prefix_func = [this, + &prefix_sum, + &device_keys, + &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { for (int i = 0; i < device_num; i++) { - prefix_sum[i].resize(thread_keys_shard_num_ + 1); - prefix_sum[i][0] = 0; + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); } - auto calc_prefix_func = [this, - &prefix_sum, - &device_keys, - &device_vals, - &device_task_keys](int device_num) { - for (int j = 0; j < thread_keys_shard_num_; j++) { - prefix_sum[device_num][j + 1] = - prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); - } - device_keys[device_num].resize( - prefix_sum[device_num][thread_keys_shard_num_]); - device_vals[device_num].resize( - prefix_sum[device_num][thread_keys_shard_num_]); - }; - if (!multi_mf_dim_) { - for (int i = 0; i < device_num; i++) { - task_futures.emplace_back( - hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); + for (auto& f : task_futures) { + f.wait(); } - VLOG(0) << "prefix done"; - auto prepare_dev_value_func = [device_num, - &prefix_sum, - &device_keys, - &device_vals, - &device_task_keys, - &device_task_ptrs](int dev, int shard_id) { - // auto& task_keys = device_task_keys[shard_id]; - #ifdef PADDLE_WITH_PSLIB - auto& task_ptrs = device_task_ptrs[shard_id]; - #endif - - // #ifdef PADDLE_WITH_PSCORE - // auto& task_ptrs = device_task_ptrs[shard_id]; - // #endif - - // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; - // int cur = prefix_sum[dev][shard_id]; - #ifdef PADDLE_WITH_PSLIB - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - - val.delta_score = ptr_val[1]; - val.show = ptr_val[2]; - val.clk = ptr_val[3]; - val.slot = ptr_val[6]; - val.lr = ptr_val[4]; - val.lr_g2sum = ptr_val[5]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + task_futures.clear(); + } + VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, + &prefix_sum, + &device_keys, + &device_vals, + &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; + + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = 
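calc_prefix_func above converts per-shard bucket sizes into a per-device prefix sum, so device_keys / device_vals can be resized once and every shard thread later writes its slice at a fixed offset without further coordination. A tiny worked example of that offset computation (the bucket sizes are invented):

// Sketch: per-device prefix sums over shard bucket sizes give each shard a
// fixed write offset into the final per-device arrays.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int device_num = 2, shard_num = 4;
  // bucket_size[shard][device]: how many keys shard s routed to device d.
  int bucket_size[shard_num][device_num] = {{3, 1}, {0, 4}, {2, 2}, {5, 0}};

  std::vector<std::vector<size_t>> prefix_sum(
      device_num, std::vector<size_t>(shard_num + 1, 0));
  for (int d = 0; d < device_num; ++d) {
    for (int s = 0; s < shard_num; ++s)
      prefix_sum[d][s + 1] = prefix_sum[d][s] + bucket_size[s][d];
    // prefix_sum[d][shard_num] is exactly the size device_keys[d] is resized to.
    std::cout << "device " << d << " total keys: " << prefix_sum[d][shard_num]
              << ", shard 2 writes at offset " << prefix_sum[d][2] << "\n";
  }
  return 0;
}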
ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); if (dim > 7) { val.mf_size = MF_DIM + 1; @@ -677,40 +989,29 @@ void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { } #endif VLOG(3) << "GpuPs build hbmps done"; - }; - if (multi_mf_dim_) { - threads.resize(thread_keys_shard_num_ * multi_mf_dim_); - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i * multi_mf_dim_ + j] = - std::thread(build_pull_dynamic_mf_func, i, j); - } - } - for (std::thread& t : threads) { - t.join(); - } - } else { - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < device_num; j++) { - task_futures.emplace_back( - hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); - } - } - for (auto& f : task_futures) { - f.wait(); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); } - task_futures.clear(); } - timeline.Pause(); - VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() - << " seconds."; + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + timeline.Pause(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; } void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { int device_num = heter_devices_.size(); - platform::Timer timeline; - timeline.Start(); + platform::Timer stagetime; + stagetime.Start(); std::vector feature_keys_count(device_num); size_t size_max = 0; @@ -722,7 +1023,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << " dim index: " << j << " contains feasign nums: " << gpu_task->device_dim_ptr_[i][j].size(); } - VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + VLOG(0) << i << " card with dynamic mf contains feasign nums total: " << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } @@ -745,87 +1046,28 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif + stagetime.Pause(); + VLOG(0) << "card: " + << " BuildGPUTask create HeterPs_ costs: " << stagetime.ElapsedSec() + << " s."; + stagetime.Start(); - auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_size:" - << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - size_t feature_value_size = - accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr]( + int i, int j, size_t start, size_t end) { + // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - }; - auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, - int j) { - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); 
+ size_t real_len = end - start; + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + char* test_build_values = build_values.get(); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - - this->HeterPs_->build_ps(i, - device_dim_keys.data(), - cur_pool->mem(), - len, - feature_value_size, - 500000, - 2); - if (device_dim_keys.size() > 0) { - VLOG(3) << "show table: " << i - << " table kv size: " << device_dim_keys.size() - << "dim: " << mf_dim << " len: " << len; - HeterPs_->show_one_table(i); - } - delete mem_pool; - }; - int thread_num = 16; - auto build_dynamic_mf_func = [this, - &gpu_task, - thread_num, - &accessor_wrapper_ptr](int i, int j, int z) { - // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - // this->mem_pools_[i * this->multi_mf_dim_ + j] = - // new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - - // ============ add for multi-thread ================ - size_t len_per_thread = len / thread_num; - size_t remain = len % thread_num; - size_t left = 0, right = 0; - - size_t real_len = len_per_thread; - if ((size_t)z < remain) real_len++; - - if ((size_t)z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } - // ============ add for multi-thread ================ - - for (size_t k = left; k < right; k++) { + for (size_t k = start; k < end; k++) { #ifdef PADDLE_WITH_PSLIB - float* val = (float*)(mem_pool->mem_address(k)); + float* val = + (float*)(test_build_values + (k - start) * feature_value_size); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); val->delta_score = @@ -859,54 +1101,141 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } #endif #ifdef PADDLE_WITH_PSCORE - void* val = mem_pool->mem_address(k); + void* val = + (float*)(test_build_values + (k - start) * feature_value_size); accessor_wrapper_ptr->BuildFill( val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); #endif } + task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = 0; + task.end = int(real_len); + cpu_reday_channels_[i]->Put(task); }; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + std::vector threads(multi_mf_dim_); for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + this->hbm_pools_[i * this->multi_mf_dim_ + j]->reset(len, + feature_value_size); + + 
auto build_ps_thread = + [this, &gpu_task]( + int i, int j, size_t len, size_t feature_value_size) { + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + this->HeterPs_->build_ps( + i, + device_dim_keys.data(), + this->hbm_pools_[i * this->multi_mf_dim_ + j]->mem(), + len, + feature_value_size, + 500000, + 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << this->index_dim_vec_[j] << " len: " << len; + HeterPs_->show_one_table(i); + } + }; + threads[j] = std::thread(build_ps_thread, i, j, len, feature_value_size); + } + //build feature table + size_t slot_num = slot_vector_.size() - 1;//node slot 9008 in slot_vector + if (slot_num > 0 && (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH)) { + auto build_feature_table = [this, &gpu_task](int i) { + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + std::vector * tmp = (std::vector *) gpu_task->sub_graph_feas; + gpu_graph_ptr->build_gpu_graph_fea((*tmp)[i], i); + }; + threads.push_back(std::thread(build_feature_table, i)); } - } - for (std::thread& t : threads) { - t.join(); - } - threads.clear(); + struct task_info task; + while (cpu_reday_channels_[i]->Get(task)) { + auto hbm = this->hbm_pools_[task.device_id * this->multi_mf_dim_ + + task.multi_mf_dim] + ->mem(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto hbm_start = hbm + task.offset * feature_value_size; + CUDA_CHECK( + cudaMemcpy(hbm_start, + task.build_values.get() + task.start * feature_value_size, + (task.end - task.start) * feature_value_size, + cudaMemcpyHostToDevice)); + } + platform::Timer stagetime; + stagetime.Start(); + for (std::thread& t : threads) { + t.join(); + } + stagetime.Pause(); + VLOG(0) << "card: " << i + << " BuildGPUTask build_ps async costs: " << stagetime.ElapsedSec() + << " s."; + }; + + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; - // multi-thread process - threads.resize(device_num * multi_mf_dim_ * thread_num); + int once_gpu_copy = 64 * 1024; + threads.resize(device_num * multi_mf_dim_); for (int i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); + gpu_task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(build_dymf_hbm_pool, i)); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(build_dynamic_mf_func, i, j, k); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? 
(end + once_gpu_copy) : len; + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue( + build_dynamic_mf_func, i, j, start, end)); } } } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); } - threads.clear(); - threads.resize(device_num * multi_mf_dim_); + cpu_task_futures.clear(); + stagetime.Pause(); + VLOG(0) << " BuildGPUTask build_dynamic_mf_func " + << " cost " << stagetime.ElapsedSec() << " s."; for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); - } + cpu_reday_channels_[i]->Close(); } - for (std::thread& t : threads) { - t.join(); + stagetime.Start(); + for (auto& f : gpu_task_futures) { + f.wait(); } - threads.clear(); - - timeline.Pause(); - VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() - << " s."; + gpu_task_futures.clear(); + if (FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::MEM_EMB_FEATURE_AND_GPU_GRAPH + || FLAGS_gpugraph_storage_mode == paddle::framework::GpuGraphStorageMode::SSD_EMB_AND_MEM_FEATURE_GPU_GRAPH) { + std::vector * tmp = (std::vector *) gpu_task->sub_graph_feas; + delete tmp; + gpu_task->sub_graph_feas = NULL; + } + stagetime.Pause(); + VLOG(0) << " build_dymf_hbm_pool " + << " cost " << stagetime.ElapsedSec() << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -916,17 +1245,25 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { dataset_->LoadIntoMemory(); timer.Pause(); VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; - + gpu_graph_mode_ = dataset_->GetGpuGraphMode(); + if (dataset_->GetMemoryDataSize() == 0) { + VLOG(0) << "GetMemoryDataSize == 0"; + return; + } // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - InitSlotInfo(); - gpu_graph_mode_ = dataset_->GetGpuGraphMode(); - std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->Reset(); - data_ready_channel_->Put(gpu_task); + InitSlotInfo(); + if (FLAGS_gpugraph_storage_mode != GpuGraphStorageMode::WHOLE_HBM) { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + data_ready_channel_->Put(gpu_task); + } else if (hbm_sparse_table_initialized_ == false) { + SparseTableToHbm(); + } VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -969,9 +1306,11 @@ void PSGPUWrapper::build_pull_thread() { timer.Start(); // build cpu ps data process BuildPull(gpu_task); + if (multi_mf_dim_) { + divide_to_device(gpu_task); + } timer.Pause(); - VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; buildpull_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; @@ -992,17 +1331,22 @@ void PSGPUWrapper::build_task() { VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); - PrepareGPUTask(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " << timer.ElapsedSec() - << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; current_task_ = gpu_task; } void PSGPUWrapper::BeginPass() { platform::Timer timer; + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } timer.Start(); if (current_task_) { PADDLE_THROW( @@ -1028,12 +1372,59 @@ void 
PSGPUWrapper::BeginPass() { } void PSGPUWrapper::EndPass() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + return; + } + platform::Timer stagetime; + stagetime.Start(); + HbmToSparseTable(); + stagetime.Pause(); + VLOG(0) << "EndPass HbmToSparseTable cost time: " << stagetime.ElapsedSec() + << "s"; + + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + // fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); +} + +void PSGPUWrapper::SparseTableToHbm() { + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + size_t device_num = heter_devices_.size(); + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); + auto gpu_graph_ptr = GraphGpuWrapper::GetInstance(); + auto node_to_id = gpu_graph_ptr->feature_to_id; + auto edge_to_id = gpu_graph_ptr->edge_to_id; + std::vector vec_data = gpu_graph_ptr->get_graph_total_keys(); + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + + add_key_to_local(vec_data); + add_key_to_gputask(gpu_task); + BuildPull(gpu_task); + if (!multi_mf_dim_) { + PrepareGPUTask(gpu_task); + } else { + divide_to_device(gpu_task); + } + BuildGPUTask(gpu_task); + current_task_ = gpu_task; + hbm_sparse_table_initialized_ = true; +} + +void PSGPUWrapper::HbmToSparseTable() { if (!current_task_) { PADDLE_THROW( platform::errors::Fatal("[EndPass] current task has been ended.")); } - platform::Timer timer; - timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu @@ -1043,89 +1434,123 @@ void PSGPUWrapper::EndPass() { std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); } } - int thread_num = 8; auto accessor_wrapper_ptr = GlobalAccessorFactory::GetInstance().GetAccessorWrapper(); - //auto fleet_ptr = FleetWrapper::GetInstance(); - //fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); - auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( - int i, int j, int z) { + + int once_cpu_num = 16 * 1024; + int once_gpu_copy = 8 * once_cpu_num; + + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr, once_cpu_num]( + int i, int j, size_t start, size_t end) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - // ====== multi-thread process feasign================ - int len_per_thread = len / thread_num; - int remain = len % thread_num; - int left = -1, right = -1; - int real_len = len_per_thread; - if (z < remain) real_len++; - if (z < remain) { - left = z * (len_per_thread + 1); - right = left + real_len; - } else { - left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; - right = left + real_len; - } + size_t real_len = end - start; // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - char* test_build_values = 
(char*)malloc(feature_value_size * real_len); - uint64_t offset = left * feature_value_size; + + std::shared_ptr build_values(new char[feature_value_size * real_len], + [](char* p) { delete[] p; }); + uint64_t offset = start * feature_value_size; + char* test_build_values = build_values.get(); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, feature_value_size * real_len, cudaMemcpyDeviceToHost); - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (int i = left; i < right; ++i) { - if (device_keys[i] == unuse_key) { - continue; - } - size_t local_offset = (i - left) * feature_value_size; - float* gpu_val = (float*)(test_build_values + local_offset); + for (size_t k = 0; k * once_cpu_num < real_len; k++) { + struct task_info task; + task.build_values = build_values; + task.offset = start; + task.device_id = i; + task.multi_mf_dim = j; + task.start = k * once_cpu_num; + task.end = (k + 1) * once_cpu_num < real_len ? ((k + 1) * once_cpu_num) + : (real_len); + cpu_reday_channels_[i]->Put(task); + } + }; + auto cpu_func = [this, &accessor_wrapper_ptr](int j) { + struct task_info task; + while (cpu_reday_channels_[j]->Get(task)) { + auto& device_keys = + this->current_task_ + ->device_dim_keys_[task.device_id][task.multi_mf_dim]; + char* test_build_values = task.build_values.get(); + int mf_dim = this->index_dim_vec_[task.multi_mf_dim]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + uint64_t unuse_key = std::numeric_limits::max(); + for (int i = task.start; i < task.end; ++i) { + if (device_keys[i + task.offset] == unuse_key) { + continue; + } + size_t local_offset = i * feature_value_size; + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB - // TODO: PSLIB DumpFill + // TODO: PSLIB DumpFill #endif #ifdef PADDLE_WITH_PSCORE - accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); #endif + } } - free(test_build_values); }; + platform::Timer timer; + timer.Start(); + std::vector> cpu_task_futures; + std::vector> gpu_task_futures; + size_t thread_num = 16; + size_t device_num = heter_devices_.size(); if (multi_mf_dim_) { VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Open(); for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + size_t start = 0; + size_t end = 0; + while (end < len) { + start = end; + end = end + once_gpu_copy < len ? 
(end + once_gpu_copy) : len; + gpu_task_futures.emplace_back(hbm_thread_pool_[i]->enqueue( + dump_pool_to_cpu_func, i, j, start, end)); } } + for (size_t j = 0; j < thread_num; j++) { + cpu_task_futures.emplace_back(cpu_work_pool_[i]->enqueue(cpu_func, i)); + } } - for (std::thread& t : threads) { - t.join(); - } } + for (auto& f : gpu_task_futures) { + f.wait(); + } + timer.Pause(); + VLOG(0) << " EndPass dump_pool_to_cpu_func " + << " cost " << timer.ElapsedSec() << " s."; + for (size_t i = 0; i < device_num; i++) { + cpu_reday_channels_[i]->Close(); + } + gpu_task_futures.clear(); + timer.Start(); + for (auto& f : cpu_task_futures) { + f.wait(); + } + cpu_task_futures.clear(); + timer.Pause(); + VLOG(0) << " EndPass cpu_func " + << " cost " << timer.ElapsedSec() << " s."; if (keysize_max != 0) { HeterPs_->end_pass(); } +} - for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; +void PSGPUWrapper::DumpToMem() { + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->HbmToSparseTable(); } - gpu_task_pool_.Push(current_task_); - current_task_ = nullptr; - gpu_free_channel_->Put(current_task_); - //fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); - timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 54f4abd97e831d..21e92ac6aef957 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h" #include "paddle/fluid/framework/heter_util.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" @@ -63,6 +64,7 @@ limitations under the License. 
*/ #include "downpour_accessor.h" // NOLINT #endif #include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" +DECLARE_int32(gpugraph_storage_mode); namespace paddle { namespace framework { @@ -96,6 +98,15 @@ class AfsWrapper { }; #endif +struct task_info { + std::shared_ptr build_values; + size_t offset; + int device_id; + int multi_mf_dim; + int start; + int end; +}; + class PSGPUWrapper { class DCacheBuffer { public: @@ -188,6 +199,9 @@ class PSGPUWrapper { int total_len, int* key2slot); + + void divide_to_device(std::shared_ptr gpu_task); + void add_slot_feature(std::shared_ptr gpu_task); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); @@ -195,16 +209,28 @@ class PSGPUWrapper { void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); + void add_key_to_local(const std::vector & keys); + void add_key_to_gputask(std::shared_ptr gpu_task); + void resize_gputask(std::shared_ptr gpu_task); + void SparseTableToHbm(); + void HbmToSparseTable(); void start_build_thread(); void pre_build_thread(); void build_pull_thread(); void build_task(); + void DumpToMem(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; if (s_instance_ == nullptr) { return; } + if (FLAGS_gpugraph_storage_mode == GpuGraphStorageMode::WHOLE_HBM) { + this->EndPass(); + } + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } data_ready_channel_->Close(); buildcpu_ready_channel_->Close(); buildpull_ready_channel_->Close(); @@ -288,6 +314,12 @@ class PSGPUWrapper { gpu_free_channel_->Open(); gpu_free_channel_->SetCapacity(1); + cpu_reday_channels_.resize(dev_ids.size()); + for (size_t i = 0; i < dev_ids.size(); i++) { + cpu_reday_channels_[i] = paddle::framework::MakeChannel(); + cpu_reday_channels_[i]->SetCapacity(16); + } + current_task_ = nullptr; gpu_free_channel_->Put(current_task_); @@ -385,6 +417,11 @@ class PSGPUWrapper { hbm_thread_pool_[i].reset(new ::ThreadPool(1)); } + cpu_work_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + cpu_work_pool_[i].reset(new ::ThreadPool(16)); + } + auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); @@ -542,6 +579,7 @@ class PSGPUWrapper { } void SetSlotVector(const std::vector& slot_vector) { slot_vector_ = slot_vector; + VLOG(0) << "slot_vector size is " << slot_vector_.size(); } void SetSlotOffsetVector(const std::vector& slot_offset_vector) { @@ -596,6 +634,10 @@ class PSGPUWrapper { dim_index_map[index_dim_vec_[i]] = i; } hbm_pools_.resize(resource_->total_device() * num_of_dim); + for (size_t i = 0; i < hbm_pools_.size(); i++) { + hbm_pools_[i] = new HBMMemoryPoolFix(); + } + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; @@ -693,6 +735,7 @@ class PSGPUWrapper { int month_; int day_; bool slot_info_initialized_ = false; + bool hbm_sparse_table_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; std::string accessor_class_; @@ -703,7 +746,7 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; - std::vector hbm_pools_; // in multi mfdim, one table need hbm + std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number #endif @@ -723,12 +766,15 @@ class PSGPUWrapper { paddle::framework::ChannelObject>> buildpull_ready_channel_ = paddle::framework::MakeChannel>(); + std::vector>> cpu_reday_channels_; std::shared_ptr current_task_ = nullptr; std::thread pre_build_threads_; std::thread buildpull_threads_; bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + std::vector> cpu_work_pool_; OptimizerConfig optimizer_config_; protected: diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index b4fcbae8b2d4c2..b05935a6eca52b 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -151,6 +151,7 @@ void HogwildWorker::TrainFilesWithProfiler() { bool train_mode = device_reader_->IsTrainMode(); timeline.Start(); uint64_t total_inst = 0; + device_reader_->InitGraphTrainResource(); while (1) { cur_batch = device_reader_->Next(); if (FLAGS_enable_exit_when_partial_worker && train_mode) { @@ -268,6 +269,7 @@ void HogwildWorker::TrainFiles() { #endif // while ((cur_batch = device_reader_->Next()) > 0) { bool train_mode = device_reader_->IsTrainMode(); + device_reader_->InitGraphTrainResource(); while (1) { cur_batch = device_reader_->Next(); if (FLAGS_enable_exit_when_partial_worker && train_mode) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ceed8cb6bfa636..96a473be1aa8cc 100755 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -48,6 +48,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, places_.push_back(place); } #endif + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 27c7563fee840e..22d66dbfd90828 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -49,7 +49,12 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), - allow_free_idle_chunk_(allow_free_idle_chunk) {} + allow_free_idle_chunk_(allow_free_idle_chunk) { + total_alloc_times_ = 0; + total_alloc_size_ = 0; + total_free_times_ = 0; + total_free_size_ = 0; + } phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( size_t unaligned_size) { @@ -112,6 +117,8 @@ phi::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + ++total_alloc_times_; + total_alloc_size_ += size; VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } @@ -126,6 +133,9 @@ void 
AutoGrowthBestFitAllocator::FreeImpl(phi::Allocation *allocation) { auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; + total_free_times_ += 1; + total_free_size_ += block_it->size_; + block_it->is_free_ = true; if (block_it != blocks.begin()) { @@ -176,9 +186,28 @@ uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { ++chunk_it; } } + + Trace(); return bytes; } +void AutoGrowthBestFitAllocator::Trace() const { + size_t cur_idle_bytes = 0; + auto it = free_blocks_.begin(); + for (; it != free_blocks_.end(); ++it) { + cur_idle_bytes += it->second->size_; + } + + VLOG(1) << "alloc:" << total_alloc_size_ / double(1024*1024) + << "m free:" << total_free_size_ / double(1024*1024) + << "m busy:" << (total_alloc_size_ - total_free_size_) / double(1024*1024) + << "m idle:" << cur_idle_bytes / double(1024*1024) + << "m alloc_times:" << total_alloc_times_ + << " free_times:" << total_free_times_ + << " free_blocks_num:" << free_blocks_.size() + << " curr_chunks_num:" << chunks_.size(); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index dadf751bdfa419..138f4a98c4db5d 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -49,6 +49,7 @@ class AutoGrowthBestFitAllocator : public Allocator { private: uint64_t FreeIdleChunks(); + void Trace() const; template using List = std::list; @@ -93,6 +94,12 @@ class AutoGrowthBestFitAllocator : public Allocator { size_t chunk_size_; bool allow_free_idle_chunk_; + // stat info + size_t total_alloc_times_; + size_t total_alloc_size_; + size_t total_free_times_; + size_t total_free_size_; + SpinLock spinlock_; }; diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index a26ed5dbdad8c6..187eb0692cda79 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -833,6 +833,18 @@ PADDLE_DEFINE_EXPORTED_bool( false, "It controls whether exit trainer when an worker has no ins."); +/** + * Distributed related FLAG + * Name: enable_exit_when_partial_worker + * Since Version: 2.2.0 + * Value Range: bool, default=false + * Example: + * Note: represent gpugraph storage mode, 1 for full hbm, 2 for hbm + mem + ssd. 
+ */ +PADDLE_DEFINE_EXPORTED_int32(gpugraph_storage_mode, + 1, + "gpugraph storage mode, default 1"); + /** * KP kernel related FLAG * Name: FLAGS_run_kp_kernel @@ -961,17 +973,18 @@ PADDLE_DEFINE_EXPORTED_uint64( gpugraph_merge_grads_segment_size, 128, "segment size with segment gradient merge, default 128"); +PADDLE_DEFINE_EXPORTED_uint64( + gpugraph_slot_feasign_max_num, + 5, + "max feasign number in one slot, default 5"); PADDLE_DEFINE_EXPORTED_int32( gpugraph_dedup_pull_push_mode, 0, "enable dedup keys while pull push sparse, default 0"); -PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, - true, - "enable load_node_list_into_hbm, default true"); -PADDLE_DEFINE_EXPORTED_int32( - gpugraph_sparse_table_storage_mode, - 0, - "parse_table_storage_mode, default 0"); +PADDLE_DEFINE_EXPORTED_bool( + gpugraph_load_node_list_into_hbm, + true, + "enable load_node_list_into_hbm, default true"); /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index ea6240b649cad0..dd38ce7956309a 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -19,6 +19,7 @@ namespace platform {} // namespace platform } // namespace paddle DEFINE_INT_STATUS(STAT_total_feasign_num_in_mem) +DEFINE_INT_STATUS(STAT_epoch_finish) DEFINE_INT_STATUS(STAT_gpu0_mem_size) DEFINE_INT_STATUS(STAT_gpu1_mem_size) DEFINE_INT_STATUS(STAT_gpu2_mem_size) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index e902baa13532e5..dc381e6a033e00 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -368,6 +368,9 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_gpu_graph_mode", &framework::Dataset::SetGpuGraphMode, + py::call_guard()) + .def("set_pass_id", + &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc old mode 100755 new mode 100644 index 8b224a617dffa9..1c1d5a5269f306 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -64,6 +64,7 @@ void BindDistFleetWrapper(py::module* m) { .def("save_one_model", &FleetWrapper::SaveModelOneTable) .def("recv_and_save_model", &FleetWrapper::RecvAndSaveTable) .def("sparse_table_stat", &FleetWrapper::PrintTableStat) + .def("save_cache_table", &FleetWrapper::SaveCacheTable) .def("stop_server", &FleetWrapper::StopServer) .def("stop_worker", &FleetWrapper::FinalizeWorker) .def("barrier", &FleetWrapper::BarrierWithTable) @@ -372,7 +373,12 @@ void BindGraphGpuWrapper(py::module* m) { .def("set_up_types", &GraphGpuWrapper::set_up_types) .def("query_node_list", &GraphGpuWrapper::query_node_list) .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) - .def("load_edge_file", &GraphGpuWrapper::load_edge_file) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) + .def("load_edge_file", + py::overload_cast( + &GraphGpuWrapper::load_edge_file)) .def("load_node_and_edge", &GraphGpuWrapper::load_node_and_edge) .def("upload_batch", py::overload_cast( @@ -396,7 +402,15 @@ void BindGraphGpuWrapper(py::module* m) { .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_weight", &GraphGpuWrapper::load_node_weight) .def("export_partition_files", &GraphGpuWrapper::export_partition_files) - .def("load_node_file", &GraphGpuWrapper::load_node_file) + .def("load_node_file", + py::overload_cast( + 
&GraphGpuWrapper::load_node_file)) + .def("load_node_file", + py::overload_cast( + &GraphGpuWrapper::load_node_file)) + .def("release_graph", &GraphGpuWrapper::release_graph) + .def("release_graph_edge", &GraphGpuWrapper::release_graph_edge) + .def("release_graph_node", &GraphGpuWrapper::release_graph_node) .def("finalize", &GraphGpuWrapper::finalize); } #endif diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index e9c993d3ee1282..4d7d17463e4fe4 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -64,6 +64,9 @@ void BindPSGPUWrapper(py::module* m) { .def("begin_pass", &framework::PSGPUWrapper::BeginPass, py::call_guard()) + .def("dump_to_mem", + &framework::PSGPUWrapper::DumpToMem, + py::call_guard()) .def("load_into_memory", &framework::PSGPUWrapper::LoadIntoMemory, py::call_guard()) diff --git a/paddle/utils/string/string_helper.h b/paddle/utils/string/string_helper.h index b84c7fa75209df..029eb9eb59dc6a 100644 --- a/paddle/utils/string/string_helper.h +++ b/paddle/utils/string/string_helper.h @@ -334,6 +334,42 @@ inline int split_string_ptr(const char* str, return num; } +inline int split_string_ptr(const char* str, + size_t len, + char delim, + std::vector* values, + int max_num) { + if (len <= 0) { + return 0; + } + + int num = 0; + const char* p = str; + const char* end = str + len; + const char* last = str; + while (p < end) { + if (*p != delim) { + ++p; + continue; + } + values->emplace_back(last, (size_t)(p - last)); + ++num; + ++p; + if (num >= max_num) { + return num; + } + // skip continue delim + while (*p == delim) { + ++p; + } + last = p; + } + if (p > last) { + values->emplace_back(last, (size_t)(p - last)); + ++num; + } + return num; +} // A helper class for reading lines from file. A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. 
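A minimal usage sketch for the length-bounded split_string_ptr overload added in paddle/utils/string/string_helper.h above: it takes an explicit len, caps the output at max_num tokens, and skips runs of consecutive delimiters instead of emitting empty tokens. The element type of the values vector is not visible in the hunk, and the namespace/include shown below are assumptions (std::string elements, paddle::string namespace), so treat this as an illustrative sketch rather than the exact API.

#include <iostream>
#include <string>
#include <vector>

#include "paddle/utils/string/string_helper.h"  // header touched by this hunk

int main() {
  // "a,,b,c,d" with max_num = 3 yields "a", "b", "c": the doubled comma is
  // collapsed, and the trailing "d" is dropped once the cap is reached.
  const char line[] = "a,,b,c,d";
  std::vector<std::string> parts;  // assumed element type
  int n = paddle::string::split_string_ptr(
      line, sizeof(line) - 1, ',', &parts, /*max_num=*/3);
  std::cout << "tokens: " << n << std::endl;  // expected: 3
  for (const auto& p : parts) {
    std::cout << p << std::endl;
  }
  return 0;
}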
diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 0cfb946d3d8cad..83f60a6e26b40c 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -88,3 +88,4 @@ shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group distributed_scaler = fleet.distributed_scaler +save_cache_table = fleet.save_cache_table diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 1a9b3f565b77ab..4b9037795e067c 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -906,6 +906,15 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): def save_cache_model(self, dirname, **configs): return self._runtime_handle._save_cache_model(dirname, **configs) + @is_non_distributed_check + @inited_runtime_handler + def save_cache_table(self, + table_id, + pass_id, + mem_cache_key_threshold=4000000000): + return self._runtime_handle._save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index abf7eec73b8fe1..5c7c1b11a27a70 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1420,6 +1420,12 @@ def _save_cache_model(self, dirname, **kwargs): fleet.util.barrier() return feasign_num + def _save_cache_table(self, table_id, pass_id, mem_cache_key_threshold): + if self.role_maker._is_first_worker(): + self._worker.save_cache_table(table_id, pass_id, + mem_cache_key_threshold) + fleet.util.barrier() + def _load_sparse_params(self, dirname, context, main_program, mode): distributed_varnames = get_sparse_tablenames(self.origin_main_programs, True) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 20f11c96a91077..decd3988602ac3 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -388,6 +388,7 @@ def __init__(self): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None self.trainer_num = -1 + self.pass_id = 0 @deprecated(since="2.0.0", update_to="paddle.distributed.InMemoryDataset._set_feed_type") @@ -1082,8 +1083,25 @@ def set_graph_config(self, config): "gpu_graph_training", True) self.proto_desc.graph_config.sage_mode = config.get("sage_mode", False) self.proto_desc.graph_config.samples = config.get("samples", "") + self.proto_desc.graph_config.train_table_cap = config.get( + "train_table_cap", 800000) + self.proto_desc.graph_config.infer_table_cap = config.get( + "infer_table_cap", 800000) self.dataset.set_gpu_graph_mode(True) + def set_pass_id(self, pass_id): + """ + set_pass_id + """ + self.pass_id = pass_id + self.dataset.set_pass_id(pass_id) + + def get_pass_id(self): + """ + get_pass_id + """ + return self.pass_id + class QueueDataset(DatasetBase): """ diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 3ba9f9eea46d1b..945b28aac88de1 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -73,6 +73,9 @@ def _create_trainer(self, opt_info=None): if opt_info.get("dump_fields_path") is not None and len( opt_info.get("dump_fields_path")) != 0: trainer._set_dump_fields_path(opt_info["dump_fields_path"]) + if 
opt_info.get("user_define_dump_filename") is not None and len( + opt_info.get("user_define_dump_filename")) != 0: + trainer._set_user_define_dump_filename(opt_info["user_define_dump_filename"]) if opt_info.get("dump_file_num") is not None: trainer._set_dump_file_num(opt_info["dump_file_num"]) if opt_info.get("dump_converter") is not None: