diff --git a/.gitignore b/.gitignore index bc8a817b4f2f01..4faaf162ba52b4 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,3 @@ paddle/infrt/tests/lit.cfg.py paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc paddle/fluid/pybind/eager_final_state_op_function_impl.h paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h -builder diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index cc7635649281cd..04f102818a9743 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -49,7 +49,8 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PSLIB_PREFIX_DIR} DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} - DOWNLOAD_COMMAND cp /root/paddlejob/new1_code/ps/baidu/paddlepaddle/pslib/pslib.tar.gz ./ && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 4b15a6047c83f6..100fa9b776292e 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -160,8 +160,7 @@ class Dataset { virtual void SetFleetSendSleepSeconds(int seconds) = 0; virtual std::vector GetSlots() = 0; - virtual void SetPassId(uint32_t pass_id) = 0; - virtual uint32_t GetPassID() = 0; + protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg) = 0; @@ -250,13 +249,6 @@ class DatasetImpl : public Dataset { virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); virtual std::vector GetSlots(); - virtual void SetPassId(uint32_t pass_id) { - pass_id_ = pass_id; - } - virtual uint32_t GetPassID() { - return pass_id_; - } - /* for enable_heterps_ virtual void EnableHeterps(bool enable_heterps) { enable_heterps_ = enable_heterps; @@ -283,7 +275,6 @@ class DatasetImpl : public Dataset { // TODO(yaoxuefeng) for SlotRecordDataset return -1; } - uint32_t pass_id_ = 0; std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index d6f316cf145b9a..b390cf17f492c5 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1388,9 +1388,9 @@ void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { #endif } -void FleetWrapper::PrintTableStat(const uint64_t table_id, uint32_t pass_id, size_t threshold) { +void FleetWrapper::PrintTableStat(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id, pass_id, threshold); + auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id); ret.wait(); int32_t err_code = ret.get(); if (err_code == -1) { diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 2e17eae9196a4e..deb2b90c933532 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -265,7 +265,7 @@ class FleetWrapper { std::vector table_var_list, bool load_combine); - void PrintTableStat(const uint64_t table_id, uint32_t pass_id, uint64_t threshold); + void PrintTableStat(const uint64_t table_id); void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load 
delta feature, which means load diff diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index c6f34987e65732..061ce0ff3909e9 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -39,76 +39,227 @@ namespace framework { class HeterContext { public: - //保存去重后的待查table的key, 第一层对应table-shard, 第二层对应不同维度,第三层就是key集合 - std::vector>>feature_keys_; - //保存查到的value数据,维度同feature_keys_ + virtual ~HeterContext() { + if (!multi_mf_dim_) { + for (size_t i = 0; i < mutex_.size(); ++i) { + delete mutex_[i]; + } + mutex_.clear(); + } else { + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + for (size_t j = 0; j < dim_mutex_[i].size(); j++) { + delete dim_mutex_[i][j]; + } + dim_mutex_[i].clear(); + } + } + } + Scope* scope_{nullptr}; + std::vector> feature_keys_; + std::vector>> feature_dim_keys_; + std::vector>> device_task_keys_; + #ifdef PADDLE_WITH_PSLIB + std::vector> value_ptr_; + std::vector>> + device_task_ptr_; + std::vector>> + value_dim_ptr_; std::vector>> - value_ptr_; + device_dim_ptr_; #endif #ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; std::vector>> - value_ptr_; + value_dim_ptr_; + std::vector>> + device_task_ptr_; + std::vector>> + device_dim_ptr_; #endif - //经过去重后的gpu-table中的key数据, 第一层设备,第二层维度,第三层具体的key - std::vector>> device_keys_; + std::vector> device_values_; + std::vector> device_keys_; + std::vector>> device_dim_keys_; + std::vector>> device_dim_values_; + std::vector mutex_; + std::vector> dim_mutex_; + int multi_mf_dim_ = 0; + + uint32_t shard_num_ = 37; + uint64_t size() { + uint64_t total_size = 0; + for (auto& keys : feature_keys_) { + total_size += keys.size(); + } + return total_size; + } + void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } + uint32_t ShardNum() { return shard_num_; } + void init(int shard_num, int device_num) { + shard_num_ = shard_num; + feature_keys_.resize(shard_num_); + value_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); + } + + device_values_.resize(device_num); + device_keys_.resize(device_num); + mutex_.resize(device_num); + for (size_t i = 0; i < mutex_.size(); ++i) { + mutex_[i] = new std::mutex(); + } + } - //初始化 void init(int shard_num, int device_num, int dim_num) { - feature_keys_.resize(shard_num); - for (auto& iter : feature_keys_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); - } + shard_num_ = shard_num; + feature_keys_.resize(shard_num_); + feature_dim_keys_.resize(shard_num_); + value_ptr_.resize(shard_num_); + value_dim_ptr_.resize(shard_num_); + device_task_ptr_.resize(shard_num_); + device_task_keys_.resize(shard_num_); + for (size_t i = 0; i < device_task_ptr_.size(); i++) { + device_task_ptr_[i].resize(device_num); + device_task_keys_[i].resize(device_num); } - value_ptr_.resize(shard_num); - for (auto& iter : value_ptr_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); - } + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + feature_dim_keys_[i].resize(dim_num); + value_dim_ptr_[i].resize(dim_num); } + device_values_.resize(device_num); + device_dim_values_.resize(device_num); device_keys_.resize(device_num); - for (auto& iter : device_keys_) { - iter.resize(dim_num); - for (auto& iter1: iter) { - iter1.clear(); + + device_dim_keys_.resize(device_num); + 
device_dim_ptr_.resize(device_num); + mutex_.resize(device_num); + dim_mutex_.resize(device_num); + for (size_t i = 0; i < mutex_.size(); ++i) { + mutex_[i] = new std::mutex(); + } + for (size_t i = 0; i < dim_mutex_.size(); ++i) { + dim_mutex_[i].resize(dim_num); + for (int j = 0; j < dim_num; j++) { + dim_mutex_[i][j] = new std::mutex(); } } + multi_mf_dim_ = dim_num; + } + void Reset() { + if (!multi_mf_dim_) { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + for (size_t i = 0; i < device_task_ptr_.size(); ++i) { + for (size_t j = 0; j < device_task_ptr_[i].size(); ++j) { + device_task_ptr_[i][j].clear(); + device_task_keys_[i][j].clear(); + } + } + } else { + VLOG(3) << "Reset gpu task with dynamic mf dimention"; + for (size_t i = 0; i < feature_dim_keys_.size(); i++) { + for (size_t j = 0; j < feature_dim_keys_[i].size(); j++) { + feature_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < value_dim_ptr_.size(); i++) { + for (size_t j = 0; j < value_dim_ptr_[i].size(); j++) { + value_dim_ptr_[i][j].clear(); + } + } + + for (size_t i = 0; i < device_dim_keys_.size(); i++) { + for (size_t j = 0; j < device_dim_keys_[i].size(); j++) { + device_dim_keys_[i][j].clear(); + } + } + for (size_t i = 0; i < device_dim_ptr_.size(); i++) { + for (size_t j = 0; j < device_dim_ptr_[i].size(); j++) { + device_dim_ptr_[i][j].clear(); + } + } + } + } + void batch_add_keys( + const std::vector>& thread_keys) { + assert(thread_keys.size() == feature_keys_.size()); + + for (uint32_t i = 0; i < shard_num_; i++) { + int idx = 0; + idx = feature_keys_[i].size(); + feature_keys_[i].resize(feature_keys_[i].size() + thread_keys[i].size()); + std::copy(thread_keys[i].begin(), thread_keys[i].end(), + feature_keys_[i].begin() + idx); + } + } + + void batch_add_keys(int shard_num, + const robin_hood::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); } - //将粗去重的key加入进来,后面再做精细化去重 + void batch_add_keys(int shard_num, int dim_id, const robin_hood::unordered_set& shard_keys) { - int idx = feature_keys_[shard_num][dim_id].size(); - feature_keys_[shard_num][dim_id].resize( - feature_keys_[shard_num][dim_id].size() + shard_keys.size()); + int idx = feature_dim_keys_[shard_num][dim_id].size(); + feature_dim_keys_[shard_num][dim_id].resize( + feature_dim_keys_[shard_num][dim_id].size() + shard_keys.size()); std::copy(shard_keys.begin(), shard_keys.end(), - feature_keys_[shard_num][dim_id].begin() + idx); + feature_dim_keys_[shard_num][dim_id].begin() + idx); } - void unique_keys() { + + void UniqueKeys() { std::vector threads; - auto unique_func = [this](int i, int j) { - auto& cur_keys = feature_keys_[i][j]; + auto unique_func = [this](int i) { + auto& cur_keys = feature_keys_[i]; + std::sort(cur_keys.begin(), cur_keys.end()); + std::vector::iterator it; + it = std::unique(cur_keys.begin(), cur_keys.end()); + cur_keys.resize(std::distance(cur_keys.begin(), it)); + }; + auto unique_dynamic_mf_func = [this](int i, int j) { + auto& cur_keys = feature_dim_keys_[i][j]; std::sort(cur_keys.begin(), cur_keys.end()); 
std::vector::iterator it; it = std::unique(cur_keys.begin(), cur_keys.end()); cur_keys.resize(std::distance(cur_keys.begin(), it)); }; - for (size_t i = 0; i < feature_keys_.size(); i++) { - for (size_t j = 0; j < feature_keys_[i].size(); j++) { - threads.push_back(std::thread(unique_func, i, j)); + if (!multi_mf_dim_) { + for (uint32_t i = 0; i < shard_num_; i++) { + threads.push_back(std::thread(unique_func, i)); + } + } else { + for (uint32_t i = 0; i < shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(unique_dynamic_mf_func, i, j)); + } } + VLOG(3) << "heter_context unique keys with dynamic mf dimention"; } for (std::thread& t : threads) { t.join(); } } - uint16_t pass_id_; }; - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index f4fd845de43e6e..983208c0608ae7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,9 +7,9 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - nv_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm) + nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) @@ -20,7 +20,7 @@ IF(WITH_GPU) endif() ENDIF() IF(WITH_ROCM) - hip_library(heter_comm SRCS heter_comm.h feature_value.h dy_gpu_value_inl.h feature_value_inl.h gpu_value_inl.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) - hip_library(heter_ps SRCS heter_ps.cu feature_value.cu DEPS heter_comm) + hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h deleted file mode 100644 index ac95c5f3b6f4ba..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" - -namespace paddle { -namespace framework { - -struct DyGpuValue { - float delta_score; - float show; - float clk; - int slot; - float lr; - float lr_g2sum; - int mf_size; - int mf_dim; - uint64_t cpu_ptr; - float mf[0]; - __host__ __device__ __forceinline__ DyGpuValue() { - delta_score = 0; - show = 0; - clk = 0; - slot = -1; - lr = 0; - lr_g2sum = 0; - mf_size = 0; - mf_dim = 0; - cpu_ptr = 0; - } - __device__ __forceinline__ void operator=(const DyGpuValue& in) { - delta_score = in.delta_score; - show = in.show; - clk = in.clk; - slot = in.slot; - lr = in.lr; - lr_g2sum = in.lr_g2sum; - mf_size = in.mf_size; - mf_dim = in.mf_dim; - cpu_ptr = in.cpu_ptr; - for (int i = 0; i < mf_dim + 1; i++) { - mf[i] = in.mf[i]; - } - } - __device__ __forceinline__ void to_cvm(float* des, int dim) { - des[0] = show; - des[1] = clk; - des[2] = lr; - if (mf_size == 0) { - for (int i = 0; i < dim; i++) { - des[3 + i] = 0; - } - } else { - for (int i = 0; i < dim; i++) { - des[3 + i] = mf[1 + i]; - } - } - } -}; - -struct DyGpuPushValue { - float show; - float clk; - int slot; - float lr_g; - int mf_dim; - float mf_g[0]; - __device__ __forceinline__ void from_grad(const float* grad, int dim, int slot_id, int batch_size) { - this->slot = slot_id; - this->mf_dim = dim; - this->show = grad[0]; - this->clk = grad[1]; - this->lr_g = grad[2] * -1. * batch_size; - for (int j = 0; j < dim; j++) { - this->mf_g[j] = grad[3 + j] * -1. * batch_size; - } - } - __device__ __forceinline__ DyGpuPushValue& operator+=(const DyGpuPushValue& input) { - show += input.show; - clk += input.clk; - lr_g += input.lr_g; - for (int i = 0; i < input.mf_dim; i++) { - mf_g[i] += input.mf_g[i]; - } - return *this; - } - __device__ __forceinline__ void operator=(const DyGpuPushValue& input) { - show = input.show; - clk = input.clk; - slot = input.slot; - lr_g = input.lr_g; - mf_dim = input.mf_dim; - for (int i = 0; i < mf_dim; i++) { - mf_g[i] = input.mf_g[i]; - } - } -}; - -template <> -class Optimizer { - public: - Optimizer() {} - - ~Optimizer() {} - - void initialize() {} - - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); - double scaled_grad = g / scale; - w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; - add_g2sum += scaled_grad * scaled_grad; - g2sum += add_g2sum; - } - - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); - for (int i = 0; i < n; ++i) { - double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; - } - - __device__ void update_value(DyGpuValue* ptr, const DyGpuPushValue& grad, curandState& state) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - 
ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; - - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - - if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; - ptr->mf[0] = 0; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; - } - } - } else { - update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, - grad.show); // for local test - } - } -}; - - -class T_DyGpuValue_DownpourCtrDymfAccessor : public ValueTransforImp { -public: - virtual int get_gpu_value_size(int dim_size) { - int ret = sizeof(DyGpuValue) + (dim_size + 1) * sizeof(float); - return TYPE_ALIGN(8, ret); - } - virtual int get_gpu_push_value_size(int dim_size) { - int ret = sizeof(DyGpuPushValue) + (dim_size) * sizeof(float); - return TYPE_ALIGN(8, ret); - } - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) { -#ifdef PADDLE_WITH_PSLIB - paddle::ps::DownpourFixedFeatureValue* cpu_value = (paddle::ps::DownpourFixedFeatureValue*)cpu; - DyGpuValue* gpu_value = (DyGpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - uint64_t tmp_aa = (uint64_t)(cpu); - gpu_value->delta_score = ptr_cpu_data[1]; - gpu_value->show = ptr_cpu_data[2]; - gpu_value->clk = ptr_cpu_data[3]; - gpu_value->slot = int(ptr_cpu_data[6]); - gpu_value->lr = ptr_cpu_data[4]; - gpu_value->lr_g2sum = ptr_cpu_data[5]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - gpu_value->mf_dim = dim_size; - if (dim > 8) { - gpu_value->mf_size = dim_size + 1; - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 8]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - gpu_value->mf[x] = 0 ; - } - } -#endif - } - virtual void value_gpu_to_cpu(void* gpu) { -#ifdef PADDLE_WITH_PSLIB - DyGpuValue* gpu_value = (DyGpuValue*)gpu; - paddle::ps::DownpourFixedFeatureValue& cpu_fix = *((paddle::ps::DownpourFixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_fix.resize(8 + 1 + gpu_value->mf_dim); - } - float* cpu_value = cpu_fix.data(); - cpu_value[1] = gpu_value->delta_score; - cpu_value[2] = gpu_value->show; - cpu_value[3] = gpu_value->clk; - cpu_value[4] = gpu_value->lr; - cpu_value[5] = gpu_value->lr_g2sum; - cpu_value[6] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_dim + 1; x++) { - cpu_value[x + 8] = gpu_value->mf[x]; - } - } -#endif - } - virtual void value_to_cvm(float** gpu_cvm, - const void* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream - ) { - value_to_cvm_impl(gpu_cvm, (DyGpuValue*)gpu_value, gpu_keys, slot_num, key_len, - slot_dim, total_length, hidden_size, value_size, stream); - } - virtual void grad_to_push(void* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - grad_to_push_impl((DyGpuPushValue*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, 
value_size, batch_size, slot_vector, stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu deleted file mode 100644 index 755f9a9cd97b1e..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h" - -namespace paddle { -namespace framework { - -void GlobalValueTransfor::init(std::string accessor_type, std::string gpu_value_type) { - if (transobj_ != nullptr) { - return; - } - if (accessor_type == "DownpourCtrDymfAccessor" && gpu_value_type == "DyFeatureValue") { - transobj_ = (ValueTransfor*)(new T_DyGpuValue_DownpourCtrDymfAccessor()); - } else if (accessor_type == "DownpourCtrAccessor" && gpu_value_type == "FeatureValue") { - transobj_ = (ValueTransfor*)(new T_GpuValue_DownpourCtrAccessor()); - } - return; -} - -ValueTransfor* GlobalValueTransfor::get_value_transfor() { - return transobj_; -} - - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 239451cd00f1e6..c454a0b0ba0be9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -17,88 +17,128 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/wrapper/fleet.h" -#endif -#ifdef PADDLE_WITH_PSLIB -#include -#endif namespace paddle { namespace framework { - -#define TYPE_ALIGN(ALIGNVAL, LEN) (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +#define MF_DIM 8 typedef uint64_t FeatureKey; -class ValueTransfor { -public: - virtual int get_gpu_value_size(int dim_size) = 0; - virtual int get_gpu_push_value_size(int dim_size) = 0; - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) = 0; - virtual void value_gpu_to_cpu(void* gpu) = 0; - virtual void value_to_cvm(float** gpu_cvm, //写入的结果,cvm二维数组 - const void* gpu_value, //查表出来的sparse数据 - FeatureKey** gpu_keys, //对应的key的二维数组(内部需要用来判断是否为0) - const int slot_num, //一共有多少个slot - const int64_t* key_len, //每个slot下面有多少个key - const int* slot_dim, //每个slot的维度数据(可能为空,只有动态维度模式才会有值) - int64_t total_length, //总共有多少个key - int hidden_size, //非动态维度的情况下,cvm维度数 - int value_size, //动态维度下,value的字节大小 - cudaStream_t stream //流 - ) = 0; - virtual void grad_to_push(void* push_value, //写入的结果,连续的pushvalue类型值 - float** grad_value, //梯度信息 - const int slot_num, //一共有多少个slot - const int64_t* grad_len, //每个slot下面有多少个梯度 - const int* slot_dim, //每个slot的维度数据(可能为空,只有动态维度模式才会有值) - int64_t total_length, //总共有多少个梯度 - int hidden_size, //非动态维度的情况下,梯度维度数 - int value_size, //动态维度下,value的字节大小 - int batch_size, //mini-batch - const int* slot_vector, //slot的编号信息 - cudaStream_t stream //流 - ) = 0; +/* +struct FeatureValue { + float delta_score; + float show; + float clk; + int slot; + float lr; + float lr_g2sum; + int mf_size; + float mf[MF_DIM + 1]; + uint64_t cpu_ptr; + + friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) { + out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot + << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_size; ++i) { + out << " " << val.mf[i]; + } + return out; + } }; -class GlobalValueTransfor { -public: - static GlobalValueTransfor& get_instance() { - static GlobalValueTransfor ins; - return ins; +struct FeaturePushValue { + float show; + float clk; + int slot; + float lr_g; + float mf_g[MF_DIM]; + + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; } - void init(std::string accessor_type, std::string gpu_value_type); - ValueTransfor* get_value_transfor(); -private: - ValueTransfor* transobj_ = nullptr; }; -#define g_transfor GlobalValueTransfor::get_instance().get_value_transfor() - -class PinnedVector { -public: - template - PinnedVector(const Type* buf, const size_t len, gpuStream_t& stream, const paddle::platform::Place& place) { - mem_cpu_ = memory::Alloc(phi::GPUPinnedPlace(), len); - memcpy(reinterpret_cast(mem_cpu_->ptr()), buf, len); - mem_gpu_ = memory::Alloc(place, len); - cudaMemcpyAsync(reinterpret_cast(mem_gpu_->ptr()), reinterpret_cast(mem_cpu_->ptr()), - len, cudaMemcpyHostToDevice, stream); +*/ + +struct FeatureValue { + float delta_score; + float show; + float clk; + int slot; + float lr; + float lr_g2sum; + int mf_size; + int mf_dim; + uint64_t cpu_ptr; + float mf[0]; + + friend 
std::ostream& operator<<(std::ostream& out, FeatureValue& val) { + out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot + << " lr: " << val.lr << " mf_dim: " << val.mf_dim << "cpuptr: " << val.cpu_ptr + << " mf_size: " << val.mf_size << " mf:"; + for (int i = 0; i < val.mf_dim + 1; ++i) { + out << " " << val.mf[i]; + } + return out; } - template - Type* get_gpu_ptr() { - return reinterpret_cast(mem_gpu_->ptr()); + __device__ __forceinline__ void operator=(const FeatureValue& in) { + delta_score = in.delta_score; + show = in.show; + clk = in.clk; + slot = in.slot; + lr = in.lr; + lr_g2sum = in.lr_g2sum; + mf_size = in.mf_size; + mf_dim = in.mf_dim; + cpu_ptr = in.cpu_ptr; + for (int i = 0; i < mf_dim + 1; i++) { + mf[i] = in.mf[i]; + } } -private: - memory::allocation::AllocationPtr mem_cpu_; - memory::allocation::AllocationPtr mem_gpu_; }; -} // end namespace framework -} // end namespace paddle +struct FeaturePushValue { + float show; + float clk; + int slot; + float lr_g; + int mf_dim; + float mf_g[0]; + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } + __device__ __forceinline__ void operator=(const FeaturePushValue& in) { + show = in.show; + clk = in.clk; + slot = in.slot; + lr_g = in.lr_g; + mf_dim = in.mf_dim; + for (int i = 0; i < mf_dim; i++) { + mf_g[i] = in.mf_g[i]; + } + } +}; +} // end namespace framework +} // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h deleted file mode 100644 index ee19c52ade95d1..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" - -namespace paddle { -namespace framework { - -template -__global__ void kernel_value_to_cvm(float** dest, ValueType* src, FeatureKey** keys, const int slot_num, - const int64_t* len, const int* slot_dim, int64_t total_len, int hidden_size, int value_size) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - int cur_dim =hidden_size - 3; - //动态维度 - if (slot_dim != nullptr) { - cur_dim = slot_dim[x] - 3; - } - char* p_src = (char*)(src); - ValueType* value_ptr = (ValueType*)(p_src + uint64_t(i) * uint64_t(value_size)); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (cur_dim + 3)) = 0; - *(dest[x] + y * (cur_dim + 3) + 1) = 0; - *(dest[x] + y * (cur_dim + 3) + 2) = 0; - for (int j = 0; j < cur_dim; j++) { - *(dest[x] + y * (cur_dim + 3) + 3 + j) = 0; - } - } else { - value_ptr->to_cvm(dest[x] + y * (cur_dim + 3), cur_dim); - } - } -} - -template -__global__ void kernel_grad_to_push(PushValueType* des, float** src, const int slot_num, const int64_t* len, - const int* slot_dim, int64_t total_len, int hidden_size, int value_size, - int batch_size, const int* slot_vector) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - char* d_src = (char*)(des); - PushValueType* value_ptr = (PushValueType*)(d_src + i * value_size); - int mf_dim = hidden_size - 3; - if (slot_dim != nullptr) { - mf_dim = slot_dim[x]; - } - int slot_id = slot_vector[x]; - value_ptr->from_grad(src[x] + y * (mf_dim + 3), mf_dim, slot_id, batch_size); - } -} - -class ValueTransforImp : public ValueTransfor { -protected: - template - void value_to_cvm_impl( float** gpu_cvm, - ValueType* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream) { - kernel_value_to_cvm<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_cvm, gpu_value, gpu_keys, slot_num, key_len, slot_dim, total_length, hidden_size, value_size); - cudaStreamSynchronize(stream); - } - template - void grad_to_push_impl(PushValueType* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - kernel_grad_to_push<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - (PushValueType*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, value_size, batch_size, slot_vector); - cudaStreamSynchronize(stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h b/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h deleted file mode 100644 index 8c8afc31ddcfa3..00000000000000 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#ifdef PADDLE_WITH_HETERPS - -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" - -namespace paddle { -namespace framework { - -struct GpuValue { - float delta_score; - float show; - float clk; - int slot; - float lr; - float lr_g2sum; - int mf_size; - float mf[8 + 1]; - uint64_t cpu_ptr; - __host__ __device__ __forceinline__ GpuValue() { - delta_score = 0; - show = 0; - clk = 0; - slot = -1; - lr = 0; - lr_g2sum = 0; - mf_size = 0; - cpu_ptr = 0; - } - __device__ __forceinline__ void operator=(const GpuValue& in) { - delta_score = in.delta_score; - show = in.show; - clk = in.clk; - slot = in.slot; - lr = in.lr; - lr_g2sum = in.lr_g2sum; - mf_size = in.mf_size; - cpu_ptr = in.cpu_ptr; - for (int i = 0; i < 8 + 1; i++) { - mf[i] = in.mf[i]; - } - } - __device__ __forceinline__ void to_cvm(float* des, int dim) { - des[0] = show; - des[1] = clk; - des[2] = lr; - if (mf_size == 0) { - for (int i = 0; i < dim; i++) { - des[3 + i] = 0; - } - } else { - for (int i = 0; i < dim; i++) { - des[3 + i] = mf[1 + i]; - } - } - } -}; - -struct GpuPushValue { - float show; - float clk; - int slot; - float lr_g; - float mf_g[8]; - __device__ __forceinline__ void from_grad(const float* grad, int dim, int slot_id, int batch_size) { - this->slot = slot_id; - this->show = grad[0]; - this->clk = grad[1]; - this->lr_g = grad[2] * -1. * batch_size; - for (int j = 0; j < dim; j++) { - this->mf_g[j] = grad[3 + j] * -1. * batch_size; - } - } - __device__ __forceinline__ GpuPushValue& operator+=(const GpuPushValue& input) { - show += input.show; - clk += input.clk; - lr_g += input.lr_g; - for (int i = 0; i < 8; i++) { - mf_g[i] += input.mf_g[i]; - } - return *this; - } - __device__ __forceinline__ void operator=(const GpuPushValue& input) { - show = input.show; - clk = input.clk; - slot = input.slot; - lr_g = input.lr_g; - for (int i = 0; i < 8; i++) { - mf_g[i] = input.mf_g[i]; - } - } -}; - -template <> -class Optimizer { - public: - Optimizer() {} - ~Optimizer() {} - void initialize() {} - - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); - double scaled_grad = g / scale; - w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; - add_g2sum += scaled_grad * scaled_grad; - g2sum += add_g2sum; - } - - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { - double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); - for (int i = 0; i < n; ++i) { - double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; - } - - __device__ void update_value(GpuValue* ptr, const GpuPushValue& grad, curandState& state) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; - - update_lr(ptr->lr, 
ptr->lr_g2sum, grad.lr_g, grad.show); - - if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { - ptr->mf_size = 9; - ptr->mf[0] = 0; - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = 0; i < 8; ++i) { - ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; - } - } - } else { - update_mf(8, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, - grad.show); // for local test - } - } -}; - -class T_GpuValue_DownpourCtrAccessor : public ValueTransforImp { -public: - virtual int get_gpu_value_size(int dim_size) { - int ret = sizeof(GpuValue); - return TYPE_ALIGN(8, ret); - } - virtual int get_gpu_push_value_size(int dim_size) { - int ret = sizeof(GpuPushValue); - return TYPE_ALIGN(8, ret); - } - virtual void value_cpu_to_gpu(void* cpu, void* gpu, int dim_size) { -#ifdef PADDLE_WITH_PSLIB - paddle::ps::DownpourFixedFeatureValue* cpu_value = (paddle::ps::DownpourFixedFeatureValue*)cpu; - GpuValue* gpu_value = (GpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - gpu_value->delta_score = ptr_cpu_data[1]; - gpu_value->show = ptr_cpu_data[2]; - gpu_value->clk = ptr_cpu_data[3]; - gpu_value->slot = ptr_cpu_data[6]; - gpu_value->lr = ptr_cpu_data[4]; - gpu_value->lr_g2sum = ptr_cpu_data[5]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - if (dim > 7) { - gpu_value->mf_size = 9; - for (int x = 0; x < gpu_value->mf_size; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 7]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < 9; x++) { - gpu_value->mf[x] = 0; - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - const paddle::distributed::FixedFeatureValue* cpu_value = (const paddle::distributed::FixedFeatureValue*)cpu; - GpuValue* gpu_value = (GpuValue*)gpu; - const float* ptr_cpu_data = cpu_value->data(); - size_t dim = cpu_value->size(); - gpu_value->delta_score = ptr_cpu_data[2]; - gpu_value->show = ptr_cpu_data[3]; - gpu_value->clk = ptr_cpu_data[4]; - gpu_value->slot = ptr_cpu_data[0]; - gpu_value->lr = ptr_cpu_data[5]; - gpu_value->lr_g2sum = ptr_cpu_data[6]; - gpu_value->cpu_ptr = (uint64_t)(cpu); - if (dim > 7) { - gpu_value->mf_size = 9; - for (int x = 0; x < gpu_value->mf_size; x++) { - gpu_value->mf[x] = ptr_cpu_data[x + 7]; - } - } else { - gpu_value->mf_size = 0; - for (int x = 0; x < 9; x++) { - gpu_value->mf[x] = 0; - } - } -#endif - } - virtual void value_gpu_to_cpu(void* gpu) { -#ifdef PADDLE_WITH_PSLIB - GpuValue* gpu_value = (GpuValue*)gpu; - paddle::ps::DownpourFixedFeatureValue& cpu_fix = *((paddle::ps::DownpourFixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_fix.resize(7 + gpu_value->mf_size); - } - float* cpu_value = cpu_fix.data(); - cpu_value[1] = gpu_value->delta_score; - cpu_value[2] = gpu_value->show; - cpu_value[3] = gpu_value->clk; - cpu_value[4] = gpu_value->lr; - cpu_value[5] = gpu_value->lr_g2sum; - cpu_value[6] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_size; x++) { - cpu_value[x + 7] = gpu_value->mf[x]; - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - GpuValue* gpu_value = (GpuValue*)gpu; - paddle::distributed::FixedFeatureValue& cpu_value = *((paddle::distributed::FixedFeatureValue*)(gpu_value->cpu_ptr)); - if (gpu_value->mf_size > 0) { - cpu_value.resize(7 + gpu_value->mf_size); - } - cpu_value[2] = gpu_value->delta_score; - cpu_value[3] = gpu_value->show; - cpu_value[4] = gpu_value->clk; - cpu_value[5] = 
gpu_value->lr; - cpu_value[6] = gpu_value->lr_g2sum; - cpu_value[0] = gpu_value->slot; - if (gpu_value->mf_size > 0) { - for (int x = 0; x < gpu_value->mf_size; x++) { - cpu_value[x + 7] = gpu_value->mf[x]; - } - } -#endif -} - virtual void value_to_cvm(float** gpu_cvm, - const void* gpu_value, - FeatureKey** gpu_keys, - const int slot_num, - const int64_t* key_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - cudaStream_t stream - ) { - value_to_cvm_impl(gpu_cvm, (GpuValue*)gpu_value, gpu_keys, slot_num, key_len, - slot_dim, total_length, hidden_size, value_size, stream); - } - virtual void grad_to_push(void* push_value, - float** grad_value, - const int slot_num, - const int64_t* grad_len, - const int* slot_dim, - int64_t total_length, - int hidden_size, - int value_size, - int batch_size, - const int* slot_vector, - cudaStream_t stream - ) { - grad_to_push_impl((GpuPushValue*)push_value, grad_value, slot_num, grad_len, slot_dim, - total_length, hidden_size, value_size, batch_size, slot_vector, stream); - } -}; - -} -} - -#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 83d71ac912a31d..035478e7d7cfda 100755 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -54,12 +54,22 @@ class HashTable { virtual ~HashTable(); HashTable(const HashTable&) = delete; HashTable& operator=(const HashTable&) = delete; + void insert(const KeyType* d_keys, const ValType* d_vals, size_t len, + gpuStream_t stream); void insert(const KeyType* d_keys, size_t len, char* pool, size_t feature_value_size, size_t start_index, gpuStream_t stream); - void get(const KeyType* d_keys, ValType d_vals, size_t len, gpuStream_t stream); + void get(const KeyType* d_keys, ValType* d_vals, size_t len, + gpuStream_t stream); + void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream); void show(); + void dump_to_cpu(int devid, cudaStream_t stream); + template - void update(const KeyType* d_keys, const GradType* d_grads, size_t len, Sgd sgd, + void update(const KeyType* d_keys, const GradType* d_grads, size_t len, + Sgd sgd, gpuStream_t stream); + + template + void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, gpuStream_t stream); int size() { return container_->size(); } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 6e2543e4b38b0a..b4be4c1f8c35ca 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -26,6 +26,23 @@ struct ReplaceOp { } }; +template +__global__ void insert_kernel(Table* table, + const typename Table::key_type* const keys, + const typename Table::mapped_type* const vals, + size_t len) { + ReplaceOp op; + thrust::pair kv; + + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + kv.first = keys[i]; + kv.second = vals[i]; + auto it = table->insert(kv, op); + assert(it != table->end() && "error: insert fails: table is full"); + } +} + template __global__ void insert_kernel(Table* table, const typename Table::key_type* const keys, @@ -45,40 +62,46 @@ __global__ void insert_kernel(Table* table, } } -template +template __global__ void search_kernel(Table* table, - const typename Table::key_type* const keys, - ValType* vals, size_t len, - size_t pull_feature_value_size) { + const typename 
Table::key_type* const keys, + typename Table::mapped_type* const vals, + size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); - char* d_value = (char*)(vals); if (it != table->end()) { - uint64_t offset = i * pull_feature_value_size; - ValType* cur = (ValType*)(d_value + offset); - ValType& input = *(ValType*)(it->second); - *cur = input; + vals[i] = it->second; } else { - if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); - ValType* cur = (ValType*)(d_value + i * pull_feature_value_size); - *cur = ValType(); + printf("pull miss key: %llu", keys[i]); } } } -template -__global__ void update_kernel(Table* table, +template +__global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, - const GradType* grads, curandState* p_state, size_t len, - Sgd sgd, size_t grad_value_size) { + char* vals, size_t len, + size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - char* grads_tmp = (char*)(grads); - GradType* cur = (GradType*)(grads_tmp + i * grad_value_size); - sgd.update_value((it.getter())->second, *cur, p_state[i]); + uint64_t offset = i * pull_feature_value_size; + FeatureValue* cur = (FeatureValue*)(vals + offset); + FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } } else { if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); } @@ -147,6 +170,72 @@ class CuRandState { curandState* states_ = nullptr; }; +template +__global__ void update_kernel(Table* table, + const typename Table::key_type* const keys, + const GradType* const grads, curandState* p_state, + size_t len, Sgd sgd) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + sgd.update_value((it.getter())->second, grads[i], p_state[i]); + } else { + printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void update_kernel(Table* table, + const typename Table::key_type* const keys, + const GradType* const grads, size_t len, + Sgd sgd) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + sgd.update_value((it.getter())->second, grads[i]); + } else { + printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void dy_mf_update_kernel(Table* table, + const typename Table::key_type* const keys, + const char* const grads, size_t len, + Sgd sgd, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); + sgd.dy_mf_update_value((it.getter())->second, *cur); + } else { + if (keys[i] != 0) printf("push miss key: %llu", keys[i]); + } + } +} + +template +__global__ void dy_mf_update_kernel(Table* table, + const typename Table::key_type* const keys, + const char* const grads, curandState* p_state, size_t len, + Sgd sgd, size_t grad_value_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i 
< len) { + auto it = table->find(keys[i]); + if (it != table->end()) { + FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); + sgd.dy_mf_update_value((it.getter())->second, *cur, p_state[i]); + } else { + if(keys[i] != 0) printf("push miss key: %llu", keys[i]); + } + } +} + template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); @@ -164,15 +253,39 @@ void HashTable::show() { } template -void HashTable::get(const KeyType* d_keys, ValType d_vals, size_t len, gpuStream_t stream) { +void HashTable::get(const KeyType* d_keys, ValType* d_vals, + size_t len, gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + search_kernel<<>>(container_, d_keys, + d_vals, len); +} + +template +void HashTable::get(const KeyType* d_keys, char* d_vals, + size_t len, gpuStream_t stream) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - search_kernel<<>>( + dy_mf_search_kernel<<>>( container_, d_keys, d_vals, len, pull_feature_value_size_); } +template +void HashTable::insert(const KeyType* d_keys, + const ValType* d_vals, size_t len, + gpuStream_t stream) { + if (len == 0) { + return; + } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + insert_kernel<<>>(container_, d_keys, + d_vals, len); +} + template void HashTable::insert(const KeyType* d_keys, size_t len, char* pool, size_t feature_value_size, @@ -189,6 +302,87 @@ void HashTable::insert(const KeyType* d_keys, size_t len, container_, d_keys, len, pool, feature_value_size, start_index); } +template +void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { + container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; + size_t num = container_->size(); + KeyType unuse_key = std::numeric_limits::max(); + thrust::pair* kv = container_->data(); + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + // useless + if (cpu_val[6] <= 0) { + cpu_val[6] = gpu_val.slot * -1; + } else { + cpu_val[6] = gpu_val.slot; + } + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } + } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[2] = gpu_val.delta_score; + cpu_val[3] = gpu_val.show; + cpu_val[4] = gpu_val.clk; + cpu_val[5] = gpu_val.lr; + cpu_val[6] = gpu_val.lr_g2sum; + cpu_val[0] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = 
gpu_val.mf[x]; + } + } +#endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + + // container_->prefetch(devid, stream); +} + template template void HashTable::update(const KeyType* d_keys, @@ -197,10 +391,26 @@ void HashTable::update(const KeyType* d_keys, if (len == 0) { return; } - const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; auto state = CuRandState::get(); auto d_state = state->get(len, stream); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; update_kernel<<>>( + container_, d_keys, d_grads, d_state, len, sgd); + CuRandState::push(state, stream); +} + +template +template +void HashTable::update(const KeyType* d_keys, + const char* d_grads, size_t len, + Sgd sgd, gpuStream_t stream) { + if (len == 0) { + return; + } + auto state = CuRandState::get(); + auto d_state = state->get(len, stream); + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + dy_mf_update_kernel<<>>( container_, d_keys, d_grads, d_state, len, sgd, push_grad_value_size_); CuRandState::push(state, stream); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 2c5e8cb90d4e02..e0ce6f55ba66e4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -37,6 +37,43 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +struct CustomGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + return out; + } + + template + __device__ __forceinline__ + void copy_basic_field(T& output, const T& input) { + output.slot = input.slot; + output.show = input.show; + output.clk = input.clk; + output.mf_dim = input.mf_dim; + output.lr_g = input.lr_g; + for(int i = 0; i < output.mf_dim ; ++i) { + output.mf_g[i] = input.mf_g[i]; + } + } + template + __device__ __forceinline__ + void add_basic_field(T& output, const T& input) { + output.show += input.show; + output.clk += input.clk; + output.lr_g += input.lr_g; + for(int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; + } + } +}; + template class HeterComm { public: @@ -44,40 +81,66 @@ class HeterComm { virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete; - int get_index_by_devid(int devid); - void set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size); - void set_multi_mf_dim(int max_mf_dim); - void show_one_table(int gpu_num); + + void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, + int* left, int* right, int gpu_num); + void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len); // NOLINT + void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, float* mf, + size_t len, int& uniq_len); + void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len, + size_t chunk_size, int stream_num); void build_ps(int num, KeyType* h_keys, char* pool, size_t len, size_t feature_value_size, size_t 
chunk_size, int stream_num); - void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void dump(); + void show_one_table(int gpu_num); + int get_index_by_devid(int devid); + template - void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); + void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd); // NOLINT + template void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads, - size_t len, Sgd& sgd); -private: - void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, - int* left, int* right, int gpu_num); - void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); - void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, - KeyType* src_key, char* src_val, size_t val_size); - void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, - char* src_val, size_t val_size); - void destroy_storage(int start_index, int end_index); - void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len); - int log2i(int x); - int gather_one_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); - int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); + size_t len, Sgd& sgd); // NOLINT + template - void update_one_table(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); + void update_one_table(int num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd); // NOLINT + + int gather_one_node_grad(int num, KeyType* d_keys, GradType* d_grads, + int len); + + int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, + int len); + + int log2i(int x); + + void set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { + nccl_inner_comms_ = inner_comms; + nccl_inter_comms_ = inter_comms; + node_size_ = comm_size; + } + + void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + + multi_mf_dim_ = multi_mf_dim; + max_mf_dim_ = max_mf_dim; + VLOG(3) << "heter comm set multi multi_mf_dim_: " << multi_mf_dim_ << " max_mf_dim_: " << max_mf_dim_; + } + bool need_transfer(int send_id, int receive_id) { return ((send_id / 4 != receive_id / 4) && (send_id + 4) % 8 != receive_id); } + + // void dump_to_cpu(int index); + + void end_pass(); + int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } - void init_path(); struct Node { cudaStream_t in_stream; @@ -89,14 +152,17 @@ class HeterComm { size_t val_bytes_len; int gpu_num; }; + struct Path { std::vector nodes_; }; + struct CopyTask { Path* path; int step; CopyTask(Path* path_, int step_) : path(path_), step(step_) {} }; + struct LocalStorage { LocalStorage() {} void init(int size, int dev_id) { @@ -135,13 +201,23 @@ class HeterComm { GradType* local_grads; }; - - - + void init_path(); + + void create_storage(int start_index, int end_index, size_t keylen, size_t vallen); + void destroy_storage(int start_index, int end_index); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, GradType* src_val); + void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, + KeyType* src_key, char* src_val, size_t val_size); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + ValType* src_val); + void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, + char* src_val, size_t val_size); protected: using Table = HashTable; using PtrTable = HashTable; + std::vector 
tables_; std::vector ptr_tables_; std::shared_ptr resource_; std::vector> path_; @@ -150,6 +226,7 @@ class HeterComm { private: std::vector storage_; + CustomGradMerger merger_; int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; @@ -157,6 +234,7 @@ class HeterComm { std::vector nccl_inter_comms_; int node_size_; std::vector> allocators_; + int multi_mf_dim_{8}; int max_mf_dim_ = 8; }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index f54d33fb1c945b..156035da41a1cf 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace framework { -//填充索引 template __global__ void fill_idx(T* idx, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -28,17 +27,18 @@ __global__ void fill_idx(T* idx, size_t len) { } } -//计算key归属在哪个gpu里面 -template -__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, - int total_gpu) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - shard_index[i] = d_keys[i] % total_gpu; +template +void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) { + T tmp[len]; // NOLINT + cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + std::cout << name; + for (int i = 0; i < len; ++i) { + std::cout << ":" << tmp[i]; } + std::cout << std::endl; } -//计算每个卡归属key的起始地址 template __global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -56,7 +56,15 @@ __global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { } } -//按照索引进行key的重排 +template +__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, + int total_gpu) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + shard_index[i] = d_keys[i] % total_gpu; + } +} + template __global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, size_t len) { @@ -66,21 +74,19 @@ __global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, } } -//填充结果 -template -__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, - size_t len, size_t val_size) { +template +__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, + T* idx, size_t len) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { - uint64_t new_offset = uint64_t(idx[i]) * val_size; - *(ValType*)((char*)d_vals + new_offset) = - *(ValType*)((char*)d_shard_vals + i * val_size); + d_shard_keys[i] = d_keys[idx[i]]; + d_shard_grads[i] = d_grads[idx[i]]; } } -//填充结果 template -__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, +__global__ void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, GradType* d_shard_grads, GradType* d_grads, T* idx, size_t len, size_t grad_value_size) { @@ -92,30 +98,51 @@ __global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, } } -/* 相同key的梯度聚合到一起*/ -template __global__ void merge_gradient_kernel(const uint32_t* offset, const uint32_t* fea_num, - const uint32_t* index, GradType* input, - GradType* output, int n, - size_t grad_value_size) { + const uint32_t* index, const char* input, + char* output, int n, + size_t grad_value_size, + CustomGradMerger& merger_) { 
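  // Sketch of what this kernel computes, assuming offset / fea_num / index were
  // produced by the radix-sort + DeviceRunLengthEncode::Encode +
  // DeviceScan::ExclusiveSum sequence in merge_grad below: thread i owns the
  // i-th de-duplicated key, copies the first gradient belonging to that key and
  // accumulates the remaining ones. Values below are hypothetical, and
  // grad_at / accumulate / write_at are placeholder helpers standing in for the
  // strided char* accesses used in the body.
  //
  //   raw keys             : {7, 3, 7, 3}   sorted keys : {3, 3, 7, 7}
  //   index  (sorted->raw) : {1, 3, 0, 2}
  //   fea_num (run length) : {2, 2}
  //   offset  (excl. scan) : {0, 2}
  //
  //   // host-side equivalent, one iteration per unique key:
  //   for (int i = 0; i < n; ++i) {
  //     FeaturePushValue merged = grad_at(input, index[offset[i]]);   // copy_basic_field
  //     for (uint32_t j = 1; j < fea_num[i]; ++j) {
  //       accumulate(merged, grad_at(input, index[offset[i] + j]));   // add_basic_field
  //     }
  //     write_at(output, i, merged);  // both buffers are strided by grad_value_size
  //   }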
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - char* tmp_in = (char*)(input); - char* tmp_out = (char*)(output); - GradType& out_value = *((GradType*)(tmp_out + size_t(i) * grad_value_size)); - GradType& in_value = *((GradType*)(tmp_in + size_t(ori_index) * grad_value_size)); - out_value = in_value; + + FeaturePushValue& lhs = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& in = + *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.copy_basic_field(lhs, in); + for (int j = 1; j < num; ++j) { ori_index = index[start + j]; - GradType& in_value_tmp = *((GradType*)(tmp_in + size_t(ori_index) * grad_value_size)); - out_value += in_value_tmp; + FeaturePushValue& rhs = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); + merger_.add_basic_field(lhs, rhs); } - } + } + +} + +template +__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_vals[idx[i]] = d_shard_vals[i]; + } +} + +template +__global__ void dy_mf_fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + size_t len, size_t val_size) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + uint64_t new_offset = uint64_t(idx[i]) * val_size; + *(ValType*)((char*)d_vals + new_offset) = + *(ValType*)((char*)d_shard_vals + i * val_size); + } } template @@ -124,16 +151,22 @@ HeterComm::HeterComm( VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_gpu()); + multi_mf_dim_ = resource->multi_mf(); for (int i = 0; i < resource_->total_gpu(); ++i) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); // allocators_.push_back(std::make_shared( // 2, 1, 20, (size_t)-1, false, false)); // NOLINT allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); - { + if (!multi_mf_dim_) { + auto table = new Table(capacity / load_factor_); + tables_.push_back(table); + } else { max_mf_dim_ = resource->max_mf_dim(); - size_t val_type_size = g_transfor->get_gpu_value_size(max_mf_dim_); - size_t grad_type_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); + size_t val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + size_t grad_type_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); auto ptr_table = new PtrTable(capacity / load_factor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); @@ -147,445 +180,148 @@ HeterComm::HeterComm( } template -HeterComm::~HeterComm() { - for (auto& table : ptr_tables_) { - delete table; - table = nullptr; - } -} - -/* - 通过gpu设备id,获得索引id,内部的一个映射关系 -*/ -template -int HeterComm::get_index_by_devid(int devid) { - return resource_->get_index_by_devid(devid); -} +void HeterComm::init_path() { + int total_gpu = resource_->total_gpu(); + path_.resize(total_gpu); -/* - 参数设置 -*/ -template -void HeterComm::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { - nccl_inner_comms_ = inner_comms; - nccl_inter_comms_ = inter_comms; - node_size_ = comm_size; + if (!topo_aware_) { + VLOG(0) << "init path without topo aware"; + for (int i = 0; i < total_gpu; ++i) { + path_[i].resize(total_gpu); + for (int j = 0; j < total_gpu; ++j) { + auto& nodes = path_[i][j].nodes_; + 
nodes.resize(1); + nodes[0].in_stream = resource_->comm_stream(i, j); + nodes[0].out_stream = resource_->comm_stream(i, j); + nodes[0].key_storage = NULL; + nodes[0].val_storage = NULL; + nodes[0].sync = 0; + nodes[0].gpu_num = j; + } + } + } else { + VLOG(0) << "init path with topo aware"; + for (int i = 0; i < total_gpu; ++i) { + path_[i].resize(total_gpu); + for (int j = 0; j < total_gpu; ++j) { + auto& nodes = path_[i][j].nodes_; + int from = resource_->dev_id(i); + int to = resource_->dev_id(j); + int transfer_id = i; + if (need_transfer(from, to)) { + transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); + nodes.push_back(Node()); + Node& node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 1; + node.gpu_num = transfer_id; + } + nodes.push_back(Node()); + Node& node = nodes.back(); + node.in_stream = resource_->comm_stream(i, transfer_id); + node.out_stream = resource_->comm_stream(transfer_id, i); + node.key_storage = NULL; + node.val_storage = NULL; + node.sync = 0; + node.gpu_num = j; + } + } + } } -/* - 参数设置 -*/ template -void HeterComm::set_multi_mf_dim(int max_mf_dim) { - max_mf_dim_ = max_mf_dim; -} +void HeterComm::create_storage(int start_index, + int end_index, + size_t keylen, + size_t vallen) { + auto& allocator = allocators_[start_index]; + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( + resource_->dev_id(nodes[i].gpu_num), + (void**)&(nodes[i].key_storage), // NOLINT + keylen, resource_->remote_stream(nodes[i].gpu_num, start_index))); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( + resource_->dev_id(nodes[i].gpu_num), + (void**)&(nodes[i].val_storage), // NOLINT + vallen, resource_->remote_stream(nodes[i].gpu_num, start_index))); -template -void HeterComm::show_one_table(int gpu_num) { - //todo impl + nodes[i].key_bytes_len = keylen; + nodes[i].val_bytes_len = vallen; + } } -/* 建表 - num: 是哪一个gpu - h_keys: 具体的key(cpu内存) - pool: 具体的value(gpu内存) - len: key的数量 - feature_value_size: value的大小(value可能是变长的) - chunk_size: 内部需要,多少个key做一次处理(多流加速处理) - stream_num: 流数量 -*/ template -void HeterComm::build_ps(int num, KeyType* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { - if (len <= 0) { - return; - } - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - std::vector d_key_bufs; - std::vector h_pinned_key_bufs; - gpuStream_t streams[stream_num]; - for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); - auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); - d_key_bufs.push_back(std::move(d_k_buf)); - auto h_k_buf = memory::Alloc(phi::GPUPinnedPlace(), chunk_size * sizeof(KeyType)); - h_pinned_key_bufs.push_back(std::move(h_k_buf)); - } - - int cur_len = 0; - int cur_stream = 0; - while (cur_len < len) { - cur_stream = cur_stream % stream_num; - cudaStreamSynchronize(streams[cur_stream]); - int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; - memcpy(h_pinned_key_bufs[cur_stream]->ptr(), h_keys + cur_len, sizeof(KeyType) * tmp_len); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_pinned_key_bufs[cur_stream]->ptr(), - sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - ptr_tables_[num]->insert( - reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, - pool, feature_value_size, cur_len, streams[cur_stream]); - cur_stream += 1; - cur_len += tmp_len; - } +void HeterComm::destroy_storage(int start_index, + int end_index) { + auto& allocator = allocators_[start_index]; + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - for (int i = 0; i < stream_num; ++i) { - cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + nodes[i].key_storage)); + PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + nodes[i].val_storage)); } } -/* 查表操作 - num: 是哪个gpu - d_keys: 要查的key - d_vals: 查出来的结果存入这个地方(table里面的value是指针,这里需要构造出value本体出来放在这里) - len: 有多少个key -*/ template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - ValType* d_vals, - size_t len) { - if (len == 0) { - return; +void HeterComm::walk_to_dest( + int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, + GradType* src_val) { + int need_copy_val = 0; + if (src_val) { + need_copy_val = 1; } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(num, 0); - - int grid_size = (len - 1) / block_size_ + 1; - - auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - int* h_left = reinterpret_cast(h_left_alloc->ptr()); - int* h_right = reinterpret_cast(h_right_alloc->ptr()); - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - size_t val_type_size = g_transfor->get_gpu_value_size(max_mf_dim_); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); - KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * val_type_size); - ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); - - //将key切分到不同的卡里面去(这里计算出来的只是中间结果) - split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - //将零散的key进行重排,保证归属在相同gpu卡里面的key连续存储,并对应上d_left_ptr&d_right_ptr - fill_shard_key<<>>(d_shard_keys_ptr, - d_keys, d_idx_ptr, len); - - cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - //创建分卡显存资源 - for (int i = 0; i < total_gpu; ++i) { - int 
shard_len = h_right[i] - h_left[i] + 1; + std::queue que; + for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); - } - - //将数据转移到目的gpu卡中 - walk_to_dest(num, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL, 0); - - std::vector time_lines; - time_lines.resize(total_gpu); - - //开始查表操作了 - for (int i = 0; i < total_gpu; ++i) { - time_lines[i].Start(); - if (h_left[i] == -1) { - continue; + int size = path_[start_index][i].nodes_.size(); + auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); + que.push(t); + cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + if (need_copy_val) { + cudaMemcpyAsync(node.val_storage, + reinterpret_cast(src_val + h_left[i]), + node.val_bytes_len, cudaMemcpyDefault, node.in_stream); } - auto& node = path_[num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - - ptr_tables_[i]->rwlock_->RDLock(); - ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, num)); - if (h_left[i] == -1) { - continue; - } - ptr_tables_[i]->rwlock_->UNLock(); - time_lines[i].Pause(); - } - - //将各卡查到的结果转移到当前卡中来 - walk_to_src(num, total_gpu, h_left, h_right, reinterpret_cast(d_shard_vals_ptr), val_type_size); - - for (int i = 0; i < total_gpu; ++i) { - auto& node = path_[num][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); - } - - //填充结果 - fill_dvals<<>>( - d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size); - - //资源销毁 - cudaStreamSynchronize(stream); - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - destroy_storage(num, i); - } -} - -/* - * 梯度更新操作 - * gpu_num 是哪个gpu卡 - * d_keys: key指针 - * d_grads: 梯度数据 - * len: key长度 - * sgd: 优化器 -*/ -template -template -void HeterComm::push_sparse(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); - - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); - - // int h_left[total_gpu]; // NOLINT - // int h_right[total_gpu]; // NOLINT - auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); - int* h_left = reinterpret_cast(h_left_alloc->ptr()); - int* h_right = reinterpret_cast(h_right_alloc->ptr()); - - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); - int* d_left_ptr = reinterpret_cast(d_left->ptr()); - int* d_right_ptr = reinterpret_cast(d_right->ptr()); - - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - auto d_idx = memory::Alloc(place, len * sizeof(int)); - int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - - auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); - KeyType* 
d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - - auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); - - int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); - - int grid_size = (uniq_len - 1) / block_size_ + 1; - - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - gpu_num); - - fill_shard_grads<<>>( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len, grad_value_size); - - - cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - - for (int i = 0; i < total_gpu; ++i) { - int shard_len = h_right[i] - h_left[i] + 1; - if (h_left[i] == -1 || h_right[i] == -1) { - continue; - } - { - create_storage(gpu_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); - } - } - - { - walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); - } - - std::vector time_lines; - time_lines.resize(total_gpu); - - for (int i = 0; i < total_gpu; ++i) { - time_lines[i].Start(); - if (h_left[i] == -1 || h_right[i] == -1) { - continue; + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + if (cur_task.path->nodes_[cur_task.step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); } - auto& node = path_[gpu_num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - ptr_tables_[i]->rwlock_->WRLock(); - ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, gpu_num)); - } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); - if (h_left[i] != -1) { - { - ptr_tables_[i]->rwlock_->UNLock(); + if (cur_task.step != cur_task.path->nodes_.size() - 1) { + int cur_step = cur_task.step; + CopyTask c(cur_task.path, cur_step + 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); + if (need_copy_val) { + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream); } - time_lines[i].Pause(); - } - } - for (int i = 0; i < total_gpu; ++i) { - if (h_left[i] == -1 || h_right[i] == -1) { - continue; } - destroy_storage(gpu_num, i); - } -} - -/* 暂时没用到,看性能是否需要启用, 并优化 - 相比push_sparse,是用nccl做卡间的通信 -*/ -template -template -void HeterComm::push_sparse_multi_node( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - - int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); - - uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); - - uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len); - - update_one_table(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len, sgd); -} - - - - -/* 
将key拆分,看归属在哪个gpu里面 - d_keys: key列表 - d_idx_ptr: 结果, 按照gpu卡分块好的对应原始d_keys的下标 - left: 每个卡的首地址 - right: 每个卡的结束地址 - d_idx_ptr/left/right : 这三个值就知道了归属到每个卡的key列表了 -*/ -template -void HeterComm::split_input_to_shard( - KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, - int gpu_num) { - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); - - auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); - int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); - - auto d_shard_index = memory::Alloc(place, len * sizeof(int)); - int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); - - auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); - int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); - - int grid_size = (len - 1) / block_size_ + 1; - fill_idx<<>>(d_idx_tmp_ptr, len); - calc_shard_index<<>>( - d_keys, len, d_shard_index_tmp_ptr, total_gpu); - - size_t temp_storage_bytes; - const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - calc_shard_offset<<>>(d_shard_index_ptr, - left, right, len); - cudaStreamSynchronize(stream); -} - -/* 资源创建,因为要分卡操作,所以需要创建不同卡的资源 - start_index: 当前op计算的卡编号 - end_index: 要创建的哪个卡资源的卡编号 - keylen: key相关的显存大小 - vallen: value相关的显存大小 -*/ -template -void HeterComm::create_storage(int start_index, - int end_index, - size_t keylen, - size_t vallen) { - auto& allocator = allocators_[start_index]; - auto& nodes = path_[start_index][end_index].nodes_; - for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), - (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].gpu_num, start_index))); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), - (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].gpu_num, start_index))); - - nodes[i].key_bytes_len = keylen; - nodes[i].val_bytes_len = vallen; } } -/* - 将数据转入到目的gpu卡中 -*/ template void HeterComm::walk_to_dest( int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, @@ -638,12 +374,9 @@ void HeterComm::walk_to_dest( } } -/* - *将不同卡中的结果转移到当前卡中 -*/ template void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size) { + int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val) { std::queue que; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -652,7 +385,7 @@ void HeterComm::walk_to_src( int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { - cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), node.val_storage, node.val_bytes_len, 
cudaMemcpyDefault, node.out_stream); } else { @@ -682,7 +415,7 @@ void HeterComm::walk_to_src( cur_task.path->nodes_[cur_step - 1].out_stream); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().gpu_num; - cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, @@ -691,146 +424,689 @@ void HeterComm::walk_to_src( } } -/* - * 资源销毁 -*/ template -void HeterComm::destroy_storage(int start_index, - int end_index) { - auto& allocator = allocators_[start_index]; - auto& nodes = path_[start_index][end_index].nodes_; - for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); - - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), - nodes[i].key_storage)); - PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), - nodes[i].val_storage)); +void HeterComm::walk_to_src( + int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size) { + std::queue que; + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + int cur_step = path_[start_index][i].nodes_.size() - 1; + auto& node = path_[start_index][i].nodes_[cur_step]; + if (cur_step == 0) { + cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } else { + CopyTask t(&path_[start_index][i], cur_step - 1); + que.push(t); + cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, + node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + path_[start_index][i].nodes_[cur_step - 1].out_stream); + } + } + while (!que.empty()) { + CopyTask& cur_task = que.front(); + que.pop(); + int cur_step = cur_task.step; + if (cur_task.path->nodes_[cur_step].sync) { + cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); + } + if (cur_step > 0) { + CopyTask c(cur_task.path, cur_step - 1); + que.push(c); + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream); + } else if (cur_step == 0) { + int end_index = cur_task.path->nodes_.back().gpu_num; + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream); + } + } +} + +template +HeterComm::~HeterComm() { + if (!multi_mf_dim_) { + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } else { + for (auto& table : ptr_tables_) { + delete table; + table = nullptr; + } + for (auto& table : tables_) { + delete table; + table = nullptr; + } + } +} + +template +void HeterComm::show_one_table(int gpu_num) { + if (!multi_mf_dim_) { + tables_[gpu_num]->show(); + } else { + // ptr_tables_[gpu_num]->show(); + } +} + +template +int HeterComm::log2i(int x) { + unsigned res = 0; + while (x >>= 1) { + ++res; + } + return res; +} + +template +int HeterComm::get_index_by_devid(int devid) { + return resource_->get_index_by_devid(devid); +} + +template +void HeterComm::build_ps(int num, KeyType* h_keys, + ValType* h_vals, + size_t 
len, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + + std::vector d_key_bufs; + std::vector d_val_bufs; + + gpuStream_t streams[stream_num]; // NOLINT + for (int i = 0; i < stream_num; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + auto d_v_buf = memory::Alloc(place, chunk_size * sizeof(ValType)); + d_key_bufs.push_back(std::move(d_k_buf)); + d_val_bufs.push_back(std::move(d_v_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, + sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, + sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, + streams[cur_stream]); + cur_stream += 1; + cur_len += tmp_len; + } + + for (int i = 0; i < stream_num; ++i) { + cudaStreamSynchronize(streams[i]); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + } +} + +template +void HeterComm::build_ps(int num, KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { + if (len <= 0) { + return; + } + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + + // use hbm pool + std::vector d_key_bufs; + + gpuStream_t streams[stream_num]; + for (int i = 0; i < stream_num; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); + d_key_bufs.push_back(std::move(d_k_buf)); + } + + int cur_len = 0; + int cur_stream = 0; + + while (cur_len < len) { + cur_stream = cur_stream % stream_num; + int tmp_len = cur_len + chunk_size > len ? 
len - cur_len : chunk_size; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, + sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, + streams[cur_stream])); + ptr_tables_[num]->insert( + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, + pool, feature_value_size, cur_len, streams[cur_stream]); + cur_stream += 1; + cur_len += tmp_len; + } + + for (int i = 0; i < stream_num; ++i) { + cudaStreamSynchronize(streams[i]); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + } +} + +template +void HeterComm::merge_grad( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int& uniq_len) { // NOLINT + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, + d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + temp_storage_bytes = 0; + + auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); + int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, + d_grads, d_num_runs_out, merger_, len, stream, false)); + + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); + + cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +} + +template +void HeterComm::merge_grad(int gpu_num, + KeyType* d_keys, + GradType* d_grads, + float* mf, size_t len, + int& uniq_len) { + platform::Timer timeline; + timeline.Start(); + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + size_t temp_storage_bytes; + + //VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_merge_grads_ptr = + reinterpret_cast(d_merge_grads->ptr()); + + auto d_fea_num_info = + memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); + uint32_t* d_fea_num_info_ptr = + 
reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; + uint32_t* d_idx = (uint32_t*)&d_index[len]; + int* d_merged_size = (int*)&d_idx[len]; + int grid_size = (len - 1) / block_size_ + 1; + fill_idx<<>>(d_idx, len); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, + 0, 8 * sizeof(KeyType), stream)); + + void* d_buff = NULL; + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, + d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, + d_merged_size, len, stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( + d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, + d_fea_num_info_ptr, d_merged_size, len, stream)); + + cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), + cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + + assert(d_merged_size > 0); + uint32_t* d_offset = (uint32_t*)&d_index[len]; + + temp_storage_bytes = 0; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, + stream)); + if (d_temp_storage->size() < temp_storage_bytes) { + d_temp_storage = NULL; + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + } + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( + d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, + uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + grid_size = (uniq_len - 1) / block_size_ + 1; + merge_gradient_kernel<<>>( + d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, + (char*)d_merge_grads_ptr, uniq_len, grad_value_size, merger_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); + + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + timeline.Pause(); + timeline.Start(); +} + +template +void HeterComm::split_input_to_shard( + KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, + int gpu_num) { + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(gpu_num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_num, 0); + + auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); + int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); + + auto d_shard_index = memory::Alloc(place, len * sizeof(int)); + int* d_shard_index_ptr = reinterpret_cast(d_shard_index->ptr()); + + auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); + int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); + + int grid_size = (len - 1) / 
block_size_ + 1; + fill_idx<<>>(d_idx_tmp_ptr, len); + calc_shard_index<<>>( + d_keys, len, d_shard_index_tmp_ptr, total_gpu); + + size_t temp_storage_bytes; + const int num_bits = 1 + log2i(total_gpu); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, + d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, + d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + calc_shard_offset<<>>(d_shard_index_ptr, + left, right, len); + cudaStreamSynchronize(stream); +} + +template +void HeterComm::pull_sparse(int num, + KeyType* d_keys, + ValType* d_vals, + size_t len) { + if (len == 0) { + return; + } + + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(num); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(num, 0); + + int grid_size = (len - 1) / block_size_ + 1; + + auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + int* h_left = reinterpret_cast(h_left_alloc->ptr()); + int* h_right = reinterpret_cast(h_right_alloc->ptr()); + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + size_t val_type_size = 0; + if (!multi_mf_dim_) { + val_type_size = sizeof(ValType); + } else { + val_type_size = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + } + + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, len * val_type_size); + ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + + split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); + + fill_shard_key<<>>(d_shard_keys_ptr, + d_keys, d_idx_ptr, len); + + cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + create_storage(num, i, shard_len * sizeof(KeyType), + shard_len * val_type_size); + } + + walk_to_dest(num, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + std::vector time_lines; + time_lines.resize(total_gpu); + + for (int i = 0; i < total_gpu; ++i) { + time_lines[i].Start(); + if (h_left[i] == -1) { + continue; + } + auto& node = path_[num][i].nodes_.back(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + 
reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); + } else { + ptr_tables_[i]->rwlock_->RDLock(); + ptr_tables_[i]->get(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num)); + } + } + for (int i = 0; i < total_gpu; ++i) { + cudaStreamSynchronize(resource_->remote_stream(i, num)); + if (h_left[i] == -1) { + continue; + } + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } + time_lines[i].Pause(); + } + + if (!multi_mf_dim_) { + walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + } else { + walk_to_src(num, total_gpu, h_left, h_right, reinterpret_cast(d_shard_vals_ptr), val_type_size); + } + + for (int i = 0; i < total_gpu; ++i) { + auto& node = path_[num][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + + if (!multi_mf_dim_) { + fill_dvals<<>>(d_shard_vals_ptr, d_vals, + d_idx_ptr, len); + } else { + dy_mf_fill_dvals<<>>( + d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size); + } + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + destroy_storage(num, i); + } +} + +template +template +void HeterComm::push_sparse(int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; } -} -/* 相同key的梯度做聚合,聚合到一起 - * gpu_num 归属哪个卡 - * d_keys 要处理的key(input/output) - * d_grads 要处理的value(input/output) - * len key的长度 - * uniq_len 去重后key的长度 -*/ -template -void HeterComm::merge_grad(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len) { - platform::Timer timeline; - timeline.Start(); + int total_gpu = resource_->total_gpu(); int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); - size_t temp_storage_bytes; + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); - //VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); + // int h_left[total_gpu]; // NOLINT + // int h_right[total_gpu]; // NOLINT + auto h_left_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + auto h_right_alloc = memory::Alloc(phi::GPUPinnedPlace(), sizeof(int) * total_gpu); + int* h_left = reinterpret_cast(h_left_alloc->ptr()); + int* h_right = reinterpret_cast(h_right_alloc->ptr()); - auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); - //排序后的key - KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); - auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - auto d_fea_num_info = - memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); - uint32_t* d_fea_num_info_ptr = - 
reinterpret_cast(d_fea_num_info->ptr()); - //经过排序后的key的索引下标 - uint32_t* d_index = (uint32_t*)&d_fea_num_info_ptr[len]; - //对应原始key的索引下标 - uint32_t* d_idx = (uint32_t*)&d_index[len]; - int* d_merged_size = (int*)&d_idx[len]; - int grid_size = (len - 1) / block_size_ + 1; - //填充原始key的索引下标 - fill_idx<<>>(d_idx, len); - //对key做排序, d_merge_keys_ptr就是排序后好的key, d_index这个就是key的索引,可以用来反解出来排序后好的key - //对应原始的那个位置的key - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - void* d_buff = NULL; - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - temp_storage_bytes = 0; - //对排序好后的key做一个去重,并统计数量 - //d_keys,去重后的key,d_fea_num_info_ptr对应的key的数量, d_merged_size去重后的key数量 - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); - if (d_temp_storage->size() < temp_storage_bytes) { - d_temp_storage = NULL; - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + GradType* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + + int uniq_len = len; + merge_grad(gpu_num, d_keys, d_grads, NULL, len, uniq_len); + + int grid_size = (uniq_len - 1) / block_size_ + 1; + + split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, + gpu_num); + + if (!multi_mf_dim_) { + fill_shard_grads<<>>( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len); + } else { + dy_mf_fill_shard_grads<<>>( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - assert(d_merged_size > 0); - uint32_t* d_offset = (uint32_t*)&d_index[len]; + cudaMemcpyAsync(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); - //做一个前缀求和 - temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); - if (d_temp_storage->size() < temp_storage_bytes) { - d_temp_storage = NULL; - d_temp_storage = memory::Alloc(place, temp_storage_bytes); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + if (!multi_mf_dim_) { + create_storage(gpu_num, i, shard_len * sizeof(KeyType), + shard_len * sizeof(GradType)); + } else { + create_storage(gpu_num, i, shard_len * sizeof(KeyType), + shard_len * grad_value_size); + } } 
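  // Worked sizing example for the per-destination buffers created above (numbers
  // are illustrative, not taken from the patch): in the dynamic-mf branch each
  // gradient slot occupies
  //   grad_value_size = TYPEALIGN(8, sizeof(FeaturePushValue) + max_mf_dim_ * sizeof(float));
  // if, say, sizeof(FeaturePushValue) were 40 bytes and max_mf_dim_ == 8, that is
  //   TYPEALIGN(8, 40 + 32) = 72 bytes per key,
  // so a shard of 1000 keys reserves 1000 * sizeof(KeyType) bytes of key storage
  // and 1000 * 72 bytes of gradient storage on the destination GPU, whereas the
  // fixed-dim branch only needs sizeof(GradType) per key.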
- PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); - grid_size = (uniq_len - 1) / block_size_ + 1; - merge_gradient_kernel<<>>( - d_offset, d_fea_num_info_ptr, d_index, (GradType*)d_grads, - (GradType*)d_merge_grads_ptr, uniq_len, grad_value_size); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + if (!multi_mf_dim_) { + walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + } else { + walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), grad_value_size); + } - timeline.Pause(); - timeline.Start(); + std::vector time_lines; + time_lines.resize(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - timeline.Pause(); - timeline.Start(); + for (int i = 0; i < total_gpu; ++i) { + time_lines[i].Start(); + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[gpu_num][i].nodes_.back(); + cudaStreamSynchronize(node.in_stream); + + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, gpu_num)); + } else { + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, h_right[i] - h_left[i] + 1, sgd, + resource_->remote_stream(i, gpu_num)); + } + } + for (int i = 0; i < total_gpu; ++i) { + cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); + if (h_left[i] != -1) { + if (!multi_mf_dim_) { + tables_[i]->rwlock_->UNLock(); + } else { + ptr_tables_[i]->rwlock_->UNLock(); + } + time_lines[i].Pause(); + } + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + destroy_storage(gpu_num, i); + } } template -int HeterComm::log2i(int x) { - unsigned res = 0; - while (x >>= 1) { - ++res; +template +void HeterComm::update_one_table( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; } - return res; + + int dev_id = resource_->dev_id(gpu_num); + platform::CUDADeviceGuard guard(dev_id); + tables_[gpu_num]->rwlock_->WRLock(); + tables_[gpu_num]->update(d_keys, d_grads, len, sgd, + resource_->remote_stream(gpu_num, gpu_num)); + tables_[gpu_num]->rwlock_->UNLock(); + cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); +} + +template +template +void HeterComm::push_sparse_multi_node( + int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + Sgd& sgd) { // NOLINT + if (len == 0) { + return; + } + + int uniq_len = len; + merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); + + uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); + + uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, uniq_len); + + update_one_table(gpu_num, storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, uniq_len, sgd); } template @@ -910,8 +1186,7 @@ int HeterComm::gather_one_node_grad( fill_shard_grads<<>>( storage.local_keys + merge_num, storage.all_keys + index, 
storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, - g_transfor->get_gpu_push_value_size(max_mf_dim_)); + d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -981,76 +1256,35 @@ int HeterComm::gather_multi_node_grad( } template -template -void HeterComm::update_one_table( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, - Sgd& sgd) { // NOLINT - if (len == 0) { - return; - } - - int dev_id = resource_->dev_id(gpu_num); - platform::CUDADeviceGuard guard(dev_id); - ptr_tables_[gpu_num]->rwlock_->WRLock(); - ptr_tables_[gpu_num]->update(d_keys, d_grads, len, sgd, - resource_->remote_stream(gpu_num, gpu_num)); - ptr_tables_[gpu_num]->rwlock_->UNLock(); - cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); -} - -template -void HeterComm::init_path() { +void HeterComm::end_pass() { int total_gpu = resource_->total_gpu(); - path_.resize(total_gpu); + std::vector threads; - if (!topo_aware_) { - VLOG(0) << "init path without topo aware"; + auto dump_to_cpu_func = [this](int index) { + auto stream = resource_->local_stream(index, 0); + int dev_id = resource_->dev_id(index); + platform::CUDADeviceGuard guard(dev_id); + tables_[index]->dump_to_cpu(dev_id, stream); + }; + + if (!multi_mf_dim_) { for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { - auto& nodes = path_[i][j].nodes_; - nodes.resize(1); - nodes[0].in_stream = resource_->comm_stream(i, j); - nodes[0].out_stream = resource_->comm_stream(i, j); - nodes[0].key_storage = NULL; - nodes[0].val_storage = NULL; - nodes[0].sync = 0; - nodes[0].gpu_num = j; - } + threads.push_back(std::thread(dump_to_cpu_func, i)); } - } else { - VLOG(0) << "init path with topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { - auto& nodes = path_[i][j].nodes_; - int from = resource_->dev_id(i); - int to = resource_->dev_id(j); - int transfer_id = i; - if (need_transfer(from, to)) { - transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); - nodes.push_back(Node()); - Node& node = nodes.back(); - node.in_stream = resource_->comm_stream(i, transfer_id); - node.out_stream = resource_->comm_stream(transfer_id, i); - node.key_storage = NULL; - node.val_storage = NULL; - node.sync = 1; - node.gpu_num = transfer_id; - } - nodes.push_back(Node()); - Node& node = nodes.back(); - node.in_stream = resource_->comm_stream(i, transfer_id); - node.out_stream = resource_->comm_stream(transfer_id, i); - node.key_storage = NULL; - node.val_storage = NULL; - node.sync = 0; - node.gpu_num = j; - } + for (auto& t : threads) { + t.join(); } } } +// template +// void HeterComm::dump_to_cpu(int index) { +// auto stream = resource_->local_stream(index, 0); +// int dev_id = resource_->dev_id(index); +// platform::CUDADeviceGuard guard(dev_id); +// tables_[index]->dump_to_cpu(dev_id, stream); +//} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 2251d28be7ed82..2fa4c08d20d0b2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -14,9 +14,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/gpu_value_inl.h" -#include "paddle/fluid/framework/fleet/heter_ps/dy_gpu_value_inl.h" #ifdef PADDLE_WITH_HETERPS @@ -24,61 +21,57 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource, - std::string accessor_type, std::string gpu_value_type) { - if (accessor_type == "DownpourCtrDymfAccessor" && gpu_value_type == "DyFeatureValue") { - return new HeterPs(capacity, resource); - } else if (accessor_type == "DownpourCtrAccessor" && gpu_value_type == "FeatureValue") { - return new HeterPs(capacity, resource); - } - return nullptr; + size_t capacity, std::shared_ptr resource) { + return new HeterPs(capacity, resource); } -template -HeterPs::HeterPs() { +HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { + comm_ = + std::make_shared>( + capacity, resource); + opt_ = Optimizer(); } -template -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = std::make_shared>(capacity, resource); - opt_ = Optimizer(); +HeterPs::~HeterPs() {} + +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, + size_t len) { + comm_->pull_sparse(num, d_keys, d_vals, len); } -template -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, void* d_vals, - size_t len) { - comm_->pull_sparse(num, d_keys, (ValType*)d_vals, len); +void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) { + comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); } -template -void HeterPs::build_ps(int num, KeyType* h_keys, char* pool, - size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) { +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, + size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) { comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -template -void HeterPs::set_nccl_comm_and_size( - const std::vector& inner_comms, - const std::vector& inter_comms, int comm_size) { - comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); +int HeterPs::get_index_by_devid(int devid) { + return comm_->get_index_by_devid(devid); } -template -void HeterPs::set_multi_mf_dim(int max_mf_dim) { - comm_->set_multi_mf_dim(max_mf_dim); -} +void HeterPs::end_pass() { comm_->end_pass(); } -template -int HeterPs::get_index_by_devid(int devid) { - return comm_->get_index_by_devid(devid); -} +void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -template -void HeterPs::push_sparse(int num, FeatureKey* d_keys, void* d_grads, size_t len) { - comm_->push_sparse(num, d_keys, (GradType*)d_grads, len, opt_); +void HeterPs::push_sparse(int num, FeatureKey* d_keys, + FeaturePushValue* d_grads, size_t len) { + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } +void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { + comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); +} +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { + comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); +} } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 0a41d2ee2ea51e..217f55e90bcae5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -23,38 +23,33 @@ limitations under the License. */ namespace paddle { namespace framework { -template class HeterPs : public HeterPsBase { -public: - HeterPs(); + public: + HeterPs() {} HeterPs(size_t capacity, std::shared_ptr resource); - virtual ~HeterPs() {} + virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; - virtual void pull_sparse(int num, FeatureKey* d_keys, void* d_vals, + virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, size_t len) override; - - virtual void build_ps(int num, KeyType* h_keys, char* pool, + virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) override; + virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) override; - virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) override; - - virtual void set_multi_mf_dim(int max_mf_dim) override; - + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; + virtual void end_pass() override; virtual int get_index_by_devid(int devid) override; - - virtual void show_one_table(int gpu_num) { - comm_->show_one_table(gpu_num); - }; - + virtual void show_one_table(int gpu_num) override; virtual void push_sparse(int num, FeatureKey* d_keys, - void* d_grads, size_t len) override; -private: - std::shared_ptr> comm_; - Optimizer opt_; + FeaturePushValue* d_grads, size_t len) override; + + private: + std::shared_ptr> comm_; + Optimizer opt_; }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 0024e2017a2dec..713c1123969624 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -30,22 +30,24 @@ class HeterPsBase { virtual ~HeterPsBase(){}; HeterPsBase(const HeterPsBase&) = delete; HeterPsBase& operator=(const HeterPsBase&) = delete; - virtual void pull_sparse(int num, FeatureKey* d_keys, void* d_vals, + + virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals, size_t len) = 0; + virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, + size_t len, size_t chunk_size, int stream_num) = 0; virtual void build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, size_t feature_value_size, size_t chunk_size, int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; - virtual void set_multi_mf_dim(int max_mf_dim) = 0; + virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; + virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, - void* d_grads, size_t len) = 0; + FeaturePushValue* d_grads, size_t len) = 0; static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource, - std::string accessor_type, - std::string gpu_value_type); + std::shared_ptr resource); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h 
b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 5ca8a6abc42fd7..9189902c28ffb4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -29,11 +29,11 @@ class MemoryPool { : capacity_(capacity), block_size_(block_size) { VLOG(3) << "mem_pool init with block_size: " << block_size << " capacity: " << capacity; - mem_alloc_ = memory::Alloc(phi::GPUPinnedPlace(), block_size * capacity_); - mem_ = reinterpret_cast(mem_alloc_->ptr()); + mem_ = (char*)malloc(block_size * capacity_); } ~MemoryPool() { VLOG(3) << "mem pool delete"; + free(mem_); } size_t block_size() { return block_size_; } char* mem() { return mem_; } @@ -45,7 +45,6 @@ class MemoryPool { } private: - memory::allocation::AllocationPtr mem_alloc_; char* mem_ = NULL; size_t capacity_; size_t block_size_; @@ -61,13 +60,8 @@ class HBMMemoryPool : public managed { VLOG(3) << "hbm memory pool with capacity" << capacity_ << " bs: " << block_size_; cudaMalloc(&mem_, block_size_ * capacity_); - gpuStream_t streams; - cudaStreamCreate(&streams); - cudaMemcpyAsync(mem_, mem_pool->mem(), mem_pool->byte_size(), - cudaMemcpyHostToDevice, streams); - cudaStreamSynchronize(streams); - cudaStreamDestroy(streams); - + cudaMemcpy(mem_, mem_pool->mem(), mem_pool->byte_size(), + cudaMemcpyHostToDevice); } ~HBMMemoryPool() { @@ -77,6 +71,30 @@ class HBMMemoryPool : public managed { size_t block_size() { return block_size_; } + void clear(void) { cudaMemset(mem_, 0, block_size_ * capacity_); } + + void reset(size_t capacity) { + cudaFree(mem_); + mem_ = NULL; + capacity_ = capacity; + cudaMalloc(&mem_, (block_size_ * capacity / 8 + 1) * 8); + cudaMemset(mem_, 0, block_size_ * capacity); + } + + friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { + for (size_t k = 0; k < 5; k++) { + auto x = (FeatureValue*)(p.mem() + k * p.capacity()); + out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot + << " lr: " << x->lr << " mf_dim: " << x->mf_size + << " mf_size: " << x->mf_size << " mf:"; + for (int i = 0; i < x->mf_size + 1; ++i) { + out << " " << x->mf[i]; + } + out << "\n"; + } + return out; + } + char* mem() { return mem_; } size_t capacity() { return capacity_; } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 28a49e2d4c3f7b..a0ad727b6386fd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -27,10 +27,163 @@ template class Optimizer { public: Optimizer() {} + ~Optimizer() {} + void initialize() {} - __device__ void update_value(ValType* ptr, const GradType& grad, curandState& state) { + + __device__ void update_lr(float& w, float& g2sum, float g, float scale) { + double add_g2sum = 0; + double ratio = optimizer_config::learning_rate * + sqrt(optimizer_config::initial_g2sum / + (optimizer_config::initial_g2sum + g2sum)); + double scaled_grad = g / scale; + + w += scaled_grad * ratio; + + if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; + if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + + add_g2sum += scaled_grad * scaled_grad; + + g2sum += add_g2sum; + } + + __device__ void update_mf(int n, float* w, float& g2sum, const float* g, + float scale) { + double add_g2sum = 0; + double ratio = optimizer_config::mf_learning_rate * + sqrt(optimizer_config::mf_initial_g2sum / + (optimizer_config::mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + 
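        // Note: this is the same decayed-AdaGrad rule as update_lr above, applied
        // once per embedding dimension. The effective step is
        //   ratio = mf_learning_rate * sqrt(mf_initial_g2sum / (mf_initial_g2sum + g2sum)),
        // so it shrinks as the accumulated squared gradient g2sum grows; each weight
        // moves by (g[i] / scale) * ratio and is clamped to [mf_min_bound, mf_max_bound].
        // After the loop, g2sum grows by the mean of the squared, scale-normalized
        // gradients (add_g2sum / n).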
double scaled_grad = g[i] / scale; + + w[i] += scaled_grad * ratio; + + if (w[i] < optimizer_config::mf_min_bound) + w[i] = optimizer_config::mf_min_bound; + if (w[i] > optimizer_config::mf_max_bound) + w[i] = optimizer_config::mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / n; + } + __device__ void update_value(ValType& val, const GradType& grad) { + val.slot = grad.slot; + val.show += grad.show; + val.clk += grad.clk; + val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + + if (val.mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (val.show - val.clk) + + optimizer_config::clk_coeff * val.clk) { + val.mf_size = MF_DIM + 1; + val.mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < MF_DIM; ++i) { + val.mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + } } + + __device__ void update_value(ValType& val, const GradType& grad, curandState& state) { + val.slot = grad.slot; + val.show += grad.show; + val.clk += grad.clk; + val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + + if (val.mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (val.show - val.clk) + + optimizer_config::clk_coeff * val.clk) { + val.mf_size = MF_DIM + 1; + val.mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = 0; i < MF_DIM; ++i) { + val.mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + } + } + + __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + ptr->slot = grad.slot; + ptr->show += grad.show; + ptr->clk += grad.clk; + ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + // ptr->mf_dim = grad.mf_dim; + + if (ptr->mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config::clk_coeff * ptr->clk) { + ptr->mf_size = ptr->mf_dim + 1; + ptr->mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < ptr->mf_dim; ++i) { + ptr->mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + grad.show); // for local test + } + } + + __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad, curandState& state) { + ptr->slot = grad.slot; + ptr->show += grad.show; + ptr->clk += grad.clk; + ptr->delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; + + update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + // ptr->mf_dim = grad.mf_dim; + + if (ptr->mf_size == 0) { + if (optimizer_config::mf_create_thresholds <= + optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config::clk_coeff * ptr->clk) 
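      // Lazy creation of the factor part: once
      // nonclk_coeff * (show - clk) + clk_coeff * clk reaches mf_create_thresholds,
      // mf_size becomes mf_dim + 1, mf[0] starts the factors' g2sum at 0, and
      // mf[1..mf_dim] are drawn with curand_uniform scaled by mf_initial_range,
      // reusing the caller-provided curand state rather than seeding a new one.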
{ + ptr->mf_size = ptr->mf_dim + 1; + ptr->mf[0] = 0; + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + //curandState state; + //curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < ptr->mf_dim; ++i) { + ptr->mf[i + 1] = + (curand_uniform(&state)) * optimizer_config::mf_initial_range; + } + } + } else { + update_mf(ptr->mf_dim, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + grad.show); // for local test + } + } + + }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu index 212f4d4291c953..3a6ed50ad8e702 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_comm.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_comm.cu @@ -23,7 +23,6 @@ limitations under the License. */ using namespace paddle::framework; TEST(TEST_FLEET, heter_comm) { - /* int gpu_count = 3; std::vector dev_ids; dev_ids.push_back(0); @@ -34,14 +33,14 @@ TEST(TEST_FLEET, heter_comm) { resource->enable_p2p(); std::vector count; std::vector> keys; - std::vector> vals; + std::vector> vals; count.resize(dev_ids.size(), 0); keys.resize(dev_ids.size()); vals.resize(dev_ids.size()); for (int i = 0; i < 10; i++) { FeatureKey key; - LxchDyFeatureValue val; + FeatureValue val; int gpu_num = i % gpu_count; key = i; val.lr = i; @@ -57,7 +56,7 @@ TEST(TEST_FLEET, heter_comm) { } auto heter_comm = - std::make_shared>( + std::make_shared>( size, resource); for (int i = 0; i < gpu_count; ++i) { std::cout << "building table: " << i << std::endl; @@ -68,9 +67,9 @@ TEST(TEST_FLEET, heter_comm) { std::cout << "testing pull sparse:" << std::endl; paddle::platform::CUDADeviceGuard guard(0); FeatureKey* pull_keys; - LxchDyFeatureValue* pull_vals; + FeatureValue* pull_vals; cudaMallocManaged(&pull_keys, 5 * sizeof(FeatureKey)); - cudaMallocManaged(&pull_vals, 5 * sizeof(LxchDyFeatureValue)); + cudaMallocManaged(&pull_vals, 5 * sizeof(FeatureValue)); pull_keys[0] = 2; pull_keys[1] = 3; @@ -86,11 +85,11 @@ TEST(TEST_FLEET, heter_comm) { cudaFree(pull_vals); std::cout << "testing push sparse:" << std::endl; - Optimizer opt; + Optimizer opt; FeatureKey* push_keys; - LxchDyFeaturePushValue* push_vals; + FeaturePushValue* push_vals; cudaMallocManaged(&push_keys, 5 * sizeof(FeatureKey)); - cudaMallocManaged(&push_vals, 5 * sizeof(LxchDyFeaturePushValue)); + cudaMallocManaged(&push_vals, 5 * sizeof(FeaturePushValue)); push_keys[0] = 2; push_keys[1] = 3; push_keys[2] = 9; @@ -110,5 +109,4 @@ TEST(TEST_FLEET, heter_comm) { cudaFree(push_keys); cudaFree(push_vals); - */ } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index aaa61fbbfd0fc0..41b11e8ed90ed2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -106,135 +106,240 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - - //step1 将读取到的ins,key先做一个粗去重(主要是多线程粗去重) - thread_keys_.resize(thread_keys_thread_num_); - for (auto& iter : thread_keys_) { - iter.resize(thread_keys_thread_num_); - for (auto& iter1 : iter) { - iter1.resize(multi_mf_dim_); - for (auto& iter2 : iter1) { - iter2.clear(); - } - } - } - dataset_mutex_.lock(); - Dataset* cur_dataset = dataset_pipe_.front(); - dataset_pipe_.pop(); - dataset_mutex_.unlock(); - - std::function first_uniq_func; - void* record_vec = nullptr; + if (!multi_mf_dim_) { + 
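    // Two init paths for the HeterContext: without multi_mf_dim_ only the
    // shard x device layout is needed (init(shard_num, device_num)); when
    // per-slot embedding dims are configured, init(shard_num, device_num, dim_num)
    // also sizes the *_dim_* containers so every shard and device fans out one
    // bucket per mf dimension.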
gpu_task->init(thread_keys_shard_num_, device_num); + } else { + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); + } + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + std::vector threads; + + // data should be in input channel + if (!multi_mf_dim_) { + thread_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_keys_[i].resize(thread_keys_shard_num_); + } + } else { + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); + } + } + } + size_t total_len = 0; - std::string data_set_name = std::string(typeid(*cur_dataset).name()); + size_t len_per_thread = 0; + int remain = 0; + size_t begin = 0; + + std::string data_set_name = std::string(typeid(*dataset_).name()); + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - SlotRecordDataset* dataset = dynamic_cast(cur_dataset); + SlotRecordDataset* dataset = dynamic_cast(dataset_); auto input_channel = dataset->GetInputChannel(); - const std::deque* vec_data = &(input_channel->GetData()); - record_vec = (void*)(vec_data); - total_len = vec_data->size(); - - first_uniq_func = [this](void* ptr, int begin_index, int end_index, int i) -> void { - const std::deque& total_data = *((const std::deque*)ptr); - for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { + VLOG(3) << "buildtask::inputslotchannle size: " << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / (thread_keys_thread_num_); + remain = total_len % (thread_keys_thread_num_); + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } + } + }; + auto gen_dynamic_mf_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { const auto& ins = *iter; const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; - for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); slot_idx++) { - for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); + slot_idx++) { + for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { int shard_id = feasign_v[j] % thread_keys_shard_num_; int dim_id = slot_index_vec_[slot_idx]; if (feasign_v[j] != 0) { - this->thread_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); } } } } + /* + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = 
ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + if (slot_idx >= slot_index_vec_.size()) { + VLOG(0) << "WRONG::slot_idx: " << slot_idx << " slot_index_vec_size: " << + slot_index_vec_.size(); + } + int dim_id = slot_index_vec_[slot_idx]; + if (feasign_v[j] != 0) { + this->thread_dim_keys_[i][shard_id][dim_id].insert(feasign_v[j]); + } + } + } + */ }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + if (!multi_mf_dim_) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } else { + VLOG(3) << "psgpu wrapper genfunc with dynamic mf"; + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + } + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); - MultiSlotDataset* dataset = dynamic_cast(cur_dataset); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); auto input_channel = dataset->GetInputChannel(); - const std::deque* vec_data = &(input_channel->GetData()); - record_vec = (void*)(vec_data); - total_len = vec_data->size(); - first_uniq_func = [this](void* ptr, int begin_index, int end_index, int i) -> void { - const std::deque& total_data = *((const std::deque*)ptr); - for (auto iter = total_data.begin() + begin_index; iter != total_data.begin() + end_index; iter++) { + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { const auto& ins = *iter; const auto& feasign_v = ins.uint64_feasigns_; - //暂不支持多维度,打平老逻辑(有需要的时候加上) for (const auto feasign : feasign_v) { uint64_t cur_key = feasign.sign().uint64_feasign_; int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id][0].insert(cur_key); + this->thread_keys_[i][shard_id].insert(cur_key); } } }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - std::vector threads; - size_t len_per_thread = total_len / thread_keys_thread_num_; - size_t remain = total_len % thread_keys_thread_num_; - size_t begin = 0; - for (size_t i = 0; i < (size_t)thread_keys_thread_num_; i++) { - threads.push_back( - std::thread(first_uniq_func, record_vec, begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (auto& t : threads) { - t.join(); - } - threads.clear(); - timeline.Pause(); - auto step_1 = timeline.ElapsedSec(); - //step2 insert into together timeline.Start(); - auto merge_ins_func = [this, gpu_task](int shard_num, int dim_id) -> void { + + threads.clear(); + // merge thread_keys to shard_keys + auto merge_ins_func = [this, gpu_task](int shard_num) { for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, dim_id, thread_keys_[i][shard_num][dim_id]); - thread_keys_[i][shard_num][dim_id].clear(); + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } }; + auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, dim_id, + thread_dim_keys_[i][shard_num][dim_id]); + thread_dim_keys_[i][shard_num][dim_id].clear(); + } + }; + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} for (int i = 0; i < thread_keys_shard_num_; ++i) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_func, i, j)); + if (!multi_mf_dim_) { + threads.push_back(std::thread(merge_ins_func, i)); + } else { + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); + } } } for (auto& t : threads) { t.join(); } - threads.clear(); timeline.Pause(); - auto step_2 = timeline.ElapsedSec(); - //step3 精细化去重 + VLOG(0) << "GpuPs task add keys cost " << timeline.ElapsedSec() + << " seconds."; timeline.Start(); - gpu_task->unique_keys(); + gpu_task->UniqueKeys(); timeline.Pause(); - auto step_3 = timeline.ElapsedSec(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " PreBuildTask cost-detail " - << "rough-dedup: " << step_1 - << "s add-batch: " << step_2 - << "s unique_keys:" << step_3 << "s"; + + VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; + + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + VLOG(0) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); + local_ptr[i].resize(local_keys[i].size()); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] + << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); + gpu_task->value_dim_ptr_[i][j].resize( + gpu_task->feature_dim_keys_[i][j].size()); + } + } + } } void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { - -#if (defined PADDLE_WITH_PSLIB) || (defined PADDLE_WITH_PSCORE) platform::Timer timeline; - timeline.Start(); + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; + + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + std::vector threads(thread_keys_shard_num_); #ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); -#else +#endif +#ifdef PADDLE_WITH_PSCORE auto fleet_ptr = paddle::distributed::FleetWrapper::GetInstance(); #endif 
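For reference, a minimal standalone sketch of the two-stage key dedup that PreBuildTask performs above: each reader thread first buckets its feasigns into thread-local per-shard sets (shard id = key % shard_num), and only afterwards are the per-thread sets of one shard merged, so neither stage needs locking. The thread and shard counts, the sample feasigns, and the plain std::set container are illustrative stand-ins, not the wrapper's actual members.

    // Sketch: two-stage dedup of feasign keys, sharded by key % shard_num.
    #include <cstddef>
    #include <cstdint>
    #include <set>
    #include <thread>
    #include <vector>

    int main() {
      const int thread_num = 4, shard_num = 8;
      std::vector<uint64_t> feasigns = {12, 99, 12, 40, 7, 99, 1024, 7};

      // Stage 1: every thread dedups its slice into per-shard sets, lock-free.
      std::vector<std::vector<std::set<uint64_t>>> thread_keys(
          thread_num, std::vector<std::set<uint64_t>>(shard_num));
      std::vector<std::thread> workers;
      size_t per = feasigns.size() / thread_num;
      for (int t = 0; t < thread_num; ++t) {
        workers.emplace_back([&, t]() {
          size_t begin = t * per;
          size_t end = (t + 1 == thread_num) ? feasigns.size() : begin + per;
          for (size_t i = begin; i < end; ++i) {
            thread_keys[t][feasigns[i] % shard_num].insert(feasigns[i]);
          }
        });
      }
      for (auto& w : workers) w.join();

      // Stage 2: merge the per-thread sets of each shard; the set dedups across threads.
      std::vector<std::set<uint64_t>> shard_keys(shard_num);
      for (int s = 0; s < shard_num; ++s)
        for (int t = 0; t < thread_num; ++t)
          shard_keys[s].insert(thread_keys[t][s].begin(), thread_keys[t][s].end());
      return 0;
    }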
#if (defined PADDLE_WITH_PSLIB) && (defined PADDLE_WITH_HETERPS) - //设置日期,ps内部pull的时候需要根据day_id做decay + // get day_id: day nums from 1970 struct std::tm b; b.tm_year = year_ - 1900; b.tm_mon = month_ - 1; @@ -245,58 +350,163 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { fleet_ptr->pslib_ptr_->_worker_ptr->set_day_id(table_id_, day_id); #endif - fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(0); - //获取sparse-value的指针 - auto& pull_keys = gpu_task->feature_keys_; - auto& pull_value = gpu_task->value_ptr_; - auto pull_value_func = [this, &pull_keys, &pull_value, &fleet_ptr, &gpu_task](int i, int j) -> void { - size_t key_size = pull_keys[i][j].size(); - pull_value[i][j].resize(key_size); + timeline.Start(); + auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { + size_t key_size = local_keys[i].size(); + int32_t status = -1; +#ifdef PADDLE_WITH_PSLIB + // auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + // reinterpret_cast(local_ptr[i].data()), this->table_id_, + // local_keys[i].data(), key_size); int32_t cnt = 0; while (true) { -#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(pull_value[i][j].data()), - this->table_id_, pull_keys[i][j].data(), key_size, gpu_task->pass_id_); -#else - auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(pull_value[i][j].data()), this->table_id_, - pull_keys[i][j].data(), key_size); + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } #endif +#ifdef PADDLE_WITH_PSCORE + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_ptr[i].data()), this->table_id_, + local_keys[i].data(), key_size); + bool flag = true; + tt.wait(); - int32_t status = -1; + try { status = tt.get(); } catch (const std::future_error& e) { VLOG(0) << "Caught a future_error with code" << e.code() << ", Message:" << e.what(); } - if (status == 0) { + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { break; - } else { + } + } +#endif + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " + << local_keys[i].size(); + } + }; + + auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, + &fleet_ptr](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + size_t key_size = local_dim_keys[i][j].size(); + int32_t status = -1; + int32_t cnt = 0; + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + i, reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << 
"Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; sleep(sleep_seconds_before_fail_exit_); + flag = false; cnt++; } if (cnt > 3) { VLOG(0) << "fleet pull sparse failed, retry 3 times"; exit(-1); } + + if (flag) { + break; + } } + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(3) << "FleetWrapper Pull sparse to local done with table size: " + << local_dim_keys[i][j].size(); + } +#endif }; - std::vector> task_futures; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(pull_thread_pool_[i]->enqueue(pull_value_func, i, j)); + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(ptl_func, i); + } + } else { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + + std::vector> task_futures; + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + // threads[i * multi_mf_dim_ + j] = std::thread(ptl_dynamic_mf_func, i, + // j); + task_futures.emplace_back( + pull_thread_pool_[i]->enqueue(ptl_dynamic_mf_func, i, j)); + } + } + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); } - for (auto& f : task_futures) { - f.wait(); + if (!multi_mf_dim_) { + for (std::thread& t : threads) { + t.join(); + } } + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timeline.Pause(); - fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(0); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build-pull-detail cost: " << timeline.ElapsedSec() << "s"; -#endif + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); if (!gloo_wrapper->IsInitialized()) { @@ -307,223 +517,499 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } -//里面用到了一些静态变量,同一时刻只允许有一个线程运行这个函数 -void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { -#define FOUR_VECTOR_INIT(vec, first_size, second_size, thrid_size) \ - vec.resize(first_size); \ - for (auto& iter1 : vec) { \ - iter1.resize(second_size); \ - for (auto& iter2 : iter1) { \ - iter2.resize(thrid_size); \ - for (auto& iter3 : iter2) { \ - iter3.clear(); \ - } \ - } \ - } -#define THIRD_VECTOR_INIT(vec, first_size, second_size, thrid_size) \ - vec.resize(first_size); \ - for (auto& iter1 : vec) { \ - iter1.resize(second_size); \ - for (auto& iter2 : iter1) { \ - iter2.resize(thrid_size); \ - } \ - } - +void PSGPUWrapper::PrepareGPUTask(std::shared_ptr gpu_task) { platform::Timer timeline; - timeline.Start(); - std::vector> task_futures; int device_num = heter_devices_.size(); - //step1 input: 分片->维度->key; => output: 分片->设备->维度->key - auto& pull_keys = gpu_task->feature_keys_; - auto& pull_values = gpu_task->value_ptr_; - static std::vector>>> s_first_keys; + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + auto& local_dim_keys = gpu_task->feature_dim_keys_; + auto& local_dim_ptr = gpu_task->value_dim_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + auto& device_dim_keys = gpu_task->device_dim_keys_; + auto& device_dim_ptr = gpu_task->device_dim_ptr_; + auto& device_dim_mutex = 
gpu_task->dim_mutex_; + std::vector> task_futures; + std::vector threads(thread_keys_shard_num_); + + if (multi_mf_dim_) { + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); + } + } + timeline.Start(); + std::vector>> pass_values; + + bool record_status = false; #ifdef PADDLE_WITH_PSLIB - static std::vector>>> s_first_value; -#elif PADDLE_WITH_PSCORE - static std::vector>>> s_first_value; + auto fleet_ptr = FleetWrapper::GetInstance(); + uint16_t pass_id = 0; + if (multi_node_) { + record_status = fleet_ptr->pslib_ptr_->_worker_ptr->take_sparse_record( + table_id_, pass_id, pass_values); + } #endif - FOUR_VECTOR_INIT(s_first_keys, thread_keys_shard_num_, device_num, multi_mf_dim_) - FOUR_VECTOR_INIT(s_first_value, thread_keys_shard_num_, device_num, multi_mf_dim_) - auto& l_first_keys = s_first_keys; - auto& l_first_value = s_first_value; - auto func_first = [this, &pull_keys, &pull_values, &l_first_keys, &l_first_value, &device_num](int shard_id, int dim_id) -> void { - auto& l_keys = pull_keys[shard_id][dim_id]; - auto& l_values = pull_values[shard_id][dim_id]; - for (size_t i = 0; i < l_keys.size(); i++) { - int dev_id = l_keys[i] % device_num; - l_first_keys[shard_id][dev_id][dim_id].push_back(l_keys[i]); - l_first_value[shard_id][dev_id][dim_id].push_back(l_values[i]); + auto& device_task_keys = gpu_task->device_task_keys_; + auto& device_task_ptrs = gpu_task->device_task_ptr_; + auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, + &local_dim_ptr, &device_dim_keys, + &device_dim_ptr, + &device_dim_mutex](int i, int j) { +#ifdef PADDLE_WITH_PSLIB + std::vector> task_keys(device_num); + std::vector> task_ptrs( + device_num); + for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { + int shard = local_dim_keys[i][j][k] % device_num; + task_keys[shard].push_back(local_dim_keys[i][j][k]); + task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } - }; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_first, i, j)); + // allocate local keys to devices + for (int dev = 0; dev < device_num; dev++) { + device_dim_mutex[dev][j]->lock(); + + int len = task_keys[dev].size(); + int cur = device_dim_keys[dev][j].size(); + device_dim_keys[dev][j].resize(device_dim_keys[dev][j].size() + len); + device_dim_ptr[dev][j].resize(device_dim_ptr[dev][j].size() + len); + for (int k = 0; k < len; ++k) { + device_dim_keys[dev][j][cur + k] = task_keys[dev][k]; + device_dim_ptr[dev][j][cur + k] = task_ptrs[dev][k]; + } + device_dim_mutex[dev][j]->unlock(); } - } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); - timeline.Pause(); - auto step_1 = timeline.ElapsedSec(); - timeline.Start(); +// for (int dev = 0; dev < device_num; dev++) { +// for (int dim = 0; dim < multi_mf_dim_; dim++) { +// device_dim_mutex[dev][dim]->lock(); + +// int len = task_keys[dev].size(); +// int cur = device_dim_keys[dev][dim].size(); +// device_dim_keys[dev][dim].resize(device_dim_keys[dev][dim].size() + +// len); +// device_dim_ptr[dev][dim].resize(device_dim_ptr[dev][dim].size() + len); +// for (int k = 0; k < len; ++k) { +// device_dim_keys[dev][dim][cur + k] = task_keys[dev][k]; +// device_dim_ptr[dev][dim][cur + k] = task_ptrs[dev][k]; +// } +// device_dim_mutex[dev][dim]->unlock(); +// } +// } +#endif + }; + auto build_func = [device_num, record_status, &pass_values, 
&local_keys, + &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto& task_keys = device_task_keys[i]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[i]; +#endif - //step2 通过前缀计算,获得结果大小 prefix_sum: 设备->维度->分片数量 - // l_second_key/s_second_value: 设备->维度->具体的值 - static std::vector>> prefix_sum; +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[i]; +#endif + + for (size_t j = 0; j < local_keys[i].size(); j++) { + int shard = local_keys[i][j] % device_num; + task_keys[shard].push_back(local_keys[i][j]); + task_ptrs[shard].push_back(local_ptr[i][j]); + } #ifdef PADDLE_WITH_PSLIB - static std::vector>> s_second_value; -#elif PADDLE_WITH_PSCORE - static std::vector>> s_second_value; + if (record_status) { + size_t local_keys_size = local_keys.size(); + size_t pass_values_size = pass_values.size(); + for (size_t j = 0; j < pass_values_size; j += local_keys_size) { + auto& shard_values = pass_values[j]; + for (size_t pair_idx = 0; pair_idx < pass_values[j].size(); + pair_idx++) { + auto& cur_pair = shard_values[pair_idx]; + int shard = cur_pair.first % device_num; + task_keys[shard].push_back(cur_pair.first); + task_ptrs[shard].push_back( + (paddle::ps::DownpourFixedFeatureValue*)cur_pair.second); + } + } + } #endif - THIRD_VECTOR_INIT(prefix_sum, device_num, multi_mf_dim_, thread_keys_shard_num_ + 1) - THIRD_VECTOR_INIT(s_second_value, device_num, multi_mf_dim_, 0) - auto& l_second_key = gpu_task->device_keys_; - auto& l_second_value = s_second_value; - auto& l_prefix_sum = prefix_sum; - auto func_second = [this, &l_prefix_sum, &l_first_keys, &l_second_key, &l_second_value] - (int device_id, int dim_id) -> void { - l_prefix_sum[device_id][dim_id][0] = 0; - for (int j = 0; j < this->thread_keys_shard_num_; j++) { - l_prefix_sum[device_id][dim_id][j+1] = l_prefix_sum[device_id][dim_id][j] + l_first_keys[j][device_id][dim_id].size(); - } - l_second_key[device_id][dim_id].resize(l_prefix_sum[device_id][dim_id][this->thread_keys_shard_num_]); - l_second_value[device_id][dim_id].resize(l_prefix_sum[device_id][dim_id][this->thread_keys_shard_num_]); }; - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_second, i, j)); + if (!multi_mf_dim_) { + for (int i = 0; i < thread_keys_shard_num_; i++) { + task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(build_func, i)); + } + for (auto& f : task_futures) { + f.wait(); } + task_futures.clear(); + VLOG(0) << "GpuPs build hbmps done"; } - for (auto& f : task_futures) { - f.wait(); + std::vector> prefix_sum; + prefix_sum.resize(device_num); + for (int i = 0; i < device_num; i++) { + prefix_sum[i].resize(thread_keys_shard_num_ + 1); + prefix_sum[i][0] = 0; } - task_futures.clear(); - timeline.Pause(); - auto step_2 = timeline.ElapsedSec(); - timeline.Start(); + auto calc_prefix_func = [this, &prefix_sum, &device_keys, &device_vals, + &device_task_keys](int device_num) { + for (int j = 0; j < thread_keys_shard_num_; j++) { + prefix_sum[device_num][j + 1] = + prefix_sum[device_num][j] + device_task_keys[j][device_num].size(); + } + device_keys[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + device_vals[device_num].resize( + prefix_sum[device_num][thread_keys_shard_num_]); + }; + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(calc_prefix_func, i)); + } + for (auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); + } + 
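calc_prefix_func above is an exclusive prefix sum over the per-shard bucket sizes of one device: it fixes, for every shard, the offset at which that shard's keys land in device_keys / device_vals, so the later per-shard copy threads write disjoint ranges with no locking. A small self-contained sketch with made-up bucket sizes:

    // Sketch: exclusive prefix sum laying out per-shard buckets in one flat array.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
      // bucket_size[j] = how many keys shard j routed to this device.
      std::vector<size_t> bucket_size = {3, 0, 5, 2};
      std::vector<size_t> prefix(bucket_size.size() + 1, 0);
      for (size_t j = 0; j < bucket_size.size(); ++j)
        prefix[j + 1] = prefix[j] + bucket_size[j];          // {0, 3, 3, 8, 10}

      std::vector<uint64_t> device_keys(prefix.back());      // one contiguous array
      // Shard j writes its keys into [prefix[j], prefix[j+1]); the ranges are
      // disjoint, so the per-shard workers never contend.
      return 0;
    }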
VLOG(0) << "prefix done"; + auto prepare_dev_value_func = [device_num, &prefix_sum, &device_keys, + &device_vals, &device_task_keys, + &device_task_ptrs](int dev, int shard_id) { + auto& task_keys = device_task_keys[shard_id]; +#ifdef PADDLE_WITH_PSLIB + auto& task_ptrs = device_task_ptrs[shard_id]; +#endif + +#ifdef PADDLE_WITH_PSCORE + auto& task_ptrs = device_task_ptrs[dev]; +#endif - //step3, 具体的key/value => 转入到l_second_key/l_second_value - // 这就是要最终转入到gpu中key-value分设备后的cpu数据了 - auto func_third = [this, &l_second_key, &l_second_value, &l_first_keys, &l_first_value, &l_prefix_sum] - (int shard_id, int device_id, int dim_id) -> void { - auto& input_key = l_first_keys[shard_id][device_id][dim_id]; - auto& input_value = l_first_value[shard_id][device_id][dim_id]; - auto& output_key = l_second_key[device_id][dim_id]; - auto& output_value = l_second_value[device_id][dim_id]; - int start_index = prefix_sum[device_id][dim_id][shard_id]; - for (size_t i = 0; i < input_key.size(); i++) { - output_key[i + start_index] = input_key[i]; - output_value[i + start_index] = input_value[i]; + int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + int cur = prefix_sum[dev][shard_id]; +#ifdef PADDLE_WITH_PSLIB + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + // dec ref + if (ptr_val[6] <= 0) { + ptr_val[6] *= -1; + } else if (ptr_val[0] <= 0) { + ptr_val[0] *= -1; + } + + val.delta_score = ptr_val[1]; + val.show = ptr_val[2]; + val.clk = ptr_val[3]; + val.slot = ptr_val[6]; + val.lr = ptr_val[4]; + val.lr_g2sum = ptr_val[5]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + float* ptr_val = task_ptrs[dev][j]->data(); + FeatureValue& val = device_vals[dev][cur + j]; + size_t dim = task_ptrs[dev][j]->size(); + val.delta_score = ptr_val[2]; + val.show = ptr_val[3]; + val.clk = ptr_val[4]; + val.slot = ptr_val[0]; + val.lr = ptr_val[5]; + val.lr_g2sum = ptr_val[6]; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (dim > 7) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val[x + 7]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } } +#endif + VLOG(3) << "GpuPs build hbmps done"; + }; - for (int i = 0; i < thread_keys_shard_num_; i++) { - for (int j = 0; j < device_num; j++) { - for (int k = 0; k < multi_mf_dim_; k++) { - task_futures.emplace_back(hbm_thread_pool_[i]->enqueue(func_third, i, j, k)); + + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); + if (multi_mf_dim_) { + threads.resize(thread_keys_shard_num_ * multi_mf_dim_); + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i * multi_mf_dim_ + j] = + std::thread(build_pull_dynamic_mf_func, i, j); } } + for (std::thread& t : threads) { + t.join(); + } + } else { + for (int i = 0; i < thread_keys_shard_num_; i++) { + for (int j = 0; j < device_num; j++) { + task_futures.emplace_back( + hbm_thread_pool_[i]->enqueue(prepare_dev_value_func, j, i)); + } + } + for 
(auto& f : task_futures) { + f.wait(); + } + task_futures.clear(); } - for (auto& f : task_futures) { - f.wait(); - } - task_futures.clear(); + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timeline.Pause(); - auto step_3 = timeline.ElapsedSec(); + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + << " seconds."; +} + +void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { + int device_num = heter_devices_.size(); + platform::Timer timeline; timeline.Start(); - //step4 初始化gpu-table相关数据了 + std::vector feature_keys_count(device_num); size_t size_max = 0; - for (int i = 0; i < device_num; i++) { - size_t tmp_size = 0; - for (int j = 0; j < multi_mf_dim_; j++) { - tmp_size += l_second_key[i][j].size(); + if (!multi_mf_dim_) { + for (int i = 0; i < device_num; i++) { + feature_keys_count[i] = gpu_task->device_keys_[i].size(); + VLOG(0) << i << " card contains feasign nums: " << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); + } + } else { + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + feature_keys_count[i] += gpu_task->device_dim_ptr_[i][j].size(); + VLOG(1) << i << " card with dynamic mf dim: " << index_dim_vec_[j] + << " dim index: " << j << " contains feasign nums: " + << gpu_task->device_dim_ptr_[i][j].size(); + } + VLOG(1) << i << " card with dynamic mf contains feasign nums total: " + << feature_keys_count[i]; + size_max = std::max(size_max, feature_keys_count[i]); } - size_max = std::max(size_max, tmp_size); - } - if (size_max <= 0) { - VLOG(0) << "Skip build gpu ps cause feasign nums = " << size_max; - return; } if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; } - HeterPs_ = HeterPsBase::get_instance(size_max, resource_, ps_accessor_type_, gpu_value_type_); - CHECK(HeterPs_ != nullptr); + if (size_max <= 0) { + VLOG(0) << "Skip build gpu ps cause feasign nums = " << size_max; + return; + } + std::vector threads(device_num); + HeterPs_ = HeterPsBase::get_instance(size_max, resource_); HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); - HeterPs_->set_multi_mf_dim(max_mf_dim_); - - //step5, cpu数据转化到gpu数据,并构造gpu-table了 - auto transfor_value_obj = g_transfor; - auto build_table_func = [this, &l_second_key, &l_second_value, &transfor_value_obj] - (int device_id, int dim_id) -> void { - int cur_dim_size = this->index_dim_vec_[dim_id]; - size_t gpu_value_size = transfor_value_obj->get_gpu_value_size(cur_dim_size); - auto& cpu_keys = l_second_key[device_id][dim_id]; - auto& cpu_values = l_second_value[device_id][dim_id]; - size_t keys_len = cpu_keys.size(); - this->mem_pools_[device_id * this->multi_mf_dim_ + dim_id] = new MemoryPool(keys_len, gpu_value_size); - auto& mem_pool = this->mem_pools_[device_id * this->multi_mf_dim_ + dim_id]; - for (size_t k = 0; k < keys_len; k++) { - void* to_value_ptr = mem_pool->mem_address(k); - transfor_value_obj->value_cpu_to_gpu(cpu_values[k], to_value_ptr, cur_dim_size); - } - auto device_index = resource_->dev_id(device_id); - platform::CUDADeviceGuard guard(device_index); - - this->hbm_pools_[device_id * this->multi_mf_dim_ + dim_id] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[device_id * this->multi_mf_dim_ + dim_id]; - this->HeterPs_->build_ps(device_id, cpu_keys.data(), cur_pool->mem(), keys_len, - gpu_value_size, 500000, 2); + auto build_func = [this, &gpu_task, &feature_keys_count](int i) { + VLOG(3) << "building table: " << i; + this->HeterPs_->build_ps(i, 
gpu_task->device_keys_[i].data(), + gpu_task->device_values_[i].data(), + feature_keys_count[i], 500000, 2); + if (feature_keys_count[i] > 0) { + HeterPs_->show_one_table(i); + } + }; + + // multi-thread process + + auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = new MemoryPool(len, feature_value_size); + }; + + auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { + + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + size_t len = device_dim_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + + this->HeterPs_->build_ps(i, device_dim_keys.data(), + cur_pool->mem(), len, feature_value_size, + 500000, 2); + if (device_dim_keys.size() > 0) { + VLOG(3) << "show table: " << i << " table kv size: " << device_dim_keys.size() << "dim: " << mf_dim << " len: " << len; + HeterPs_->show_one_table(i); + } delete mem_pool; }; - - std::vector threads; - threads.resize(device_num * multi_mf_dim_); - for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_table_func, i, j); + + int thread_num = 16; + auto build_dynamic_mf_func = [this, &gpu_task, thread_num](int i, int j, int z) { + int mf_dim = this->index_dim_vec_[j]; + VLOG(3) << "building table: " << i << "with mf dim: " << mf_dim; + + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + + // ============ add for multi-thread ================ + int len_per_thread = len / thread_num; + int remain = len % thread_num; + int left = -1, right = -1; + + int real_len = len_per_thread; + if (z < remain) real_len++; + + if (z < remain) { + left = z * (len_per_thread + 1); + right = left + real_len; + } else { + left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; + right = left + real_len; } - } - - for (std::thread& t : threads) { - t.join(); + // ============ add for multi-thread ================ + + for (int k = left; k < right; k++) { + + FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); + // dec ref + if (ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] <= 0) { + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] *= -1; + } else if (ptr_val + [paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::unseen_days_index()] <= + 0) { + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + 
DownpourCtrDymfFeatureValue::unseen_days_index()] *= -1; + } + val->delta_score = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS alreay expand as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; + } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; + } + } + } + }; + if (!multi_mf_dim_) { + for (size_t i = 0; i < threads.size(); i++) { + threads[i] = std::thread(build_func, i); + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + } else { + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dymf_mem_pool, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + // multi-thread process + threads.resize(device_num * multi_mf_dim_ * thread_num); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + for (int k = 0; k < thread_num; k++) { + threads[(i + j * device_num) * thread_num + k] = std::thread(build_dynamic_mf_func, i, j, k); + } + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dymf_hbm_pool, i, j); + } + } + for (std::thread& t : threads) { + t.join(); + } + threads.clear(); } timeline.Pause(); - auto step_4 = timeline.ElapsedSec(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build_gpu_table_detail " - << " device_picec_a:" << step_1 - << "s device_picec_b:" << step_2 - << "s device_picec_c:" << step_3 - << "s trans_to_gpu:" << step_4 << "s"; - -#undef FOUR_VECTOR_INIT -#undef THIRD_VECTOR_INIT + VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() + << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { platform::Timer timer; + VLOG(3) << "Begin LoadIntoMemory(), dataset[" << dataset_ << "]"; timer.Start(); dataset_->LoadIntoMemory(); timer.Pause(); - auto load_s = timer.ElapsedSec(); - timer.Start(); + VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; + + // local shuffle if (is_shuffle) { dataset_->LocalShuffle(); } - timer.Pause(); - auto shuffle_s = timer.ElapsedSec(); - InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->pass_id_ = (uint16_t)(dataset_->GetPassID()); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " LoadIntoMemory cost: " << load_s << "s Shuffle cost:" << shuffle_s << "s"; + gpu_task->Reset(); - 
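build_dynamic_mf_func above (and dump_pool_to_cpu_func later in EndPass) split one (device, mf-dim) key range across a fixed pool of worker threads with the same len_per_thread / remain arithmetic: the first remain threads take one extra element, and every thread gets a contiguous half-open range [left, right). A minimal sketch of that index math, with illustrative len and thread_num values:

    // Sketch: partition len items into thread_num contiguous, disjoint ranges.
    #include <cassert>

    int main() {
      const int len = 10, thread_num = 4;
      const int len_per_thread = len / thread_num;  // 2
      const int remain = len % thread_num;          // 2
      int covered = 0;
      for (int z = 0; z < thread_num; ++z) {
        int real_len = len_per_thread + (z < remain ? 1 : 0);
        int left = (z < remain)
                       ? z * (len_per_thread + 1)
                       : remain * (len_per_thread + 1) + (z - remain) * len_per_thread;
        int right = left + real_len;
        assert(left == covered);  // ranges are contiguous and disjoint
        covered = right;
      }
      assert(covered == len);     // every element is assigned exactly once
      return 0;
    }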
dataset_mutex_.lock(); - dataset_pipe_.push(dataset_); - dataset_mutex_.unlock(); data_ready_channel_->Put(gpu_task); + + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } void PSGPUWrapper::start_build_thread() { @@ -540,11 +1026,14 @@ void PSGPUWrapper::pre_build_thread() { if (!data_ready_channel_->Get(gpu_task)) { continue; } + VLOG(3) << "thread PreBuildTask start."; platform::Timer timer; timer.Start(); + // build cpu ps data process PreBuildTask(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " PreBuildTask cost: " << timer.ElapsedSec() << "s"; + VLOG(0) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + << "s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; @@ -556,17 +1045,20 @@ void PSGPUWrapper::build_pull_thread() { if (!buildcpu_ready_channel_->Get(gpu_task)) { continue; } + VLOG(3) << "thread build pull start."; platform::Timer timer; timer.Start(); + // build cpu ps data process BuildPull(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " BuildPull cost: " << timer.ElapsedSec() << "s"; + VLOG(1) << "thread BuildPull end, cost time: " << timer.ElapsedSec() << "s"; buildpull_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; } void PSGPUWrapper::build_task() { + // build_task: build_pull + build_gputask std::shared_ptr gpu_task = nullptr; // train end, gpu free if (!gpu_free_channel_->Get(gpu_task)) { @@ -576,11 +1068,16 @@ void PSGPUWrapper::build_task() { if (!buildpull_ready_channel_->Get(gpu_task)) { return; } + + VLOG(0) << "PrepareGPUTask start."; platform::Timer timer; timer.Start(); + PrepareGPUTask(gpu_task); BuildGPUTask(gpu_task); timer.Pause(); - VLOG(0) << "pass_id:" << gpu_task->pass_id_ << " build_gpu_table cost: " << timer.ElapsedSec() << "s"; + VLOG(0) << "PrepareGPUTask + BuildGPUTask end, cost time: " + << timer.ElapsedSec() << "s"; + current_task_ = gpu_task; } @@ -591,16 +1088,17 @@ void PSGPUWrapper::BeginPass() { PADDLE_THROW( platform::errors::Fatal("[BeginPass] current task is not ended.")); } + build_task(); timer.Pause(); + if (current_task_ == nullptr) { PADDLE_THROW(platform::errors::Fatal( "[BeginPass] after build_task, current task is not null.")); } - VLOG(0) << "pass_id:" << current_task_->pass_id_ << " begin_pass cost: " << timer.ElapsedSec() << "s"; -} - + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; +} void PSGPUWrapper::EndPass() { if (!current_task_) { @@ -608,9 +1106,25 @@ void PSGPUWrapper::EndPass() { platform::errors::Fatal("[EndPass] current task has been ended.")); } auto fleet_ptr = FleetWrapper::GetInstance(); + fleet_ptr->pslib_ptr_->_worker_ptr->acquire_table_mutex(this->table_id_); platform::Timer timer; timer.Start(); - + size_t keysize_max = 0; + + // in case of feasign_num = 0, skip dump_to_cpu + if (!multi_mf_dim_) { + for (size_t i = 0; i < heter_devices_.size(); i++) { + keysize_max = + std::max(keysize_max, current_task_->device_keys_[i].size()); + } + } else { + for (size_t i = 0; i < heter_devices_.size(); i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + keysize_max = + std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); + } + } + } int thread_num = 8; auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { @@ -618,7 +1132,7 @@ void PSGPUWrapper::EndPass() { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = 
this->current_task_->device_keys_[i][j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; size_t len = device_keys.size(); // ============ add for multi-thread ================ int len_per_thread = len / thread_num; @@ -637,16 +1151,13 @@ void PSGPUWrapper::EndPass() { } // ============ add for multi-thread ================ int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = g_transfor->get_gpu_value_size(mf_dim); - auto cpu_value = memory::Alloc(phi::GPUPinnedPlace(), feature_value_size * real_len); - char* cpu_value_ptr = reinterpret_cast(cpu_value->ptr()); + VLOG(3) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; + size_t feature_value_size = + TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + char* test_build_values = (char*)malloc(feature_value_size * real_len); uint64_t offset = left * feature_value_size; - gpuStream_t streams; - cudaStreamCreate(&streams); - cudaMemcpyAsync(cpu_value_ptr, hbm_pool->mem() + offset, - feature_value_size * real_len, cudaMemcpyDeviceToHost, streams); - cudaStreamSynchronize(streams); - cudaStreamDestroy(streams); + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, + feature_value_size * real_len, cudaMemcpyDeviceToHost); CHECK(len == hbm_pool->capacity()); uint64_t unuse_key = std::numeric_limits::max(); for (int i = left; i < right; ++i) { @@ -654,103 +1165,231 @@ void PSGPUWrapper::EndPass() { continue; } size_t local_offset = (i - left) * feature_value_size; - void* gpu_val = (void*)(cpu_value_ptr + local_offset); - g_transfor->value_gpu_to_cpu(gpu_val); + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + delta_score_index()] = gpu_val->delta_score; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + show_index()] = gpu_val->show; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + click_index()] = gpu_val->clk; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_w_index()] = gpu_val->lr; + cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + embed_g2sum_index()] = gpu_val->lr_g2sum; + // useless + if (cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] < 0) { + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] = + -1 * gpu_val->slot; + } else { + cpu_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()] = gpu_val->slot; + } + if (gpu_val->mf_size > 0) { + for (int x = 0; x < gpu_val->mf_dim + 1; x++) { + if (x + 8 >= int(downpour_value->size())) { + VLOG(0) << "x: " << x << " size: "<< downpour_value_size; + } + cpu_val[x + 8] = gpu_val->mf[x]; + } + } } + free(test_build_values); }; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_ * thread_num); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - for (int k = 0; k < thread_num; k++) { - threads[(i + j * device_num) * thread_num + k] = - std::thread(dump_pool_to_cpu_func, i, j, k); + if 
(multi_mf_dim_) { + VLOG(0) << "dynamic mf dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_ * thread_num); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + for (int k = 0; k < thread_num; k++) { + threads[(i + j * device_num) * thread_num + k] = + std::thread(dump_pool_to_cpu_func, i, j, k); + } } } + for (std::thread& t : threads) { + t.join(); + } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } for (size_t i = 0; i < hbm_pools_.size(); i++) { delete hbm_pools_[i]; } + gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); + fleet_ptr->pslib_ptr_->_worker_ptr->release_table_mutex(this->table_id_); timer.Pause(); VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } -//拉取gpu-table数据,并做cvm填充(对应pull_box_sparse_op算子, 非动态维度) void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - - const std::vector slot_dim; - PullSparse(place, table_id, keys, values, slot_lengths, slot_dim, hidden_size); - return; + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + static_cast(total_length)); + pull_gpups_timer.Pause(); + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support 
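Note: PullSparse turns the per-slot lengths into an inclusive prefix sum (slot_lengths_lod) before copying it to the device, and the kernels later in this diff (CopyKeysKernel, PullCopy, PushCopy) invert that prefix sum with a binary search to recover (slot x, offset y) from a flattened key index i. A host-side sketch of both directions, using std::upper_bound in place of the hand-written search:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Inclusive prefix sum over per-slot lengths, as built in PullSparse.
std::vector<int64_t> BuildLod(std::vector<int64_t> slot_lengths) {
  for (size_t i = 1; i < slot_lengths.size(); ++i) {
    slot_lengths[i] += slot_lengths[i - 1];
  }
  return slot_lengths;  // lod[x] = number of keys in slots 0..x
}

// Map a flattened key index i back to (slot x, offset y); the device kernels
// do the same lookup with a manual binary search over len[].
std::pair<int, int64_t> Locate(const std::vector<int64_t>& lod, int64_t i) {
  int x = static_cast<int>(
      std::upper_bound(lod.begin(), lod.end(), i) - lod.begin());
  int64_t y = i - (x ? lod[x - 1] : 0);
  return {x, y};
}

int main() {
  auto lod = BuildLod({3, 0, 2});  // slots of length 3, 0, 2 -> lod {3, 3, 5}
  assert(Locate(lod, 0) == std::make_pair(0, int64_t{0}));
  assert(Locate(lod, 4) == std::make_pair(2, int64_t{1}));
  return 0;
}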
CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; } -//拉取gpu-table数据,并做cvm填充(对应pull_gpups_sparse_op算子, 非动态维度) void PSGPUWrapper::PullSparse( const paddle::platform::Place& place, const int table_id, const std::vector& keys, const std::vector& values, const std::vector& slot_lengths, const std::vector& slot_dim, // dimension for each slot const int hidden_size) { - - if (!platform::is_gpu_place(place)) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); + VLOG(3) << "Begine Gpu Ps PullSparse"; + platform::Timer all_timer; + platform::Timer pull_gpups_timer; + all_timer.Start(); + + size_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + + size_t feature_value_size = 0; + if (!multi_mf_dim_) { + feature_value_size = sizeof(FeatureValue); + } else { + feature_value_size = TYPEALIGN( + 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); } - //step1 将散乱的key重整到一块连续的空间上面来 - size_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - size_t feature_value_size = g_transfor->get_gpu_value_size(max_mf_dim_); auto buf = memory::Alloc(place, total_length * feature_value_size); - void* total_values_gpu = reinterpret_cast(buf->ptr()); - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& total_keys_tensor = keys_tensor_[devid_2_index]; - uint64_t* total_keys = reinterpret_cast(total_keys_tensor.mutable_data( - {int64_t(total_length), 1}, place)); - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PinnedVector pinned_buf_key(keys.data(), keys.size() * sizeof(uint64_t*), stream, place); - uint64_t** gpu_keys = pinned_buf_key.get_gpu_ptr(); - PinnedVector pinned_buf_length(slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), stream, place); - int64_t* gpu_len = pinned_buf_length.get_gpu_ptr(); - - this->CopyKeys(place, gpu_keys, total_keys, gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - - //step2 查表获得gpu-value数据 - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, total_length); - - //step3 做cvm转换处理 - PinnedVector pinned_buf_value(values.data(), values.size() * sizeof(float*), stream, place); - float** gpu_values = pinned_buf_value.get_gpu_ptr(); - if (slot_dim.size() != 0) { //动态mf模式 - PinnedVector pinned_dim(slot_dim.data(), slot_dim.size() * sizeof(int), stream, place); - int* gpu_dim = pinned_dim.get_gpu_ptr(); - g_transfor->value_to_cvm(gpu_values, total_values_gpu, gpu_keys, slot_lengths.size(), gpu_len, - gpu_dim, total_length, 0, feature_value_size, stream); + FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GpuPs now.")); + } else if (platform::is_gpu_place(place)) { + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + 
reinterpret_cast(total_keys_tensor.mutable_data( + {int64_t(total_length), 1}, place)); + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); + int* gpu_dim = reinterpret_cast(buf_dim->ptr()); + cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice); + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + total_length); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + if (!multi_mf_dim_) { + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); + } else { + this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length, gpu_dim); + } + pull_gpups_timer.Pause(); } else { - g_transfor->value_to_cvm(gpu_values, total_values_gpu, gpu_keys, slot_lengths.size(), gpu_len, - nullptr, total_length, hidden_size, feature_value_size, stream); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GpuPs: PullSparse Only Support CUDAPlace Now.")); } + all_timer.Pause(); + time_1 += all_timer.ElapsedSec(); + time_2 += pull_gpups_timer.ElapsedSec(); + VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which pullsparse costs: " << pull_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; } void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, @@ -759,52 +1398,53 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& grad_values, const std::vector& slot_lengths, const int hidden_size, const int batch_size) { - if (!platform::is_gpu_place(place)) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); - } - //step1 将零散的梯度重整到一块连续的空间上面来 - int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - size_t grad_value_size = g_transfor->get_gpu_push_value_size(max_mf_dim_); - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& cached_total_keys_tensor = keys_tensor_[devid_2_index]; - uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - - PinnedVector pinned_gpu_values(grad_values.data(), grad_values.size() * sizeof(float*), stream, place); - float** gpu_values = pinned_gpu_values.get_gpu_ptr(); - - PinnedVector pinned_slot_vector(slot_vector_.data(), slot_vector_.size() * sizeof(int), 
stream, place); - int* d_slot_vector = pinned_slot_vector.get_gpu_ptr(); - - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - PinnedVector pinned_buf_length(slot_lengths_lod.data(), slot_lengths_lod.size() * sizeof(int64_t), stream, place); - int64_t* gpu_len = pinned_buf_length.get_gpu_ptr(); - + VLOG(3) << "Begin GPUPS PushSparseGrad"; + platform::Timer all_timer; + platform::Timer push_gpups_timer; + all_timer.Start(); + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + size_t grad_value_size = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); auto buf = memory::Alloc(place, total_length * grad_value_size); - void* total_grad_values_gpu = reinterpret_cast(buf->ptr()); - - if (hidden_size == 0) { //动态mf - PinnedVector pinned_mf_dim(slot_mf_dim_vector_.data(), slot_mf_dim_vector_.size() * sizeof(int), stream, place); - int* d_mf_dim_vector = pinned_mf_dim.get_gpu_ptr(); - g_transfor->grad_to_push( - total_grad_values_gpu, gpu_values, slot_lengths.size(), gpu_len, d_mf_dim_vector, - total_length, 0, grad_value_size, batch_size, d_slot_vector, stream); - } else { - g_transfor->grad_to_push( - total_grad_values_gpu, gpu_values, slot_lengths.size(), gpu_len, 0, - total_length, hidden_size, grad_value_size, batch_size, d_slot_vector, stream); - } + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; + FeaturePushValue* total_grad_values_gpu = + reinterpret_cast(buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in GPUPS now.")); + } else if (platform::is_gpu_place(place)) { + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(cached_total_keys_tensor.data()); + VLOG(3) << "Begin copy grad tensor to gpups struct"; + if (!multi_mf_dim_) { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + } else { + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size); + } - //step2,梯度更新了 - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + push_gpups_timer.Start(); + HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); + push_gpups_timer.Pause(); + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); + } + all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); + VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() + << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PushSparseGrad"; } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 9ac41e970e7c4f..4a25d5e3783a0b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -25,6 +25,91 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int hidden, int slot_num, + int total_len, uint64_t** keys) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + if (*(keys[x] + y) == 0) { + *(dest[x] + y * hidden) = 0; + *(dest[x] + y * hidden + 1) = 0; + *(dest[x] + y * hidden + 2) = 0; + } else { + *(dest[x] + y * hidden) = (src + i)->show; + *(dest[x] + y * hidden + 1) = (src + i)->clk; + *(dest[x] + y * hidden + 2) = (src + i)->lr; + } + if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < hidden - 3; j++) { + *(dest[x] + y * hidden + 3 + j) = 0; + } + } else { + for (int j = 0; j < hidden - 3; j++) { + *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; + } + } + } +} + +__global__ void PullCopy(float** dest, const FeatureValue* src, + const int64_t* len, int slot_num, int total_len, + uint64_t** keys, uint64_t max_val_size, int* gpu_dim) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + int cur_dim = gpu_dim[x] - 3; + FeatureValue* feature_value_ptr = + (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + + int mf_dim = feature_value_ptr->mf_dim; + mf_dim = gpu_dim[x] - 3; + if (*(keys[x] + y) == 0) { + *(dest[x] + y * (cur_dim + 3)) = 0; + *(dest[x] + y * (cur_dim + 3) + 1) = 0; + *(dest[x] + y * (cur_dim + 3) + 2) = 0; + } else { + *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; + *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; + *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; + } + if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0 ){ + if (*(keys[x] + y) == 0) { + for (int j = 0; j < cur_dim; j++) { + *(dest[x] + y * (cur_dim + 3) + 3 + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; + } + } + + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; + } + } + } +} + __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, int total_len) { @@ -44,6 +129,139 @@ __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, } } +//__global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, +// const int64_t* len, int slot_num, +// int total_len, int* gpu_dim) { +// CUDA_KERNEL_LOOP(i, total_len) { +// int low = 0; +// int high = slot_num - 1; +// while (low < high) { +// int mid = (low + high) / 2; +// if (i < len[mid]) +// high = mid; +// else +// low = mid + 1; +// } +// int x = low; +// int y = i - (x ? 
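Note: both PullCopy kernels write each key's output row as [show, clk, lr, mf...] (mf width = hidden - 3 in the fixed path, gpu_dim[x] - 3 in the dynamic path) and zero the row for the padding key 0 or when the value has no mf yet. A host-side mirror of that per-row fill; on the device this body runs inside CUDA_KERNEL_LOOP, and the embedding is read starting at mf[1] as in the kernels above.

#include <cstdint>
#include <vector>

// One pulled value, reduced to the fields PullCopy actually reads.
struct PulledValue {
  float show, clk, lr;
  int mf_size;            // 0 until the embedding has been created
  std::vector<float> mf;  // kernels read the embedding from mf[1 + j]
};

// Fill one output row dest_row[0 .. mf_dim + 3), mirroring PullCopy.
void FillRow(float* dest_row, uint64_t key, const PulledValue& v, int mf_dim) {
  if (key == 0) {  // padding key: emit an all-zero row
    for (int j = 0; j < mf_dim + 3; ++j) dest_row[j] = 0.f;
    return;
  }
  dest_row[0] = v.show;
  dest_row[1] = v.clk;
  dest_row[2] = v.lr;
  for (int j = 0; j < mf_dim; ++j) {
    dest_row[j + 3] = (v.mf_size == 0) ? 0.f : v.mf[1 + j];
  }
}

int main() {
  PulledValue v{1.f, 0.f, 0.1f, 9, std::vector<float>(9, 0.5f)};
  std::vector<float> row(8 + 3);
  FillRow(row.data(), 42, v, 8);
  return 0;
}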
len[x - 1] : 0); +// dest_total_keys[i] = src_keys[x][y]; +// //if (src_keys[x][y] == 0 && gpu_dim[x] > 30) { +// // dest_total_keys[i] = 1; +// //}; +// } +//} + +__global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, + int hidden, int slot_num, int total_len, int bs, + int* slot_vector) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue val; + float* src_ptr = src[x]; + int hidden_off = y * hidden; + val.slot = slot_vector[x]; + val.show = *(src_ptr + hidden_off); + val.clk = *(src_ptr + hidden_off + 1); + val.lr_g = *(src_ptr + hidden_off + 2) * -1. * bs; + for (int j = 0; j < hidden - 3; j++) { + val.mf_g[j] = *(src_ptr + hidden_off + 3 + j) * -1. * bs; + } + *(dest + i) = val; + } +} + +__global__ void PushCopyWithPool(FeaturePushValue* dest, float** src, + int64_t* len, int slot_num, uint64_t total_len, + int bs, int* slot_vector, int* mf_dim_vector, + size_t grad_value_size) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + FeaturePushValue* cur = + (FeaturePushValue*)((char*)dest + i * grad_value_size); + + cur->slot = slot_vector[x]; + int mf_dim = mf_dim_vector[x]; // slot_vector holds both slot and + // slot:mf_dim information + cur->mf_dim = mf_dim; + + cur->show = *(src[x] + y * (mf_dim + 3)); + cur->clk = *(src[x] + y * (mf_dim + 3) + 1); + cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < cur->mf_dim; j++) { + cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
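Note: PushCopy / PushCopyWithPool pack each slot's gradient row back into a FeaturePushValue, multiplying the lr and mf gradients by -1 * batch_size as in the kernels above. A host-side sketch of that packing with a stand-in struct:

#include <vector>

// Stand-in for FeaturePushValue: slot id, show/clk, scaled lr gradient and
// mf gradients. The real struct also carries mf_dim in the dynamic path.
struct FakePushValue {
  int slot;
  float show, clk, lr_g;
  std::vector<float> mf_g;
};

// Pack one gradient row [show, clk, g_lr, g_mf0, ...] the way PushCopy does:
// gradients are multiplied by -1 * batch_size before being pushed.
FakePushValue PackGrad(const float* src_row, int mf_dim, int slot, int bs) {
  FakePushValue out;
  out.slot = slot;
  out.show = src_row[0];
  out.clk = src_row[1];
  out.lr_g = src_row[2] * -1.f * bs;
  out.mf_g.resize(mf_dim);
  for (int j = 0; j < mf_dim; ++j) {
    out.mf_g[j] = src_row[3 + j] * -1.f * bs;
  }
  return out;
}

int main() {
  std::vector<float> row = {1.f, 0.f, 0.25f, 0.5f, -0.5f};
  FakePushValue v = PackGrad(row.data(), /*mf_dim=*/2, /*slot=*/101, /*bs=*/32);
  return v.mf_g.size() == 2 ? 0 : 1;
}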
* bs; + } + } +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, + total_length, gpu_keys); + cudaStreamSynchronize(stream); +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length, int* gpu_dim) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + + if (!multi_mf_dim_) { + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, + total_length, gpu_keys); + } else { + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, + val_type_size_, gpu_dim); + } + + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -56,6 +274,99 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } +//void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, +// uint64_t** origin_keys, uint64_t* total_keys, +// const int64_t* gpu_len, int slot_num, +// int total_len, int* gpu_dim) { +// auto stream = dynamic_cast( +// platform::DeviceContextPool::Instance().Get(place)) +// ->stream(); +// CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( +// origin_keys, total_keys, gpu_len, slot_num, total_len, gpu_dim); +// cudaStreamSynchronize(stream); +//} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, + const int64_t total_length, + const int batch_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + + 
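Note: every host wrapper in this file launches its kernel with 1024 threads per block and (total_length + 1024 - 1) / 1024 blocks on the device context's stream, then calls cudaStreamSynchronize; the ceil-division is the only arithmetic involved, and CUDA_KERNEL_LOOP guards the tail threads.

#include <cstdint>

// Number of blocks for a 1-D launch with `threads_per_block` threads, i.e.
// the (total_length + 1024 - 1) / 1024 expression used by the wrappers above.
inline int64_t GridBlocks(int64_t total_length, int threads_per_block = 1024) {
  return (total_length + threads_per_block - 1) / threads_per_block;
}

int main() {
  // 5000 keys -> 5 blocks of 1024 threads.
  return GridBlocks(5000) == 5 ? 0 : 1;
}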
cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + + PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, hidden_size, + slot_lengths.size(), total_length, batch_size, d_slot_vector); + cudaStreamSynchronize(stream); +} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size) { + auto stream = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + + cudaMemcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, slot_mf_dim_vector_.data(), + slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); + + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), + total_length, batch_size, d_slot_vector, d_mf_dim_vector, + grad_value_size); + + cudaStreamSynchronize(stream); +} + void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ad0321b5466783..e54ff6a3bd58b8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -90,8 +90,14 @@ class PSGPUWrapper { PSGPUWrapper() { HeterPs_ = NULL; sleep_seconds_before_fail_exit_ = 300; - ps_accessor_type_ = "DownpourCtrAccessor"; - gpu_value_type_ = "FeatureValue"; + hbm_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { + hbm_thread_pool_[i].reset(new ::ThreadPool(1)); + } + pull_thread_pool_.resize(thread_keys_shard_num_); + for (size_t i = 0; i < pull_thread_pool_.size(); i++) { + pull_thread_pool_[i].reset(new ::ThreadPool(1)); + } } void PullSparse(const paddle::platform::Place& place, const int table_id, @@ -115,10 +121,33 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const 
int64_t* gpu_len, int slot_num, int total_len, int* gpu_dim); + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length); + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, const int64_t total_length, + const int batch_size); + void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); void BuildPull(std::shared_ptr gpu_task); + void PrepareGPUTask(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); @@ -151,7 +180,7 @@ class PSGPUWrapper { is_initialized_ = true; resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); - keys_tensor_.resize(resource_->total_gpu()); + keys_tensor.resize(resource_->total_gpu()); #ifdef PADDLE_WITH_GLOO auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Size() > 1) { @@ -279,9 +308,7 @@ class PSGPUWrapper { day_ = day; } - void SetDataset(Dataset* dataset) { - dataset_ = dataset; - } + void SetDataset(Dataset* dataset) { dataset_ = dataset; } // PSGPUWrapper singleton static std::shared_ptr GetInstance() { @@ -290,6 +317,10 @@ class PSGPUWrapper { } return s_instance_; } + std::vector>>& GetLocalTable( + int table_id) { + return local_tables_[table_id]; + } void SetSlotVector(const std::vector& slot_vector) { slot_vector_ = slot_vector; } @@ -303,41 +334,48 @@ class PSGPUWrapper { assert(slot_mf_dim_vector_.size() == slot_vector_.size()); } - void SetSlotDimFixed(const int dim) { - assert(slot_vector_.size() != 0); - for (size_t ii = 0; ii < slot_vector_.size(); ii++) { - slot_mf_dim_vector_.push_back(dim); - } - } - - void SetTableShardNum(const int shard_num) { - thread_keys_shard_num_ = shard_num; - } - - void SetAccessorAndValueType(std::string accessor_type, std::string value_type) { - ps_accessor_type_ = accessor_type; - gpu_value_type_ = value_type; - } - void InitSlotInfo() { if (slot_info_initialized_) { return; } SlotRecordDataset* dataset = dynamic_cast(dataset_); auto slots_vec = dataset->GetSlots(); + auto multi_slot_desc = dataset_->GetDataFeedDesc().multi_slot_desc(); + std::vector slots_vec_test; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + // VLOG(0) << "yxfslotname: " << slot.name(); + if (slot.type() == "uint64" || slot.type() == "uint32") { + slots_vec_test.push_back(slot.name()); + } + } + std::cout << "wrapper use slots: "; + for (auto s : slots_vec_test) { + std::cout << s << " | "; + } + std::cout << " end wrapper " << std::endl; + VLOG(0) << "get slot desc"; slot_offset_vector_.clear(); for (auto& slot : slot_vector_) { for (size_t i = 0; i < slots_vec.size(); ++i) { if (std::to_string(slot) == slots_vec[i]) { + // VLOG(0) << "yxf slot: " << slot; 
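Note: InitSlotInfo maps each configured integer slot id in slot_vector_ to its position in the dataset's slot-name list by comparing std::to_string(slot) against slots_vec[i]. The sketch below builds the same slot_offset_vector_ with a name-to-offset map instead of the nested scan; the data in main() is made up.

#include <string>
#include <unordered_map>
#include <vector>

// Build slot -> offset-in-dataset mapping, equivalent to the nested loop in
// InitSlotInfo that compares std::to_string(slot) against slots_vec[i].
std::vector<int> BuildSlotOffsets(const std::vector<int>& slot_vector,
                                  const std::vector<std::string>& slots_vec) {
  std::unordered_map<std::string, int> name_to_offset;
  for (size_t i = 0; i < slots_vec.size(); ++i) {
    name_to_offset[slots_vec[i]] = static_cast<int>(i);
  }
  std::vector<int> offsets;
  for (int slot : slot_vector) {
    auto it = name_to_offset.find(std::to_string(slot));
    if (it != name_to_offset.end()) offsets.push_back(it->second);
  }
  return offsets;
}

int main() {
  auto offsets = BuildSlotOffsets({6003, 6001}, {"6001", "6002", "6003"});
  // offsets == {2, 0}
  return (offsets.size() == 2 && offsets[0] == 2 && offsets[1] == 0) ? 0 : 1;
}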
slot_offset_vector_.push_back(i); break; } } } + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; + for (size_t i = 0; i < slot_mf_dim_vector_.size(); i++) { + slot_dim_map_[slot_vector_[i]] = slot_mf_dim_vector_[i]; + } std::unordered_set dims_set; - for (auto& it : slot_mf_dim_vector_) { - dims_set.insert(it); + for (auto& it : slot_dim_map_) { + dims_set.insert(it.second); } size_t num_of_dim = dims_set.size(); index_dim_vec_.resize(num_of_dim); @@ -356,17 +394,10 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - //初始化一些线程池 - CHECK(thread_keys_shard_num_ != 0); - hbm_thread_pool_.resize(thread_keys_shard_num_); - for (size_t i = 0; i < hbm_thread_pool_.size(); i++) { - hbm_thread_pool_[i].reset(new ::ThreadPool(1)); - } - pull_thread_pool_.resize(thread_keys_shard_num_); - for (size_t i = 0; i < pull_thread_pool_.size(); i++) { - pull_thread_pool_[i].reset(new ::ThreadPool(1)); - } - GlobalValueTransfor::get_instance().init(ps_accessor_type_, gpu_value_type_); + val_type_size_ = + TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + grad_type_size_ = + TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); slot_info_initialized_ = true; } @@ -386,26 +417,33 @@ class PSGPUWrapper { private: static std::shared_ptr s_instance_; - //主要用于load数据的时候用的 Dataset* dataset_; - //当load数据完成后,会将其筛入到如下队列,后续异步pull会用到这个队列的数据 - //因为load 和 异步build是两个线程,所以才需要下面的队列来解耦这个dataset对象 - std::queue dataset_pipe_; - std::mutex dataset_mutex_; #ifdef PADDLE_WITH_PSLIB paddle::ps::AfsApiWrapper afs_handler_; #endif + std::unordered_map< + uint64_t, std::vector>>> + local_tables_; HeterPsBase* HeterPs_; - std::vector keys_tensor_; // Cache for pull_sparse + std::vector keys_tensor; // Cache for pull_sparse std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; std::vector slot_offset_vector_; std::vector slot_mf_dim_vector_; + std::unordered_map slot_dim_map_; std::vector slot_index_vec_; std::vector index_dim_vec_; int multi_mf_dim_{0}; int max_mf_dim_{0}; + size_t val_type_size_{0}; + size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; @@ -413,18 +451,20 @@ class PSGPUWrapper { std::vector inter_comms_; std::vector inter_ncclids_; std::vector heter_devices_; + std::unordered_set gpu_ps_config_keys_; HeterObjectPool gpu_task_pool_; - std::vector>>> thread_keys_; + std::vector>> thread_keys_; + std::vector>>> + thread_dim_keys_; int thread_keys_thread_num_ = 37 * 4; - int thread_keys_shard_num_ = 0; + int thread_keys_shard_num_ = 64; uint64_t max_fea_num_per_pass_ = 5000000000; int year_; int month_; int day_; bool slot_info_initialized_ = false; int use_afs_api_ = 0; - std::string ps_accessor_type_; - std::string gpu_value_type_; + std::vector mem_pools_; std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 47fc901372e03f..9a373be92b636c 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -242,6 +242,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); } + auto ps_gpu_wrapper = 
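Note: after slot_dim_map_ is filled, InitSlotInfo reduces the distinct mf dims to index_dim_vec_, builds dim_index_map, and records a per-slot dim index in slot_index_vec_; index_dim_vec_.back() is then used as the maximum dim when sizing val_type_size_ / grad_type_size_. The fill and sort of index_dim_vec_ are elided by the hunk, so the sketch below reconstructs that bookkeeping under the assumption that index_dim_vec_ ends up sorted ascending.

#include <set>
#include <unordered_map>
#include <vector>

struct DimInfo {
  std::vector<int> index_dim_vec;   // distinct mf dims, ascending
  std::vector<int> slot_index_vec;  // per-slot index into index_dim_vec
  int max_mf_dim = 0;
};

DimInfo BuildDimInfo(const std::vector<int>& slot_mf_dim_vector) {
  DimInfo info;
  std::set<int> dims(slot_mf_dim_vector.begin(), slot_mf_dim_vector.end());
  info.index_dim_vec.assign(dims.begin(), dims.end());  // already sorted
  std::unordered_map<int, int> dim_index_map;
  for (size_t i = 0; i < info.index_dim_vec.size(); ++i) {
    dim_index_map[info.index_dim_vec[i]] = static_cast<int>(i);
  }
  info.slot_index_vec.reserve(slot_mf_dim_vector.size());
  for (int d : slot_mf_dim_vector) {
    info.slot_index_vec.push_back(dim_index_map[d]);
  }
  info.max_mf_dim = info.index_dim_vec.back();
  return info;
}

int main() {
  DimInfo info = BuildDimInfo({8, 64, 8, 64, 64});
  // index_dim_vec == {8, 64}, slot_index_vec == {0, 1, 0, 1, 1}, max == 64
  return info.max_mf_dim == 64 ? 0 : 1;
}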
paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index dee112179be328..5e2274cb651385 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -313,8 +313,6 @@ void BindDataset(py::module *m) { &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, - py::call_guard()) - .def("set_pass_id", &framework::Dataset::SetPassId, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 48dccf85f6bb1c..42703fc17bde52 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -41,12 +41,6 @@ void BindPSGPUWrapper(py::module* m) { py::call_guard()) .def("set_slot_dim_vector", &framework::PSGPUWrapper::SetSlotDimVector, py::call_guard()) - .def("set_slot_dim_fixed", &framework::PSGPUWrapper::SetSlotDimFixed, - py::call_guard()) - .def("set_table_shard_num", &framework::PSGPUWrapper::SetTableShardNum, - py::call_guard()) - .def("set_accssor_and_gpuvalue_type", &framework::PSGPUWrapper::SetAccessorAndValueType, - py::call_guard()) .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index c8699acf5b7b57..84064669c0dc67 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -73,24 +73,10 @@ def __init__(self): self.proto_desc.pipe_command = "cat" self.dataset = core.Dataset("MultiSlotDataset") self.thread_num = 1 - self.pass_id = 0 self.filelist = [] self.use_ps_gpu = False self.psgpu = None - def set_pass_id(self, pass_id): - """ - set_pass_id - """ - self.pass_id = pass_id - self.dataset.set_pass_id(pass_id) - - def get_pass_id(self): - """ - get_pass_id - """ - return self.pass_id - def set_pipe_command(self, pipe_command): """ Set pipe command of current dataset diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 9bcf78b6c9f023..8d803c0d5bd7d9 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -313,7 +313,7 @@ def save_inference_model(self, """ self._fleet_ptr.save_model(dirname, 0) - def print_table_stat(self, table_id, pass_id, threshold): + def print_table_stat(self, table_id): """ print stat info of table_id, format: tableid, feasign size, mf size @@ -325,7 +325,7 @@ def print_table_stat(self, table_id, pass_id, threshold): """ self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): - self._fleet_ptr.print_table_stat(table_id, pass_id, threshold) + self._fleet_ptr.print_table_stat(table_id) self._role_maker._barrier_worker() def set_file_num_one_shard(self, table_id, file_num):