diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 87b943abd0106..e9cb7d325f711 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -6,28 +6,34 @@ if(WITH_NV_JETSON)
   add_definitions(-DWITH_NV_JETSON)
   set(paddle_known_gpu_archs "53 62 72")
   set(paddle_known_gpu_archs10 "53 62 72")
+  set(paddle_known_gpu_archs11 "53 62 72 87")
+  set(paddle_known_gpu_archs12 "53 62 72 87 90")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Packge")
   add_definitions(-DNEW_RELEASE_ALL)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Packge")
   add_definitions(-DNEW_RELEASE_PYPI)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
   set(paddle_known_gpu_archs10 "")
-  set(paddle_known_gpu_archs11 "60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "61 70 75 80")
+  set(paddle_known_gpu_archs12 "61 70 75 80 90")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Packge")
   add_definitions(-DNEW_RELEASE_JIT)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 60 70 75")
-  set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 60 70 75")
+  set(paddle_known_gpu_archs11 "50 60 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
 else()
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
-  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs "70 80")
+  set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "70 80")
 endif()
 
 ######################################################################################
@@ -98,12 +104,12 @@ endfunction()
 function(select_nvcc_arch_flags out_variable)
   # List of arch names
   set(archs_names
-      "Kepler"
       "Maxwell"
       "Pascal"
       "Volta"
       "Turing"
       "Ampere"
+      "Hopper"
       "All"
       "Manual")
   set(archs_name_default "Auto")
@@ -142,9 +148,7 @@ function(select_nvcc_arch_flags out_variable)
     unset(CUDA_ARCH_PTX CACHE)
   endif()
 
-  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
-    set(cuda_arch_bin "30 35")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+  if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     if(WITH_NV_JETSON)
       set(cuda_arch_bin "53")
     else()
@@ -165,11 +169,17 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
-      set(cuda_arch_bin "80")
-    elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
-      set(cuda_arch_bin "80 86")
+    if(WITH_NV_JETSON)
+      set(cuda_arch_bin "87")
+    else()
+      if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+        set(cuda_arch_bin "80")
+      else()
+        set(cuda_arch_bin "80 86")
+      endif()
     endif()
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
+    set(cuda_arch_bin "90")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -186,6 +196,13 @@ function(select_nvcc_arch_flags out_variable)
     set(cuda_arch_bin ${CUDA_ARCH_BIN})
   endif()
 
+  # cuda11.4
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.6)
+    set(cuda_arch_bin "70 80")
+  else()
+    set(cuda_arch_bin "70 80 90")
+  endif()
+
   if(NEW_RELEASE_JIT)
     set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
     set(cuda_arch_bin "")
@@ -249,6 +266,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
+  set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 90")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
 endif()
 
 if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
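Note: the arch lists selected above (cuda_arch_bin / cuda_arch_ptx) are later expanded by select_nvcc_arch_flags into nvcc -gencode flags. As a rough, standalone illustration of that expansion (this is a sketch, not the code from cuda.cmake; the variable names are reused only for readability):

# Sketch: turn a space-separated arch list such as "70 80 90" into SASS
# -gencode flags, the way an nvcc arch helper typically would.
# Runnable standalone with: cmake -P sketch.cmake
set(cuda_arch_bin "70 80 90")
string(REGEX REPLACE "[ \t]+" ";" _arch_list "${cuda_arch_bin}")
set(_nvcc_flags "")
foreach(_arch IN LISTS _arch_list)
  # One SASS entry per listed arch; upstream also emits a PTX entry for the
  # newest arch, omitted here for brevity.
  list(APPEND _nvcc_flags "-gencode arch=compute_${_arch},code=sm_${_arch}")
endforeach()
message(STATUS "gencode flags: ${_nvcc_flags}")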
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
index cd7b254892ed1..69a1058d0db0f 100644
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -25,8 +25,18 @@ set(GLOO_LIBRARY_DIR
     "${GLOO_INSTALL_DIR}/lib"
     CACHE PATH "gloo library directory." FORCE)
 # As we add extra features for gloo, we use the non-official repo
-set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
-set(GLOO_TAG v0.0.2)
+if(WITH_GPU)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
+    set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
+    set(GLOO_TAG v0.0.2)
+  else()
+    set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
+    set(GLOO_TAG v0.0.3)
+  endif()
+else()
+  set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
+  set(GLOO_TAG v0.0.2)
+endif()
 set(GLOO_LIBRARIES
     "${GLOO_INSTALL_DIR}/lib/libgloo.a"
     CACHE FILEPATH "gloo library." FORCE)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index c7a4e1d99bff1..4eeac3515d160 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -23,7 +23,15 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+if(WITH_GPU)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
+    set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+  else()
+    set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
+  endif()
+else()
+  set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
+endif()
 
 set(WARPCTC_INCLUDE_DIR
     "${WARPCTC_INSTALL_DIR}/include"
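Both gloo.cmake and warpctc.cmake above apply the same pattern: keep the legacy repository/tag for CUDA older than 12.0 (and for CPU-only builds), and switch to a CUDA 12 compatible tag otherwise. A hedged sketch of that selection logic factored into a helper (the function name and its usage are illustrative only, not part of this patch):

# Hypothetical helper mirroring the CUDA-version gate used above.
function(select_tag_by_cuda out_var legacy_tag cuda12_tag)
  if(WITH_GPU)
    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
      set(${out_var} ${legacy_tag} PARENT_SCOPE)
    else()
      set(${out_var} ${cuda12_tag} PARENT_SCOPE)
    endif()
  else()
    set(${out_var} ${legacy_tag} PARENT_SCOPE)
  endif()
endfunction()
# Usage sketch: select_tag_by_cuda(GLOO_TAG v0.0.2 v0.0.3)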
"${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") @@ -173,9 +177,9 @@ if(WITH_XPU_BKCL) set(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}") set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") include_directories(${XPU_BKCL_INC_DIR}) - target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_BKCL_LIB} -Wl,--pop-state) + target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_BKCL_LIB} ${XPU_ML_LIB} -Wl,--pop-state) else() - target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} -Wl,--pop-state) + target_link_libraries(xpulib -Wl,--push-state,--no-as-needed ${XPU_API_LIB} ${XPU_API_PLUGIN} ${XPU_RT_LIB} ${XPU_ML_LIB} -Wl,--pop-state) endif() add_dependencies(xpulib ${XPU_PROJECT}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c7bc72a9c959b..fde58e2d56183 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -317,8 +317,7 @@ if(WITH_ONNXRUNTIME) endif() if(WITH_GPU) - if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION} - GREATER_EQUAL 11.6) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() diff --git a/cmake/version.cmake b/cmake/version.cmake index 83bd3f1b1bc4a..88e767b968bd9 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -71,3 +71,27 @@ math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000 add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION}) add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER}) message(STATUS "Paddle version is ${PADDLE_VERSION}") + +#add git version +set(COMMIT_HASH "") +set(BRANCH_NAME "") +find_package(Git QUIET) +if(GIT_FOUND) +execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%H + OUTPUT_VARIABLE COMMIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +execute_process( + COMMAND ${GIT_EXECUTABLE} symbolic-ref --short -q HEAD + OUTPUT_VARIABLE BRANCH_NAME + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +endif() +message(STATUS "Git version is ${BRANCH_NAME}:${COMMIT_HASH}") +add_definitions(-DPADDLE_BRANCH_NAME="${BRANCH_NAME}") +add_definitions(-DPADDLE_COMMIT_HASH="${COMMIT_HASH}") diff --git a/paddle/fluid/framework/boxps_trainer.cc b/paddle/fluid/framework/boxps_trainer.cc index 76a3e7c43057d..3558eebbfe0c7 100644 --- a/paddle/fluid/framework/boxps_trainer.cc +++ b/paddle/fluid/framework/boxps_trainer.cc @@ -15,10 +15,10 @@ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/box_wrapper.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/trainer_desc.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/io/fs.h" DECLARE_bool(enable_binding_train_cpu); namespace paddle { @@ -94,7 +94,17 @@ void BoxPSTrainer::InitOtherEnv(const ProgramDesc& main_program) { } VLOG(3) << "init other env done."; } - +// dump thread pool +inline std::shared_ptr& GetDumpThreadPool( + int thread_num) { + static std::shared_ptr dump_thread_pool = + nullptr; + if (dump_thread_pool != nullptr) { + return dump_thread_pool; + } + dump_thread_pool.reset(new 
paddle::framework::ThreadPool(thread_num)); + return dump_thread_pool; +} std::string BoxPSTrainer::GetDumpPath(int tid) { return string::format_string("%s/part-%05d", dump_fields_path_.c_str(), tid); } @@ -104,8 +114,8 @@ void BoxPSTrainer::DumpWork(int tid) { int fileid = 0; size_t file_size = 0; while (!is_finish) { - std::string path = string::format_string("%s/part-%05d-%05d", - dump_fields_path_.c_str(), tid, fileid++); + std::string path = string::format_string( + "%s/part-%05d-%05d", dump_fields_path_.c_str(), tid, fileid++); int err_no = 0; std::shared_ptr fp = fs_open_write(path, &err_no, dump_converter_); // split dump file size @@ -134,36 +144,53 @@ void BoxPSTrainer::InitDumpEnv() { workers_[i]->SetChannelWriter(queue_.get()); } // TODO(hutuxian): should make it as a config + dump_futures_.clear(); + auto pool = GetDumpThreadPool(dump_thread_num_); for (int i = 0; i < dump_thread_num_; i++) { - dump_thread_.push_back( - std::thread(std::bind(&TrainerBase::DumpWork, this, i))); + dump_futures_.emplace_back(pool->Run([this, i]() { this->DumpWork(i); })); } VLOG(0) << "init dump write file thread num=" << dump_thread_num_; } - -void BoxPSTrainer::CopyParameters(const Scope& root_scope, int device_id) { - Scope* thread_scope = GetWorkerScope(device_id); - for (const std::string& name : *param_need_sync_) { - const LoDTensor& root_tensor = root_scope.FindVar(name)->Get(); - - // TODO(hutxian): check a new var of the same name is created in - LoDTensor* gpu_tensor = thread_scope->Var(name)->GetMutable(); - platform::Place place = platform::CUDAPlace(device_id); - TensorCopy(*static_cast(&root_tensor), place, - static_cast(gpu_tensor)); +// final dump env +void BoxPSTrainer::FinalizeDumpEnv() { + queue_->Close(); + for (auto& th : dump_futures_) { + th.get(); } + dump_futures_.clear(); + queue_.reset(); + VLOG(0) << "finalize dump write file thread"; } -void BoxPSTrainer::DumpParameters(void) { - Scope* thread_scope = GetWorkerScope(0); - for (const auto& var : persistable_vars_) { - auto* root_tensor = root_scope_->Var(var)->GetMutable(); - // TODO(hutuxian): Add a final all-reduce? 
- const auto& thread_tensor = thread_scope->FindVar(var)->Get(); - TensorCopy(thread_tensor, root_tensor->place(), root_tensor); +inline std::vector>& +GetThreadPool(int thread_num) { + static std::vector> + thread_pools; + if (!thread_pools.empty()) { + return thread_pools; } + thread_pools.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + thread_pools[i].reset(new paddle::framework::ThreadPool(1)); + } + if (!FLAGS_enable_binding_train_cpu) { + return thread_pools; + } + std::vector& train_cores = boxps::get_train_cores(); + if (train_cores.size() < static_cast(thread_num)) { + return thread_pools; + } + std::vector ncores; + for (int i = 0; i < thread_num; ++i) { + ncores.push_back(train_cores[i]); + if (train_cores.size() / 2 == static_cast(thread_num)) { + ncores.push_back(train_cores[i + thread_num]); + } + thread_pools[i]->SetCPUAffinity(ncores, false); + ncores.clear(); + } + return thread_pools; } - void BoxPSTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { PADDLE_ENFORCE(root_scope_, "Null root_scope pointer"); @@ -183,51 +210,75 @@ void BoxPSTrainer::InitTrainerEnv(const ProgramDesc& main_program, std::set async_param_name; if (async_mode_) { - async_param_name = dense_table_->Init(*root_scope_, *param_need_sync_.get(), + async_param_name = dense_table_->Init(*root_scope_, + *param_need_sync_.get(), persistable_vars_, async_grad_name_); } + auto pool = GetThreadPool(thread_num_); + wait_futures_.clear(); + CHECK(static_cast(pool.size()) == thread_num_); for (int i = 0; i < thread_num_; ++i) { - auto this_worker = - std::dynamic_pointer_cast(workers_[i]); - this_worker->SetRootScope(root_scope_); - if (async_mode_) { - this_worker->SetDenseTable(dense_table_.get()); - this_worker->SetAsyncParamName(async_param_name); - } - this_worker->CreateDeviceResource(main_program); - // CopyParameters(*root_scope_, i); - } -} -inline std::vector>& -GetThreadPool(int thread_num) { - static std::vector> - thread_pools; - if (!thread_pools.empty()) { - return thread_pools; + wait_futures_.emplace_back( + pool[i]->Run([this, i, &async_param_name, &main_program]() { + auto this_worker = + std::dynamic_pointer_cast( + workers_[i]); + this_worker->SetRootScope(root_scope_); + if (async_mode_) { + this_worker->SetDenseTable(dense_table_.get()); + this_worker->SetAsyncParamName(async_param_name); + } + this_worker->CreateDeviceResource(main_program); + })); } - thread_pools.resize(thread_num); - for (int i = 0; i < thread_num; ++i) { - thread_pools[i].reset(new paddle::framework::ThreadPool(1)); - } - if (!FLAGS_enable_binding_train_cpu) { - return thread_pools; + RemoveOtherDeviceVars(main_program, root_scope_); + for (auto& th : wait_futures_) { + th.get(); } - std::vector& train_cores = boxps::get_train_cores(); - if (train_cores.size() < static_cast(thread_num)) { - return thread_pools; + VLOG(0) << "InitTrainerEnv done!"; +} + +void BoxPSTrainer::RemoveOtherDeviceVars(const ProgramDesc& main_program, + Scope* root_scope) { + std::vector remove_vars; + std::unordered_set unpersist_var_names; + auto& block = main_program.Block(0); + auto all_desc = block.AllOps(); + auto box_wrapper = BoxWrapper::GetInstance(); + int rank_id = box_wrapper->GetMpiRank(); + int gum_num = box_wrapper->GetGpuNum(); + // 1. 
Get other device's Param + for (auto& op_desc : all_desc) { + // broadcast op + if (op_desc->Type() != "c_broadcast") { + continue; + } + int root_id = op_desc->GetAttrIfExists("root"); + if ((root_id / gum_num) == rank_id) { + continue; + } + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + unpersist_var_names.insert(name); + } + } } - std::vector ncores; - for (int i = 0; i < thread_num; ++i) { - ncores.push_back(train_cores[i]); - if (train_cores.size() / 2 == static_cast(thread_num)) { - ncores.push_back(train_cores[i + thread_num]); + VLOG(0) << "root scope remove_params size = " << unpersist_var_names.size(); + // 2. Get moment param + for (auto& unpersist_var_name : unpersist_var_names) { + for (auto& var : block.AllVars()) { + std::string name = var->Name(); + if (var->Persistable() && name.find(unpersist_var_name) == 0) { + remove_vars.push_back(name); + } } - thread_pools[i]->SetCPUAffinity(ncores, false); - ncores.clear(); } - return thread_pools; + if (remove_vars.empty()) return; + VLOG(0) << "root scope remove_vars's size = " << remove_vars.size(); + root_scope->EraseVars(remove_vars); } + void BoxPSTrainer::Run() { VLOG(3) << "Going to run"; auto pool = GetThreadPool(thread_num_); @@ -242,11 +293,16 @@ void BoxPSTrainer::Run() { pool[i]->Run([this, i]() { workers_[i]->TrainFilesWithProfiler(); })); } } + for (auto& th : wait_futures_) { + th.get(); + } } void BoxPSTrainer::Finalize() { - for (auto& th : wait_futures_) { - th.get(); + for (int i = 0; i < thread_num_; ++i) { + auto this_worker = + std::dynamic_pointer_cast(workers_[i]); + this_worker->Finalize(); } if (async_mode_) { // must be after train thread, otherwise the ps_buffer_ will be closed first @@ -255,14 +311,12 @@ void BoxPSTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } - DumpParameters(); root_scope_->DropKids(); } Scope* BoxPSTrainer::GetWorkerScope(int thread_id) { return workers_[thread_id]->GetThreadScope(); } - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 4ad91e2e90881..4ca0ad514bdb2 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -41,13 +41,31 @@ limitations under the License. 
*/ #include #include #endif +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/program_utils.h" +DECLARE_bool(enable_dump_main_program); DECLARE_bool(enable_sync_dense_moment); DECLARE_bool(check_nan_inf); -PADDLE_DEFINE_EXPORTED_bool(padbox_enable_gc, false, "enable paddlebox gc"); +DECLARE_bool(lineid_have_extend_info); +DECLARE_bool(dump_filed_same_as_aibox); +PADDLE_DEFINE_EXPORTED_bool(padbox_enable_gc, true, "enable paddlebox gc"); +PADDLE_DEFINE_EXPORTED_bool(padbox_enable_print_op_debug, + false, + "enable print op debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool(enable_print_dump_field_debug, + false, + "enable print dump field debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool(enable_print_dump_info_debug, + false, + "enable print dump info debug ,default false"); +PADDLE_DEFINE_EXPORTED_bool( + padbox_enable_sharding_stage, + true, + "enable sharding stage step1 only param and grad split, default false"); + namespace paddle { namespace framework { - BoxPSAsynDenseTable::BoxPSAsynDenseTable(const int device_num) : device_num_(device_num) { int buffer_size = device_num * 4; // magic number @@ -358,7 +376,28 @@ void BoxPSAsynDenseTable::InitThreadGroup() { } thread_pool.reset(new paddle::framework::ThreadPool(thread_num_)); } - +//======================== BoxPSWorker ====================== +// init +void BoxPSWorker::MemoryShareTensor::init(const std::string& name, + const platform::Place& place, + const int64_t& total_len, + Scope* root_scope) { + char szname[512] = {0}; + snprintf(szname, + sizeof(szname), + "paddlebox_boxps_worker_share_%s_%d", + name.c_str(), + place.GetDeviceId()); + data_tensor_ = root_scope->Var(szname)->GetMutable(); + data_tensor_->mutable_data({total_len, 1}, place); +} +// share +phi::DenseTensor& BoxPSWorker::MemoryShareTensor::share( + phi::DenseTensor* gpu_tensor, const size_t& len) { + gpu_tensor->ShareDataWith(data_tensor_->Slice(offset_, offset_ + len)); + offset_ += len; + return *gpu_tensor; +} static const int DenseKStepNode = 1; static const int DenseKStepALL = 2; static const int DenseDataNormal = 3; @@ -371,11 +410,32 @@ void BoxPSWorker::Initialize(const TrainerDesc& desc) { } VLOG(1) << "boxps_worker init device num: " << device_num_; } - +void BoxPSWorker::Finalize() { + if (sharding_mode_ || device_id_ == 0) { + for (auto& name : need_copy_vars_) { + Variable* root_var = root_scope_->FindVar(name); + if (root_var == nullptr) { + continue; + } + auto root_tensor = root_var->GetMutable(); + Variable* var = thread_scope_->FindVar(name); + auto tensor = var->Get(); + TensorCopy(tensor, root_tensor->place(), root_tensor); + } + dev_ctx_->Wait(); + } +} void BoxPSWorker::SetDenseTable(BoxPSAsynDenseTable* dense) { dense_table_ = dense; } - +inline bool IsDataNormParam(const std::string& name) { + if (name.find(".batch_size") != std::string::npos || + name.find(".batch_sum") != std::string::npos || + name.find(".batch_square_sum") != std::string::npos) { + return true; + } + return false; +} int BoxPSWorker::CheckNeedParam(VarDesc* var) { if (!var->Persistable()) { return 0; @@ -413,8 +473,9 @@ int BoxPSWorker::CheckNeedParam(VarDesc* var) { return 0; } -int64_t BoxPSWorker::AllocParamTensor(int64_t* pad_len) { - auto& block = program_->Block(0); +int64_t BoxPSWorker::AllocParamTensor(const ProgramDesc& program, + int64_t* pad_len) { + auto& block = program.Block(0); // init var and copy persistable int64_t total_param_len = 0; int64_t total_moment_len = 0; @@ -449,12 +510,12 @@ int64_t 
BoxPSWorker::AllocParamTensor(int64_t* pad_len) { << ", sync length:" << all_sync_param_len << ", sync mode:" << sync_mode_ << ", node size:" << node_size_ << ", device num:" << device_num_ << ", one ring:" << one_ring_; - param_sync_.mutable_data({all_sync_param_len, 1}, place_); + param_sync_.init("total_param_sync", place_, all_sync_param_len, root_scope_); return total_param_len; } -int64_t BoxPSWorker::AllocParamTensorAsync() { - auto& block = program_->Block(0); +int64_t BoxPSWorker::AllocParamTensorAsync(const ProgramDesc& program) { + auto& block = program.Block(0); // init var and copy persistable int64_t total_param_len = 0; for (auto& var : block.AllVars()) { @@ -475,17 +536,263 @@ int64_t BoxPSWorker::AllocParamTensorAsync() { CHECK(total_param_len > 0) << "error param total zero"; CHECK(dense_table_->GetParamTotalLen() == total_param_len); - param_async_.mutable_data({total_param_len, 1}, place_); - grad_async_.mutable_data({total_param_len, 1}, place_); + param_async_.init("total_param_async", place_, total_param_len, root_scope_); + grad_async_.init("total_grad_async", place_, total_param_len, root_scope_); return total_param_len; } -void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { - program_.reset(new ProgramDesc(main_prog)); - auto& block = program_->Block(0); - for (auto& op_desc : block.AllOps()) { +int BoxPSWorker::IsParameter(const std::string& name, bool full_match) { + if (full_match) { + auto it = params2rootid_.find(name); + if (it == params2rootid_.end()) { + return -1; + } + if (it->second == nccl_rank_id_) { + return 1; + } + return 0; + } else { + // moment, acc + for (auto it = params2rootid_.begin(); it != params2rootid_.end(); ++it) { + if (strncmp(name.c_str(), it->first.c_str(), it->first.length()) != 0) { + continue; + } + if (it->second == nccl_rank_id_) { + return 1; + } + return 0; + } + return -1; + } +} + +static bool FindVarInMap(const VariableNameMap& op_var_map, + const std::multiset& var_set) { + for (auto& o : op_var_map) { + for (auto& name : o.second) { + if (var_set.find(name) != var_set.end()) { + return true; + } + } + } + return false; +} + +static bool IsAvgOp(OpDesc* op_desc) { + if (op_desc->Type() != "elementwise_add" && + op_desc->Type() != "elementwise_mul") { + return false; + } + for (auto& o : op_desc->Outputs()) { + for (auto& name : o.second) { + if (name.find("avg_weight") != std::string::npos || + name.find("@avg") != std::string::npos) { + return true; + } + } + } + return false; +} +void BoxPSWorker::BuildShardingDepends(const ProgramDesc& program) { + nccl_rank_id_ = place_.GetDeviceId(); +#if defined(PADDLE_WITH_CUDA) + auto box_wrapper = BoxWrapper::GetInstance(); + nccl_rank_id_ = box_wrapper->GetNCCLRankId(nccl_rank_id_); +#endif + + auto& block = program.Block(0); + auto all_desc = block.AllOps(); + + for (auto& op_desc : all_desc) { + // broadcast op + if (op_desc->Type() != "c_broadcast") { + continue; + } + int root_id = op_desc->GetAttrIfExists("root"); + int ring_id = op_desc->GetAttrIfExists("ring_id"); + if (ring_id >= 0 && ring_id != ring_id_) { + ring_id_ = ring_id; + } + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + auto var = block.FindVar(name); + if (!var->Persistable() || !var->IsParameter()) { + continue; + } + if (params2rootid_.find(name) != params2rootid_.end()) { + auto it = params2rootid_.find(name); + if (it->second != root_id) { + std::cout << "error: param name conflict" << std::endl; + } + continue; + } + params2rootid_.insert(std::make_pair(name, 
root_id)); + } + } + } + if (params2rootid_.empty()) { + return; + } + sharding_mode_ = true; + size_t copy_param_cnt = 0; + // check find + for (auto& var : block.AllVars()) { + if (!var->Persistable()) { + continue; + } + std::string name = var->Name(); + int ret = IsParameter(name, var->IsParameter()); + if (ret < 0 || ret == 1) { + if (ret == 1) { + persist_param_vars_.insert(name); + } + continue; + } + if (var->IsParameter()) { + // persist parameter,eg: data_norm, learning_rate + if (IsDataNormParam(name) || + name.find("learning_rate") != std::string::npos) { + ++copy_param_cnt; + } else { + unpersist_vars_.insert(name); + } + } else { + // adam ubmq1_h2_param.b_0_moment1_0, avg_weight @avg @w_backup + remove_vars_.insert(name); + } + } + + std::multiset all_remove_inputs; + for (auto& op_desc : all_desc) { + if (FindVarInMap(op_desc->Inputs(), remove_vars_)) { + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + all_remove_inputs.insert(name); + } + } + remove_ops_.insert(op_desc); + } else if (IsAvgOp(op_desc) && + (FindVarInMap(op_desc->Outputs(), remove_vars_) || + FindVarInMap(op_desc->Inputs(), unpersist_vars_))) { + remove_ops_.insert(op_desc); + } + } + + size_t total_scale_cnt = 0; + size_t remove_scale_cnt = 0; + // remove scale op + for (auto& op_desc : all_desc) { + if (op_desc->Type() != "scale") { + continue; + } + ++total_scale_cnt; + // check scale output + for (auto& name : op_desc->Output("Out")) { + if (all_remove_inputs.find(name) == all_remove_inputs.end()) { + continue; + } + ++remove_scale_cnt; + remove_ops_.insert(op_desc); + break; + } + } + // stage1 + if (FLAGS_padbox_enable_sharding_stage) { + std::multiset broadcast_vars; + for (auto& op_desc : all_desc) { + if (op_desc->Type() != "c_broadcast") { + continue; + } + bool find = false; + for (auto& o : op_desc->Inputs()) { + for (auto& name : o.second) { + auto it = broadcast_vars.find(name); + if (it != broadcast_vars.end()) { + find = true; + continue; + } + broadcast_vars.insert(name); + } + } + if (find) { + remove_ops_.insert(op_desc); + } + } + } + + // reset dump param + if (need_dump_param_ && dump_param_ != nullptr) { + for (auto& name : *dump_param_) { + auto var = block.FindVar(name); + if (var == nullptr) { + continue; + } + std::string new_name = name; + size_t pos = new_name.find("@"); + if (pos > 0) { + new_name = name.substr(0, pos); + } + if (persist_param_vars_.find(new_name) == persist_param_vars_.end()) { + continue; + } + shard_dump_params_.push_back(name); + } + dump_param_ = &shard_dump_params_; + } + // reset dump fields + if (need_dump_field_ && dump_fields_ != nullptr) { + for (auto& name : *dump_fields_) { + auto var = block.FindVar(name); + if (var == nullptr) { + continue; + } + if (remove_vars_.find(name) != remove_vars_.end()) { + continue; + } + shard_dump_fields_.push_back(name); + } + dump_fields_ = &shard_dump_fields_; + } + VLOG(3) << "device id=" << int(place_.GetDeviceId()) + << ", nccl rank=" << nccl_rank_id_ + << ", total param count=" << params2rootid_.size() + << ", remove op count=" << remove_ops_.size() + << ", total scale op=" << total_scale_cnt << ", remove " + << remove_scale_cnt << ", remove var count=" << remove_vars_.size() + << ", unpersist var count=" << unpersist_vars_.size() + << ", dump param count=" << shard_dump_params_.size() + << ", dump fields count=" << shard_dump_fields_.size(); +} +inline bool IsCommunicationOp(const std::string& op_name) { + if (op_name == "c_broadcast" || op_name == "c_reduce_sum" || + op_name == 
"c_allreduce_sum") { + return true; + } + return false; +} +inline bool IsSyncStreamOp(const std::string& op_name) { + if (op_name == "c_sync_comm_stream" || op_name == "c_sync_calc_stream") { + return true; + } + return false; +} +void BoxPSWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + + size_t op_index = 0; + auto ops_descs = block.AllOps(); + for (auto& op_desc : ops_descs) { + // skip remove ops + if (remove_ops_.find(op_desc) != remove_ops_.end()) { + continue; + } + std::string op_name = op_desc->Type(); + // single stream not need sync + if (IsSyncStreamOp(op_name)) { + continue; + } // skip feed fetch op - if (op_desc->Type() == "feed" || op_desc->Type() == "fetch") { + if (op_name == "feed" || op_name == "fetch") { for (auto& o : op_desc->Inputs()) { skip_vars_.insert(skip_vars_.end(), o.second.begin(), o.second.end()); } @@ -494,6 +801,29 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { } } ops_.push_back(OpRegistry::CreateOp(*op_desc)); + // change to device stream + if (IsCommunicationOp(op_name)) { + ops_[op_index]->SetAttr("use_calc_stream", true); + } + ++op_index; + } + // add stream sync point + bool find = false; + for (size_t op_id = 0; op_id < ops_.size(); ++op_id) { + auto& op = ops_[op_id]; + std::string op_name = op->Type(); + if (!IsCommunicationOp(op_name)) { + if (find) { + find = false; + sync_points_.insert(op.get()); + } + continue; + } + if (find) { + continue; + } + find = true; + sync_points_.insert(op.get()); } // skip dump fields if (need_dump_field_ && dump_fields_ != nullptr) { @@ -512,19 +842,23 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { skip_vars_.insert( skip_vars_.end(), monitor_vars.begin(), monitor_vars.end()); } - - int64_t pad_len = 0; - if (sync_mode_ > 0) { - AllocParamTensor(&pad_len); - } else if (dense_table_) { - AllocParamTensorAsync(); + if (FLAGS_padbox_enable_gc) { + // add op gc vars + unused_vars_ = GetUnusedVars(block, ops_, skip_vars_, &unpersist_vars_); } + VLOG(3) << "device[" << device_id_ << "] total op count=" << block.OpSize() + << ", create op count=" << ops_.size() + << ", skip vars count=" << skip_vars_.size() + << ", unused vars count=" << unused_vars_.size(); +} + +void BoxPSWorker::CreateThreadScopeForAsync(const ProgramDesc& program) { + AllocParamTensorAsync(program); thread_scope_ = &(root_scope_->NewScope()); - int64_t offset = 0; - int64_t grad_offset = 0; // make param and param@GRAD in same order + auto& block = program.Block(0); std::vector sorted_var = block.AllVars(); std::sort(sorted_var.begin(), sorted_var.end(), @@ -541,18 +875,19 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { return var1->Name() < var2->Name(); } }); + // init var and copy persistable int grad_var_num = 0; int var_num = 0; int persistable_num = 0; - int share_var_num = 0; - int64_t share_persistable_len = 0; int64_t total_persistable_len = 0; + int param_total = 0; + for (auto& var : sorted_var) { std::string name = var->Name(); + ++param_total; if (!var->Persistable()) { - if (dense_table_ && - async_param_name_.find(name) != async_param_name_.end()) { + if (async_param_name_.find(name) != async_param_name_.end()) { // parm@GRAD can not find in root_scope_ use parm length replace VLOG(3) << "device[" << device_id_ << "] grad var name " << name; const LoDTensor& root_tensor = @@ -562,10 +897,7 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { thread_scope_->Var(name)->GetMutable(); auto 
dim = root_tensor.dims(); size_t len = root_tensor.numel(); - gpu_tensor - ->ShareDataWith(grad_async_.Slice(grad_offset, grad_offset + len)) - .Resize(dim); - grad_offset += len; + grad_async_.share(gpu_tensor, len).Resize(dim); grad_var_num += 1; skip_vars_.push_back(name); } else { @@ -573,75 +905,291 @@ void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { InitializeVariable(ptr, var->GetType()); } } else { - const LoDTensor& root_tensor = - root_scope_->FindVar(name)->Get(); - size_t len = root_tensor.numel(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); + ++persistable_num; + total_persistable_len += len; + + LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); + if (async_param_name_.find(name) != async_param_name_.end()) { + VLOG(3) << "device[" << device_id_ << "] Persistable var name " << name; + auto dim = root_tensor->dims(); + param_async_.share(gpu_tensor, len).Resize(dim); + var_num += 1; + skip_vars_.push_back(name); + } + // only support copy + TensorCopy(*static_cast(root_tensor), + place_, + static_cast(gpu_tensor)); + } + } + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", param_async_ offset:" << param_async_.offset_ + << ", grad_offset: " << grad_async_.offset_ + << ", var_num: " << var_num << ", grad_var_num: " << grad_var_num; + CHECK(param_async_.offset_ <= param_async_.numel()); + CHECK(grad_async_.offset_ <= grad_async_.numel()); +} +void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) { + int64_t pad_len = 0; + if (sync_mode_ > 0) { + AllocParamTensor(program, &pad_len); + } + thread_scope_ = &(root_scope_->NewScope()); + + auto& block = program.Block(0); + std::vector all_vars = block.AllVars(); + + // init var and copy persistable + int persistable_num = 0; + int share_var_num = 0; + int64_t share_persistable_len = 0; + int64_t total_persistable_len = 0; + int param_total = 0; + int copy_persist_num = 0; + + for (auto& var : all_vars) { + std::string name = var->Name(); + ++param_total; + if (var->Persistable()) { + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); ++persistable_num; total_persistable_len += len; - // add gc skip vars - skip_vars_.push_back(name); LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); if (sync_mode_ > 0) { if (CheckNeedParam(var)) { - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(param_sync_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; + auto dim = root_tensor->dims(); + param_sync_.share(gpu_tensor, len).Resize(dim); + skip_vars_.push_back(name); } - } else if (dense_table_) { - if (async_param_name_.find(name) != async_param_name_.end()) { - VLOG(3) << "device[" << device_id_ << "] Persistable var name " - << name; - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(param_async_.Slice(offset, offset + len)) - .Resize(dim); - offset += len; - var_num += 1; + } + // data norm copy and learning rate + if (!gpu_tensor->initialized() && place_ == root_tensor->place()) { + auto 
dim = root_tensor->dims(); + gpu_tensor->ShareDataWith(*root_tensor).Resize(dim); + ++share_var_num; + share_persistable_len += len; + } else { + TensorCopy(*static_cast(root_tensor), + place_, + static_cast(gpu_tensor)); + ++copy_persist_num; + // add copy back to root scope + if (device_id_ == 0) { + need_copy_vars_.push_back(name); + skip_vars_.push_back(name); } } - if (!gpu_tensor->initialized() && place_ == root_tensor.place()) { - auto dim = root_tensor.dims(); - gpu_tensor->ShareDataWith(root_tensor).Resize(dim); + } else { + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); + } + } + if (sync_mode_ > 0) { + CHECK(param_sync_.offset_ <= (param_sync_.numel() - pad_len)); + } + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=[total:" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", share:" << share_var_num << "(" + << share_persistable_len / 262144.0 << "MB)" + << ", copy:" << copy_persist_num << "]"; +} +void BoxPSWorker::CreateThreadScopeForSharding(const ProgramDesc& program) { + int64_t pad_len = 0; + if (sync_mode_ > 0) { + AllocParamTensor(program, &pad_len); + } + thread_scope_ = &(root_scope_->NewScope()); + + auto& block = program.Block(0); + std::vector all_vars = block.AllVars(); + + // init var and copy persistable + int persistable_num = 0; + int share_var_num = 0; + int64_t share_persistable_len = 0; + int64_t total_persistable_len = 0; + int persist_reset = 0; + int param_total = 0; + int unpersist_num = 0; + int copy_persist_num = 0; + int64_t real_persist_len = 0; + int real_persist_num = 0; + int delete_vars_num = 0; + + for (auto& var : all_vars) { + std::string name = var->Name(); + all_vars_.push_back(name); + if (remove_vars_.find(name) != remove_vars_.end()) { + ++delete_vars_num; + continue; + } + thread_vars_.push_back(name); + ++param_total; + if (var->Persistable()) { + if (unpersist_vars_.find(name) != unpersist_vars_.end()) { + // unpersist vars(include other thread var and other device var) + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); + // set dims + auto dims = phi::make_ddim(var->GetShape()); + auto var_dtype = + paddle::framework::TransToPhiDataType(var->GetDataType()); + ptr->GetMutable()->Resize(dims).set_type(var_dtype); + ++unpersist_num; + ++persistable_num; + total_persistable_len += ptr->GetMutable()->numel(); + continue; + } + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + VLOG(0) << "not found var name=" << name; + continue; + } + if (root_var->IsType()) { + continue; + } + phi::DenseTensor* root_tensor = root_var->GetMutable(); + size_t len = root_tensor->numel(); + ++persistable_num; + total_persistable_len += len; + real_persist_len += len; + ++real_persist_num; + // convert one device to other device c_broadcast param + if (persist_param_vars_.find(name) != persist_param_vars_.end()) { + // same device + if (place_ == root_tensor->place()) { + ++share_var_num; + share_persistable_len += len; + continue; + } + + auto src_place = root_tensor->place(); + auto holder = root_tensor->MoveMemoryHolder(); + auto dst_ptr = root_tensor->mutable_data( + place_, root_tensor->dtype(), holder->size()); + + #if defined(PADDLE_WITH_CUDA) + auto stream = static_cast(dev_ctx_)->stream(); + memory::Copy( + place_, dst_ptr, src_place, holder->ptr(), holder->size(), stream); + CHECK(platform::is_gpu_place(root_tensor->place())); + #elif defined(PADDLE_WITH_XPU) + // XPUStream stream = 
static_cast(dev_ctx_) + // ->x_context() + // ->xpu_stream; + memory::Copy( + place_, dst_ptr, src_place, holder->ptr(), holder->size()); + CHECK(platform::is_xpu_place(root_tensor->place())); + #endif + + ++persist_reset; + continue; + } + + LoDTensor* gpu_tensor = thread_scope_->Var(name)->GetMutable(); + if (sync_mode_ > 0) { + if (CheckNeedParam(var)) { + auto dim = root_tensor->dims(); + param_sync_.share(gpu_tensor, len).Resize(dim); + skip_vars_.push_back(name); + } + } + // data norm copy and learning rate + if (!gpu_tensor->initialized() && place_ == root_tensor->place()) { + auto dim = root_tensor->dims(); + gpu_tensor->ShareDataWith(*root_tensor).Resize(dim); ++share_var_num; share_persistable_len += len; } else { - TensorCopy(*static_cast(&root_tensor), + TensorCopy(*static_cast(root_tensor), place_, static_cast(gpu_tensor)); + ++copy_persist_num; + // device 0 need sync datanorm and learning rate to root scope + if (device_id_ == 0) { + need_copy_vars_.push_back(name); + skip_vars_.push_back(name); + } } + } else { + auto* ptr = thread_scope_->Var(name); + InitializeVariable(ptr, var->GetType()); } } if (sync_mode_ > 0) { - CHECK(offset <= (param_sync_.numel() - pad_len)); - } else if (dense_table_) { - VLOG(3) << "device[" << device_id_ - << "]CreateDeviceResource param_async_ offset:" << offset - << " grad_offset: " << grad_offset << " var_num: " << var_num - << " grad_var_num: " << grad_var_num; - CHECK(offset <= param_async_.numel()); - CHECK(grad_offset <= grad_async_.numel()); - } - if (share_var_num > 0) { - VLOG(0) << "device[" << device_id_ << "] persistable total num [" - << persistable_num << "," << total_persistable_len << "," - << total_persistable_len / 262144.0 - << "MB], share persistable num [" << share_var_num << "," - << share_persistable_len << "," << share_persistable_len / 262144.0 - << "MB]"; + CHECK(param_sync_.offset_ <= (param_sync_.numel() - pad_len)); } - if (FLAGS_padbox_enable_gc) { - // add op gc vars - unused_vars_ = GetUnusedVars2(block, ops_, skip_vars_); - // for (auto &var : unused_vars_) { - // VLOG(0) << "op name=" << var.first->Type() << ", gc names: " << - // paddle::string::join_strings(var.second, ","); - // } - if (device_id_ == 0) { - VLOG(0) << "total op count=" << ops_.size() - << ", skip vars count=" << skip_vars_.size() - << ", unused vars op count=" << unused_vars_.size(); + VLOG(0) << "device[" << device_id_ << "] total param count=" << param_total + << ", persistable=[total:" << persistable_num << "(" + << total_persistable_len / 262144.0 << "MB)" + << ", real:" << real_persist_num << "(" << real_persist_len / 262144.0 + << "MB)" + << ", share:" << share_var_num << "(" + << share_persistable_len / 262144.0 << "MB)" + << ", reset:" << persist_reset << ", unpersist:" << unpersist_num + << ", copy:" << copy_persist_num << "], delete=" << delete_vars_num; +} +void BoxPSWorker::CreateDeviceResource(const ProgramDesc& main_prog) { + BuildShardingDepends(main_prog); + if (dense_table_) { + // async + CreateThreadScopeForAsync(main_prog); + } else if (sharding_mode_) { + // sharding mode + CreateThreadScopeForSharding(main_prog); + } else { + // normal + CreateThreadScopeForNorm(main_prog); + } + CreateThreadOperators(main_prog); + + // debug str + if (FLAGS_enable_dump_main_program) { + std::ostringstream str_os; + for (auto& op : ops_) { + str_os << op->DebugStringEx(thread_scope_); + // add gc + auto it = unused_vars_.find(op.get()); + if (it != unused_vars_.end()) { + str_os << ", gc names: ["; + for (auto& name : it->second) { + 
str_os << name << ","; + } + str_os << "]"; + } + str_os << "\n"; } + auto box_ptr = BoxWrapper::GetInstance(); + char filename[512] = {0}; + snprintf(filename, + sizeof(filename), + "./device_%d_ops_%d.txt", + thread_id_, + box_ptr->Phase()); + WriteToFile(filename, str_os.str()); } } void BoxPSWorker::SyncParam(void) { @@ -654,7 +1202,7 @@ void BoxPSWorker::SyncParam(void) { box_ptr->DenseNcclTimer(device_id_, false, 0x03); #if defined(PADDLE_WITH_CUDA) - auto comm = platform::NCCLCommContext::Instance().Get(0, device_id_); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id_, device_id_); auto stream = static_cast(dev_ctx_)->stream(); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU) @@ -692,7 +1240,7 @@ void BoxPSWorker::SyncParam(void) { sendbuff, sendbuff, numel, ncclFloat32, ncclSum, comm->comm(), stream)); } const float scale = 1.0 / (device_num_ * node_size_); - TensorScaleValue(place_, param_sync_, ¶m_sync_, scale); + TensorScaleValue(place_, param_sync_.tensor(), ¶m_sync_.tensor(), scale); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); #elif defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_XPU) @@ -707,7 +1255,7 @@ void BoxPSWorker::SyncParam(void) { BKCL_SUCCESS, platform::errors::PreconditionNotMet("BKCL all reduce failed")); const float scale = 1.0 / (device_num_ * node_size_); - TensorScaleValue(place_, param_sync_, ¶m_sync_, scale); + TensorScaleValue(place_, param_sync_.tensor(), ¶m_sync_.tensor(), scale); PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(stream)); #endif @@ -732,7 +1280,6 @@ inline void AddAucMonitor(const Scope* scope, const platform::Place& place) { metric_msg->add_data(scope, place); } } - void BoxPSWorker::TrainFiles() { VLOG(3) << "begin gpubox_worker TrainFiles"; platform::Timer timer; @@ -756,20 +1303,25 @@ void BoxPSWorker::TrainFiles() { VLOG(2) << "[" << device_id_ << "]begin running ops, batch size:" << batch_size << ", batch id=" << step; - if (dense_table_) { - dense_table_->PullDense(place_, ¶m_async_); + dense_table_->PullDense(place_, ¶m_async_.tensor()); } - for (auto& op : ops_) { + if (FLAGS_padbox_enable_print_op_debug) { + VLOG(0) << "thread id=" << thread_id_ << ", " + << op->DebugStringEx(thread_scope_); + } + // add stream sync + if (sync_points_.find(op.get()) != sync_points_.end()) { + dev_ctx_->Wait(); + } op->Run(*thread_scope_, place_); if (gc) { DeleteUnusedTensors(*thread_scope_, op.get(), unused_vars_, gc.get()); } } - if (dense_table_) { - dense_table_->PushDense(place_, &grad_async_); + dense_table_->PushDense(place_, &grad_async_.tensor()); } else if (sync_mode_ > 0) { if (step > param_sync_step_) { step = 0; @@ -797,6 +1349,12 @@ void BoxPSWorker::TrainFiles() { thread_scope_->DropKids(); } ++step; + // std::stringstream ss; + // ss << "Malloc Cnt: "; + // for (int i = 0; i < 8; ++i) { + // ss << "dev: " << i << " malloc times: "<< platform::get_malloc_cnt(i) << " "; + // } + // VLOG(0) << ss.str(); } // sync param step if (sync_mode_ > 0) { @@ -847,7 +1405,6 @@ void BoxPSWorker::TrainFilesWithProfiler() { outer_timer.Start(); while (true) { main_timer.Resume(); - reader_timer.Resume(); #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("PackBatchTask", dev_ctx_->Wait()); @@ -880,9 +1437,11 @@ void BoxPSWorker::TrainFilesWithProfiler() { dev_ctx_->Wait(); timeline.Pause(); op_total_time[op_id++] += timeline.ElapsedUS(); + #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) 
|| defined(PADDLE_WITH_XPU)) RUNTIME_TRACE_SCOPE_END((op->Type()+" run").c_str(),); #endif + if (gc) { DeleteUnusedTensors(*thread_scope_, op.get(), unused_vars_, gc.get()); } @@ -909,12 +1468,12 @@ void BoxPSWorker::TrainFilesWithProfiler() { if (need_dump_field_) { dump_timer.Resume(); - DumpFieldBoxPS(*thread_scope_, dump_mode_, dump_interval_); + DumpField(*thread_scope_, dump_mode_, dump_interval_); dump_timer.Pause(); } - if (need_dump_param_ && device_id_ == 0) { + if (need_dump_param_ && (sharding_mode_ || device_id_ == 0)) { dump_timer.Resume(); - DumpParamBoxPS(*thread_scope_, step_cnt); + DumpParam(*thread_scope_, step_cnt); dump_timer.Pause(); } if (gc) { @@ -954,6 +1513,314 @@ void BoxPSWorker::TrainFilesWithProfiler() { auto box_ptr = BoxWrapper::GetInstance(); box_ptr->PrintSyncTimer(device_id_, outer_timer.ElapsedSec()); } + +//================================== paddlebox dump +//============================================ +template +inline void format_string_append( + std::string* str, + const char* fmt, + ARGS&&... args) { // use VA_ARGS may be better ? + int len = snprintf(NULL, 0, fmt, args...); + PADDLE_ENFORCE(len >= 0, "format args length error"); + size_t oldlen = str->length(); + str->resize(oldlen + len + 1); + PADDLE_ENFORCE(snprintf(&(*str)[oldlen], (size_t)len + 1, fmt, args...) == + len); + str->resize(oldlen + len); +} + +static const size_t max_fmt_buff_size = 40; + +inline void GetLodBound(const LoD& lod, + const int64_t& dim, + const int& index, + std::pair* bound) { + if (lod.size() != 0) { + bound->first = lod[0][index] * dim; + bound->second = lod[0][index + 1] * dim; + } else { + bound->first = index * dim; + bound->second = (index + 1) * dim; + } +} + +template +inline void PrintLodTensorFmtType(const Tensor* tensor, + const int64_t& start, + const int64_t& end, + const char* fmt, + std::string* str) { + int64_t num = end - start; + if (num <= 0) { + return; + } + + size_t oldlen = str->length(); + // resize string + str->resize(oldlen + num * max_fmt_buff_size); + + const C* ptr = reinterpret_cast(tensor->data()); + for (int64_t i = start; i < end; ++i) { + int ret = snprintf(&(*str)[oldlen], max_fmt_buff_size, fmt, ptr[i]); + PADDLE_ENFORCE(ret > 0, "args or buff size error"); + oldlen = oldlen + ret; + } + // resize real string + str->resize(oldlen); +} +inline void PrintLodTensor(const Tensor* tensor, + const int64_t& start, + const int64_t& end, + std::string* out) { + auto dtype = framework::TransToProtoVarType(tensor->dtype()); + if (dtype == proto::VarType::FP32) { + PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); + } else if (dtype == proto::VarType::INT64) { + PrintLodTensorFmtType(tensor, start, end, ":%lu", out); + } else if (dtype == proto::VarType::FP64) { + PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); + } else if (dtype == proto::VarType::INT32) { + PrintLodTensorFmtType(tensor, start, end, ":%d", out); + } else if (dtype == proto::VarType::INT16) { + PrintLodTensorFmtType(tensor, start, end, ":%d", out); + } else { + out->append("unsupported type"); + } +} +inline void GetTensorBound(const LoDTensor& tensor, + int index, + std::pair* bound) { + auto& dims = tensor.dims(); + if (tensor.lod().size() != 0) { + auto& lod = tensor.lod()[0]; + bound->first = lod[index] * dims[1]; + bound->second = lod[index + 1] * dims[1]; + } else { + bound->first = index * dims[1]; + bound->second = (index + 1) * dims[1]; + } +} +void BoxPSWorker::DumpParam(const Scope& scope, const int batch_id) { + size_t field_num = 
dump_param_->size(); + + auto chan = writer_.channel(); + // thread process fields +#ifdef PADDLE_WITH_BOX_PS + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + parallel_run_dynamic( +#endif + field_num, + [this, &scope, batch_id, chan](const size_t& id) { + auto& name = (*dump_param_)[id]; + Variable* var = scope.FindVar(name); + if (var == nullptr || !var->IsInitialized()) { + return; + } + const LoDTensor& tensor = var->Get(); + if (!tensor.IsInitialized()) { + VLOG(0) << "Note: param[" << name + << "] is not initialized, so it was skipped."; + return; + } + framework::LoDTensor cpu_tensor; + if (!platform::is_cpu_place(tensor.place())) { + TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensor); + } else { + cpu_tensor.ShareDataWith(tensor); + } + + std::string s; + format_string_append(&s, "(%d,%s)", batch_id, name.c_str()); + int64_t len = cpu_tensor.numel(); + PrintLodTensor(&cpu_tensor, 0, len, &s); + // write to channel + chan->WriteMove(1, &s); + }); +} + +void BoxPSWorker::DumpField(const Scope& scope, + int dump_mode, + int dump_interval) { + // dump_mode: 0: no random, 1: random with insid hash, 2: random with random + // number + size_t batch_size = device_reader_->GetCurBatchSize(); + size_t field_num = dump_fields_->size(); + std::vector dims(field_num, 0); + std::vector cpu_tensors(field_num); + std::vector lods(field_num, nullptr); + +// #ifdef PADDLE_WITH_XPU_KP +std::set used_slot_set; +#if (defined PADDLE_WITH_XPU_KP) && (defined PADDLE_WITH_BOX_PS) + auto real_reader = dynamic_cast(device_reader_); + PADDLE_ENFORCE_NOT_NULL( + real_reader, platform::errors::NotFound("In XPU only support SlotPaddleBoxDataFeed")); + std::vector used_slot_names; + real_reader->GetUsedSlotIndex(nullptr, &used_slot_names); + for (auto & slot : used_slot_names) { + used_slot_set.insert(slot); + } +#endif + + // copy fields +#ifdef PADDLE_WITH_BOX_PS + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + parallel_run_dynamic( +#endif + field_num, + [this, &dims, &cpu_tensors, &lods, &scope, &used_slot_set, batch_size](const size_t& i) { + auto& field = (*dump_fields_)[i]; + Variable* var = scope.FindVar(field); + if (var == nullptr || !var->IsInitialized()) { + VLOG(3) << "Note: field[" << field + << "] cannot be find in scope, so it was skipped."; + return; + } + const LoDTensor& tensor = var->Get(); + if (!tensor.IsInitialized()) { + VLOG(3) << "Note: field[" << field + << "] is not initialized, so it was skipped."; + return; + } + if (!CheckValidOutput(&tensor, batch_size)) { + // VLOG(0) << "Note: field[" << field << "] cannot pass check, so + // it was " + // "skipped. 
Maybe the + // dimension is " "wrong "; + return; + } + dims[i] = tensor.dims()[1]; + lods[i] = (&tensor.lod()); + if (!platform::is_cpu_place(tensor.place())) { + TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensors[i]); + } else { + cpu_tensors[i].ShareDataWith(tensor); + } +#ifdef PADDLE_WITH_XPU_KP + auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); + if (used_slot_set.find(field) != used_slot_set.end() \ + && fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) { + auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype()); + if (t_dtype == proto::VarType::INT64) { + size_t numel = cpu_tensors[i].numel(); + int64_t * slot_data = cpu_tensors[i].data(); + for (size_t j = 0; j < numel; ++j) { + uint64_t fid = static_cast(slot_data[j]); + PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size()); + uint64_t sign = (*fid2sign_map_ptr)[fid]; + PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0), + platform::errors::PreconditionNotMet( + "sign can only be 0 when fid is 0, fid:%llu, sign:%llu", + (unsigned long long)(fid), (unsigned long long)sign)); + slot_data[j] = static_cast(sign); + } + } + } +#endif + }); + + // dump data + std::default_random_engine engine(0); + std::uniform_int_distribution dist(0U, INT_MAX); + // need dump check + auto need_dump_func = [this, &dist, &engine, dump_mode, dump_interval]( + const std::string& lineid) { + size_t r = 0; + if (dump_mode == 1) { + r = XXH64(lineid.data(), lineid.length(), 0); + } else if (dump_mode == 2) { + r = dist(engine); + } + if (r % dump_interval != 0) { + return false; + } + return true; + }; + + std::atomic line_cnt{0}; + std::atomic num_cnt{0}; + + auto chan = writer_.channel(); +#ifdef PADDLE_WITH_BOX_PS + box_ptr->ExecuteFunc( + platform::CPUPlace(), +#else + // dump data + parallel_run_dynamic( +#endif + batch_size, + [this, + chan, + &dims, + &cpu_tensors, + &lods, + &need_dump_func, + field_num, + &line_cnt, + &num_cnt](const size_t& i) { + const std::string& lineid = device_reader_->GetLineId(i); + if (!need_dump_func(lineid)) { + return; + } + + ++line_cnt; + + thread_local std::pair bound; + std::string s; + size_t pos = 0; + if (FLAGS_lineid_have_extend_info) { + pos = lineid.find(" "); + if (pos != std::string::npos) { + s.append(&lineid[0], pos); + } else { + s.append(lineid); + } + } else { + s.append(lineid); + } + + size_t num = 0; + for (size_t k = 0; k < field_num; ++k) { + auto& lod = lods[k]; + if (lod == nullptr) { + continue; + } + auto& field = (*dump_fields_)[k]; + s.append("\t", 1); + GetLodBound(*lod, dims[k], i, &bound); + + num += (bound.second - bound.first); + if (FLAGS_dump_filed_same_as_aibox) { + size_t ext_pos = field.find("."); + if (ext_pos != std::string::npos) { + s.append(&field[0], ext_pos); + } else { + s.append(field); + } + } else { + format_string_append( + &s, "%s:%ld", field.c_str(), bound.second - bound.first); + } + PrintLodTensor(&cpu_tensors[k], bound.first, bound.second, &s); + } + num_cnt += num; + + // append extends tag info + if (pos > 0) { + s.append("\t", 1); + s.append(&lineid[pos + 1], lineid.length() - pos - 1); + } + // write to channel + chan->WriteMove(1, &s); + }); +} } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 9d26d285ebacc..d2bbd31bbf1e0 100755 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -3122,6 +3122,7 @@ void SlotPaddleBoxDataFeed::GetUsedSlotIndex( // get feasigns that 
FeedPass doesn't need const std::unordered_set& slot_name_omited_in_feedpass_ = boxps_ptr->GetOmitedSlot(); + if (used_slot_index != nullptr) { used_slot_index->clear(); } @@ -3694,9 +3695,11 @@ void SlotPaddleBoxDataFeed::BuildSlotBatchGPU(const int ins_num) { slot_total_num * sizeof(size_t), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); + #elif defined(PADDLE_WITH_XPU_KP) platform::MemcpySyncD2H(offsets.data(), d_slot_offsets, slot_total_num * sizeof(size_t), this->place_); #endif + copy_timer_.Pause(); data_timer_.Resume(); @@ -3762,6 +3765,7 @@ void SlotPaddleBoxDataFeed::BuildSlotBatchGPU(const int ins_num) { trans_timer_.Resume(); void** dest_gpu_p = reinterpret_cast(pack_->slot_buf_ptr()); + #if defined(PADDLE_WITH_CUDA) CUDA_CHECK(cudaMemcpyAsync(dest_gpu_p, h_tensor_ptrs.data(), use_slot_size_ * sizeof(void*), diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc index d173148718ae8..28b40e00ad126 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -17,11 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/convert_utils.h" -#ifdef PADDLE_WITH_BOX_PS -#include "paddle/fluid/framework/fleet/box_wrapper.h" -#endif -DECLARE_bool(lineid_have_extend_info); -DECLARE_bool(dump_filed_same_as_aibox); namespace phi { class DenseTensor; @@ -247,20 +242,27 @@ bool CheckValidOutput(const LoDTensor* tensor, size_t batch_size) { void DeviceWorker::DumpParam(const Scope& scope, const int batch_id) { std::ostringstream os; + int device_id = int(place_.GetDeviceId()); for (auto& param : *dump_param_) { os.str(""); Variable* var = scope.FindVar(param); - if (var == nullptr) { + if (var == nullptr || !var->IsInitialized()) { + continue; + } + if (!var->IsType()) { continue; } LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr || !tensor->IsInitialized()) { + continue; + } framework::LoDTensor cpu_tensor; if (platform::is_gpu_place(tensor->place())) { TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); tensor = &cpu_tensor; } int64_t len = tensor->numel(); - os << "(" << batch_id << "," << param << ")" + os << "(" << device_id << "," << batch_id << "," << param << ")" << PrintLodTensor(tensor, 0, len); writer_ << os.str(); } @@ -429,6 +431,11 @@ void DeviceWorker::DumpField(const Scope& scope, << "] cannot be find in scope, so it was skipped."; continue; } + if (!var->IsType()) { + VLOG(3) << "Note: field[" << field + << "] is not dense tensor, so it was skipped."; + continue; + } LoDTensor* tensor = var->GetMutable(); if (!tensor->IsInitialized()) { VLOG(0) << "Note: field[" << field @@ -467,286 +474,6 @@ void DeviceWorker::DumpField(const Scope& scope, } writer_.Flush(); } -template -void format_string_append(std::string* str, - const char* fmt, - ARGS&&... args) { // use VA_ARGS may be better ? - int len = snprintf(NULL, 0, fmt, args...); - PADDLE_ENFORCE(len >= 0, "format args length error"); - size_t oldlen = str->length(); - str->resize(oldlen + len + 1); - PADDLE_ENFORCE(snprintf(&(*str)[oldlen], (size_t)len + 1, fmt, args...) 
== - len); - str->resize(oldlen + len); -} -inline void GetLodBound(const LoD& lod, - const int64_t& dim, - const int& index, - std::pair* bound) { - if (lod.size() != 0) { - bound->first = lod[0][index] * dim; - bound->second = lod[0][index + 1] * dim; - } else { - bound->first = index * dim; - bound->second = (index + 1) * dim; - } -} -template -void PrintLodTensorFmtType(const Tensor* tensor, - const int64_t& start, - const int64_t& end, - const char* fmt, - std::string* out_val) { - if (start >= end) { - return; - } - const T* ptr = tensor->data(); - for (int64_t i = start; i < end; i++) { - format_string_append(out_val, fmt, static_cast(ptr[i])); - } -} -void PrintLodTensor(const Tensor* tensor, - const int64_t& start, - const int64_t& end, - std::string* out) { - auto dtype = framework::TransToProtoVarType(tensor->dtype()); - if (dtype == proto::VarType::FP32) { - PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); - } else if (dtype == proto::VarType::INT64) { - PrintLodTensorFmtType(tensor, start, end, ":%lu", out); - } else if (dtype == proto::VarType::FP64) { - PrintLodTensorFmtType(tensor, start, end, ":%.9g", out); - } else if (dtype == proto::VarType::INT32) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else if (dtype == proto::VarType::INT16) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else if (dtype == proto::VarType::BOOL) { - PrintLodTensorFmtType(tensor, start, end, ":%d", out); - } else { - out->append("unsupported type"); - } -} -void DeviceWorker::DumpParamBoxPS(const Scope& scope, const int batch_id) { - size_t field_num = dump_param_->size(); - - auto chan = writer_.channel(); - // thread process fields -#ifdef PADDLE_WITH_BOX_PS - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - parallel_run_dynamic( -#endif - field_num, - [this, &scope, batch_id, chan](const size_t& id) { - auto& name = (*dump_param_)[id]; - Variable* var = scope.FindVar(name); - if (var == nullptr || !var->IsInitialized()) { - return; - } - const LoDTensor& tensor = var->Get(); - if (!tensor.IsInitialized()) { - VLOG(0) << "Note: param[" << name - << "] is not initialized, so it was skipped."; - return; - } - framework::LoDTensor cpu_tensor; - if (!platform::is_cpu_place(tensor.place())) { - TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensor); - } else { - cpu_tensor.ShareDataWith(tensor); - } - - std::string s; - format_string_append(&s, "(%d,%s)", batch_id, name.c_str()); - int64_t len = cpu_tensor.numel(); - PrintLodTensor(&cpu_tensor, 0, len, &s); - // write to channel - chan->WriteMove(1, &s); - }); -} -void DeviceWorker::DumpFieldBoxPS( - const Scope& scope, - int dump_mode, - int dump_interval) { // dump_mode: 0: no random, - // 1: random with insid hash, - // 2: random with random - // number - size_t batch_size = device_reader_->GetCurBatchSize(); - size_t field_num = dump_fields_->size(); - std::vector dims(field_num, 0); - std::vector cpu_tensors(field_num); - std::vector lods(field_num, nullptr); - -// #ifdef PADDLE_WITH_XPU_KP -std::set used_slot_set; -#if (defined PADDLE_WITH_XPU_KP) && (defined PADDLE_WITH_BOX_PS) - auto real_reader = dynamic_cast(device_reader_); - PADDLE_ENFORCE_NOT_NULL( - real_reader, platform::errors::NotFound("In XPU only support SlotPaddleBoxDataFeed")); - std::vector used_slot_names; - real_reader->GetUsedSlotIndex(nullptr, &used_slot_names); - for (auto & slot : used_slot_names) { - used_slot_set.insert(slot); - } -#endif - - // copy fields 
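// A minimal, self-contained sketch (not part of the patch) of the
// dump-sampling rule described by the dump_mode comment above: mode 0 keeps
// every line, mode 1 hashes the line id, mode 2 draws a random number, and a
// line is dumped only when the value is divisible by dump_interval.
// std::hash stands in for the XXH64 call in the real code, and ShouldDumpLine
// is a hypothetical helper name.
#include <climits>
#include <cstddef>
#include <functional>
#include <random>
#include <string>

bool ShouldDumpLine(const std::string& lineid, int dump_mode,
                    int dump_interval, std::default_random_engine* engine) {
  std::size_t r = 0;  // dump_mode == 0 leaves r at 0, so every line passes
  if (dump_mode == 1) {
    r = std::hash<std::string>{}(lineid);  // deterministic per line id
  } else if (dump_mode == 2) {
    std::uniform_int_distribution<std::size_t> dist(0U, INT_MAX);
    r = dist(*engine);  // independent of the line id
  }
  return r % dump_interval == 0;  // dump_interval is assumed to be > 0
}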
-#ifdef PADDLE_WITH_BOX_PS - auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - parallel_run_dynamic( -#endif - field_num, - [this, &dims, &cpu_tensors, &lods, &scope, &used_slot_set, batch_size](const size_t& i) { - auto& field = (*dump_fields_)[i]; - Variable* var = scope.FindVar(field); - if (var == nullptr || !var->IsInitialized()) { - VLOG(3) << "Note: field[" << field - << "] cannot be find in scope, so it was skipped."; - return; - } - const LoDTensor& tensor = var->Get(); - if (!tensor.IsInitialized()) { - VLOG(3) << "Note: field[" << field - << "] is not initialized, so it was skipped."; - return; - } - if (!CheckValidOutput(&tensor, batch_size)) { - // VLOG(0) << "Note: field[" << field << "] cannot pass check, so - // it was " - // "skipped. Maybe the - // dimension is " "wrong "; - return; - } - dims[i] = tensor.dims()[1]; - lods[i] = (&tensor.lod()); - if (!platform::is_cpu_place(tensor.place())) { - TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensors[i]); - } else { - cpu_tensors[i].ShareDataWith(tensor); - } -#ifdef PADDLE_WITH_XPU_KP - auto fid2sign_map_ptr = paddle::framework::BoxWrapper::GetInstance()->GetFid2SginMap(); - if (used_slot_set.find(field) != used_slot_set.end() \ - && fid2sign_map_ptr != nullptr && fid2sign_map_ptr->size() > 0) { - auto t_dtype = framework::TransToProtoVarType(cpu_tensors[i].dtype()); - if (t_dtype == proto::VarType::INT64) { - size_t numel = cpu_tensors[i].numel(); - int64_t * slot_data = cpu_tensors[i].data(); - for (size_t j = 0; j < numel; ++j) { - uint64_t fid = static_cast(slot_data[j]); - PADDLE_ENFORCE_LT(fid, fid2sign_map_ptr->size()); - uint64_t sign = (*fid2sign_map_ptr)[fid]; - PADDLE_ENFORCE(sign > 0 || (sign == 0 && fid == 0), - platform::errors::PreconditionNotMet( - "sign can only be 0 when fid is 0, fid:%llu, sign:%llu", - (unsigned long long)(fid), (unsigned long long)sign)); - slot_data[j] = static_cast(sign); - } - } - } -#endif - }); - - // dump data - std::default_random_engine engine(0); - std::uniform_int_distribution dist(0U, INT_MAX); - // need dump check - auto need_dump_func = [this, &dist, &engine, dump_mode, dump_interval]( - const std::string& lineid) { - size_t r = 0; - if (dump_mode == 1) { - r = XXH64(lineid.data(), lineid.length(), 0); - } else if (dump_mode == 2) { - r = dist(engine); - } - if (r % dump_interval != 0) { - return false; - } - return true; - }; - - std::atomic line_cnt{0}; - std::atomic num_cnt{0}; - - auto chan = writer_.channel(); -#ifdef PADDLE_WITH_BOX_PS - box_ptr->ExecuteFunc( - platform::CPUPlace(), -#else - // dump data - parallel_run_dynamic( -#endif - batch_size, - [this, - chan, - &dims, - &cpu_tensors, - &lods, - &need_dump_func, - field_num, - &line_cnt, - &num_cnt](const size_t& i) { - const std::string& lineid = device_reader_->GetLineId(i); - if (!need_dump_func(lineid)) { - return; - } - - ++line_cnt; - - thread_local std::pair bound; - std::string s; - size_t pos = 0; - if (FLAGS_lineid_have_extend_info) { - pos = lineid.find(" "); - if (pos != std::string::npos) { - s.append(&lineid[0], pos); - } else { - s.append(lineid); - } - } else { - s.append(lineid); - } - - size_t num = 0; - for (size_t k = 0; k < field_num; ++k) { - auto& lod = lods[k]; - if (lod == nullptr) { - continue; - } - auto& field = (*dump_fields_)[k]; - s.append("\t", 1); - GetLodBound(*lod, dims[k], i, &bound); - - num += (bound.second - bound.first); - if (FLAGS_dump_filed_same_as_aibox) { - size_t ext_pos = field.find("."); - if 
(ext_pos != std::string::npos) { - s.append(&field[0], ext_pos); - } else { - s.append(field); - } - } else { - format_string_append( - &s, "%s:%ld", field.c_str(), bound.second - bound.first); - } - PrintLodTensor(&cpu_tensors[k], bound.first, bound.second, &s); - } - num_cnt += num; - - // append extends tag info - if (pos > 0) { - s.append("\t", 1); - s.append(&lineid[pos + 1], lineid.length() - pos - 1); - } - // write to channel - chan->WriteMove(1, &s); - }); -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index b46e3daeaa92f..9483ee84e9293 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -225,11 +225,7 @@ class DeviceWorker { virtual void DumpParam(const Scope& scope, const int batch_id); virtual void DumpField(const Scope& scope, int dump_mode, - int dump_interval = 10000); - virtual void DumpParamBoxPS(const Scope& scope, const int batch_id); - virtual void DumpFieldBoxPS(const Scope& scope, - int dump_mode, - int dump_interval = 10000); + int dump_interval = 10000); Scope* root_scope_ = nullptr; Scope* thread_scope_; @@ -851,15 +847,33 @@ class BoxPSAsynDenseTable { }; class BoxPSWorker : public DeviceWorker { + struct MemoryShareTensor { + int64_t offset_ = 0; + phi::DenseTensor* data_tensor_ = nullptr; + // init + void init(const std::string& name, + const platform::Place& place, + const int64_t& total_len, + Scope* root_scope); + // share + phi::DenseTensor& share(phi::DenseTensor* gpu_tensor, const size_t& len); + template + T* data() { + return data_tensor_->data(); + } + phi::DenseTensor& tensor() { return *data_tensor_; } + // numel + int64_t numel(void) { return data_tensor_->numel(); } + }; + public: BoxPSWorker() {} ~BoxPSWorker() override {} void Initialize(const TrainerDesc& desc) override; - + void Finalize(); void BindingDataFeedMemory() override {} void CreateDeviceResource(const ProgramDesc& main_prog) override; - void TrainFiles() override; void TrainFilesWithProfiler() override; @@ -881,23 +895,34 @@ class BoxPSWorker : public DeviceWorker { protected: int PackBatchTask(void); int CheckNeedParam(VarDesc* var); - int64_t AllocParamTensor(int64_t* pad_len); - int64_t AllocParamTensorAsync(); + int64_t AllocParamTensor(const ProgramDesc& program, int64_t* pad_len); + int64_t AllocParamTensorAsync(const ProgramDesc& program); void SyncParam(void); + void BuildShardingDepends(const ProgramDesc& program); + void CreateThreadScopeForAsync(const ProgramDesc& program); + void CreateThreadScopeForSharding(const ProgramDesc& program); + void CreateThreadScopeForNorm(const ProgramDesc& program); + void CreateThreadOperators(const ProgramDesc& program); + int IsParameter(const std::string& name, bool full_match); + + protected: + virtual void DumpParam(const Scope& scope, const int batch_id); + virtual void DumpField(const Scope& scope, + int dump_mode, + int dump_interval = 10000); protected: int device_id_; int thread_id_; - std::shared_ptr program_; std::vector> ops_; platform::DeviceContext* dev_ctx_ = nullptr; // dense async table BoxPSAsynDenseTable* dense_table_ = nullptr; - Tensor param_async_; - Tensor grad_async_; - Tensor param_sync_; + MemoryShareTensor param_async_; + MemoryShareTensor grad_async_; + MemoryShareTensor param_sync_; std::set async_param_name_; int param_sync_step_ = 0; int sync_mode_ = 0; @@ -908,6 +933,22 @@ class BoxPSWorker : public DeviceWorker { std::vector skip_vars_; std::unordered_map> 
unused_vars_; + + int nccl_rank_id_ = 0; + int ring_id_ = 0; + std::unordered_map params2rootid_; + std::multiset remove_vars_; + std::vector all_vars_; + std::vector thread_vars_; + std::multiset unpersist_vars_; + std::multiset persist_param_vars_; + std::multiset remove_ops_; + std::vector need_copy_vars_; + std::vector shard_dump_params_; + std::vector shard_dump_fields_; + bool sharding_mode_ = false; + // op extend + std::unordered_set sync_points_; }; #endif diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index b790a66b14d9c..e2a6fdf5c9545 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -61,16 +61,25 @@ bool OpInOutInfo::IsInArgBufferNeeded(const std::string &in_arg_name) const { return no_need_buffer_ins_.empty() || other_args_set_.count(in_arg_name) != 0; } -static bool VarCanBeDeleted(const std::string &name, - const BlockDesc &block, - const std::unordered_set &skip_vars) { +static bool VarCanBeDeleted( + const std::string &name, + const BlockDesc &block, + const std::unordered_set &skip_vars, + const std::multiset *unpersist_vars = nullptr) { if (skip_vars.count(name) != 0) { return false; } auto *var_desc = block.FindVar(name); if (var_desc == nullptr || var_desc->Persistable()) { - return false; + if (unpersist_vars != nullptr) { + // unpersist vars + if (unpersist_vars->find(name) == unpersist_vars->end()) { + return false; + } + } else { + return false; + } } auto type = var_desc->Proto()->type().type(); @@ -79,15 +88,19 @@ static bool VarCanBeDeleted(const std::string &name, type == proto::VarType::SELECTED_ROWS || type == proto::VarType::LOD_TENSOR_ARRAY; } - std::unordered_map> GetUnusedVars(const BlockDesc &block, const std::vector> &ops, - const std::vector &skip_var_list) { + const std::vector &skip_var_list, + const std::multiset *unpersist_vars) { std::unordered_set skip_vars(skip_var_list.begin(), skip_var_list.end()); std::unordered_map var_op_idx_map; + std::unordered_map old_to_new; + std::unordered_map new_to_old; + + bool is_sharding_mode = (unpersist_vars != nullptr && !unpersist_vars->empty()); for (size_t i = 0; i < ops.size(); ++i) { auto *op = ops[i].get(); @@ -95,9 +108,27 @@ GetUnusedVars(const BlockDesc &block, OpInOutInfo info; for (auto &name_pair : op->Inputs()) { for (auto &name : name_pair.second) { - if (!VarCanBeDeleted(name, block, skip_vars)) { + if (!VarCanBeDeleted(name, block, skip_vars, unpersist_vars)) { continue; } + bool is_unpersist_var = false; + if (is_sharding_mode) { + if (unpersist_vars->find(name) != unpersist_vars->end()) { + is_unpersist_var = true; + // c_broadcast + if (op->Type() == "c_broadcast") { + auto it = old_to_new.find(name); + if (it == old_to_new.end()) { + old_to_new[name] = name; + new_to_old[name] = name; + } else { + std::string new_name = it->second + "_"; + old_to_new[name] = new_name; + new_to_old[new_name] = name; + } + } + } + } // var can be gc-ed if (!info.IsBuilt()) { @@ -106,7 +137,11 @@ GetUnusedVars(const BlockDesc &block, if (info.IsInArgBufferNeeded(name)) { // Update the last living op of variable to current op - var_op_idx_map[name] = i; + if (is_unpersist_var && old_to_new.count(name) > 0) { + var_op_idx_map[old_to_new[name]] = i; + } else { + var_op_idx_map[name] = i; + } } else { VLOG(10) << "Skip reference count computing of variable " << name_pair.first << "(" << name << ") in Operator " @@ -114,12 +149,15 @@ GetUnusedVars(const BlockDesc &block, } } } - for (auto 
&name_pair : op->Outputs()) { for (auto &name : name_pair.second) { - if (VarCanBeDeleted(name, block, skip_vars)) { + if (VarCanBeDeleted(name, block, skip_vars, unpersist_vars)) { // Update the last living op of variable to current op - var_op_idx_map[name] = i; + if (is_sharding_mode && old_to_new.count(name) > 0) { + var_op_idx_map[old_to_new[name]] = i; + } else { + var_op_idx_map[name] = i; + } } } } @@ -129,7 +167,11 @@ GetUnusedVars(const BlockDesc &block, for (auto &name_op_idx_pair : var_op_idx_map) { auto &name = name_op_idx_pair.first; size_t op_idx = name_op_idx_pair.second; - result[ops[op_idx].get()].emplace_back(name); + if (is_sharding_mode && new_to_old.count(name) > 0) { + result[ops[op_idx].get()].emplace_back(new_to_old[name]); + } else { + result[ops[op_idx].get()].emplace_back(name); + } } return result; } diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index f56505dced85b..b772af9110726 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -46,12 +46,11 @@ struct OpInOutInfo { std::unordered_set other_args_set_; bool is_built_{false}; }; - std::unordered_map> GetUnusedVars(const BlockDesc &block, const std::vector> &ops, - const std::vector &skip_vars); - + const std::vector &skip_vars, + const std::multiset *unpersist_vars = nullptr); // Collect unused tensors void DeleteUnusedTensors(const Scope &scope, const std::vector &delete_vars, diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 7386030ae217f..5d8f7876f28a8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -1115,8 +1115,8 @@ const std::vector BoxWrapper::GetNanInfMetricMsg( std::vector metric_return_values_(4, 0.0); auto* naninf_cal_ = iter->second->GetCalculator(); naninf_cal_->computeNanInfMsg(); - metric_return_values_[0] = naninf_cal_->nan_rate(); - metric_return_values_[1] = naninf_cal_->inf_rate(); + metric_return_values_[0] = naninf_cal_->nan_cnt(); + metric_return_values_[1] = naninf_cal_->inf_cnt(); metric_return_values_[2] = naninf_cal_->nan_inf_rate(); metric_return_values_[3] = naninf_cal_->size(); naninf_cal_->reset_nan_inf(); diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 40ebdecc78ca4..cbbdc0c4d4233 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -47,6 +47,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" + #include "paddle/fluid/framework/fleet/metrics.h" #include "paddle/fluid/framework/fleet/box_wrapper_kernel.h" @@ -60,6 +61,7 @@ limitations under the License. 
*/ #include #include #endif + #define BUF_SIZE 1024 * 1024 DECLARE_bool(padbox_auc_runner_mode); @@ -68,7 +70,9 @@ DECLARE_int32(padbox_dataset_shuffle_thread_num); namespace paddle { namespace framework { + extern int make_day_id(const int &y, const int &m, const int &d); + #ifdef PADDLE_WITH_BOX_PS #define MAX_GPU_NUM 16 @@ -352,6 +356,11 @@ class MetricMsg { platform::errors::NotFound("Error: var %s is not found in scope.", varname.c_str())); auto& gpu_tensor = var->Get(); + PADDLE_ENFORCE_EQ( + gpu_tensor.IsInitialized(), + true, + platform::errors::InvalidArgument( + "Error: monitor var `%s` uninitialized Tensor.", varname.c_str())); *data = gpu_tensor.data(); *len = gpu_tensor.numel(); } @@ -365,6 +374,11 @@ class MetricMsg { platform::errors::NotFound("Error: var %s is not found in scope.", varname.c_str())); auto& gpu_tensor = var->Get(); + PADDLE_ENFORCE_EQ( + gpu_tensor.IsInitialized(), + true, + platform::errors::InvalidArgument( + "Error: monitor var `%s` uninitialized Tensor.", varname.c_str())); auto* gpu_data = gpu_tensor.data(); auto len = gpu_tensor.numel(); data->resize(len); @@ -506,6 +520,12 @@ class BoxWrapper { std::cout<<"start profile in BoxWrapper"<& feasgin_to_box); @@ -569,6 +589,7 @@ class BoxWrapper { const int batch_size, const int skip_offset, bool expand_only); + void PushSparseGradCaseGPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, @@ -578,6 +599,7 @@ class BoxWrapper { const int batch_size, const int skip_offset, bool expand_only); + void PushSparseGradCaseXPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, @@ -935,7 +957,8 @@ class BoxWrapper { for (auto& name : var_names) { auto it = std::find(skip_gc_vars_.begin(), skip_gc_vars_.end(), name); if (it != skip_gc_vars_.end()) { - return; + // return; + continue; } skip_gc_vars_.push_back(name); } @@ -1099,11 +1122,13 @@ class BoxWrapper { std::set slot_eval_set_; std::atomic dataset_id_{0}; std::atomic round_id_{0}; + #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) scalopus::TransportLoopbackFactory::Ptr factory; std::shared_ptr manager; scalopus::CatapultRecorder::Ptr catapult_recorder; #endif + // skip gc vars std::vector skip_gc_vars_; }; diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 0bd62606c950f..738d175632df8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -272,8 +272,10 @@ void BoxWrapper::PullSparseCaseCPU(const paddle::platform::Place& place, slot_lens[i + 1] = total_length; } dev.total_key_length = total_length; + uint64_t* total_keys = dev.keys_tensor.mutable_data( static_cast(total_length * 2) * sizeof(int64_t), place); + int* key2slot = dev.keys2slot.mutable_data( static_cast(total_length * 5) * sizeof(int), place); int* total_dims = @@ -1298,8 +1300,11 @@ void CheckPushValue( void BoxWrapper::PushSparseGradCaseXPU(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, - const std::vector& slot_lengths, const int hidden_size, - const int expand_embed_dim, const int batch_size, const int skip_offset, + const std::vector& slot_lengths, + const int hidden_size, + const int expand_embed_dim, + const int batch_size, + const int skip_offset, bool expand_only) { #ifdef PADDLE_WITH_XPU_KP int device_id = place.GetDeviceId(); @@ -1516,6 +1521,7 @@ void BoxWrapper::PushSparseGradCase( const 
int batch_size, const int skip_offset, bool expand_only) { + if (platform::is_cpu_place(place)) { PushSparseGradCaseCPU(place, keys, diff --git a/paddle/fluid/framework/fleet/metrics.cc b/paddle/fluid/framework/fleet/metrics.cc index ae7111b4d28b5..036435b1eaa46 100644 --- a/paddle/fluid/framework/fleet/metrics.cc +++ b/paddle/fluid/framework/fleet/metrics.cc @@ -647,4 +647,4 @@ void BasicAucCalculator::computeNanInfMsg() { } // namespace framework } // namespace paddle -#endif +#endif \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/metrics.h b/paddle/fluid/framework/fleet/metrics.h index e7e227f222dfe..1f62429759e8d 100644 --- a/paddle/fluid/framework/fleet/metrics.h +++ b/paddle/fluid/framework/fleet/metrics.h @@ -124,6 +124,8 @@ class BasicAucCalculator { double mae() const { return _mae; } double nan_rate() const { return _nan_rate; } double inf_rate() const { return _inf_rate; } + double nan_cnt() const { return _nan_cnt; } + double inf_cnt() const { return _inf_cnt; } double nan_inf_rate() const { return _nan_inf_rate; } double actual_ctr() const { return _actual_ctr; } double predicted_ctr() const { return _predicted_ctr; } @@ -852,4 +854,4 @@ class Metric { }; } // namespace framework } // namespace paddle -#endif +#endif \ No newline at end of file diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 51aeed2e5d734..50524e11b46ff 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -34,6 +34,9 @@ enum class OpRole { kDist = 0x0008, // Tag all learning rate scheduler operators. kLRSched = 0x0010, + + // scale lr(for adam) + kScaleLr = 0x0012, kLoss = 0x0100, // The default value of op's role. This should be only used for unittests and diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 9133489cb09c2..550997f3cdea6 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -71,6 +71,9 @@ DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); +PADDLE_DEFINE_EXPORTED_bool(enable_check_input_var, + false, + "enable check input var"); namespace paddle { namespace framework { @@ -1795,7 +1798,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, os << "\n"; printf("%s", os.str().c_str()); } - PADDLE_ENFORCE(false, "ERROR: check INF and NAN: %s", + PADDLE_ENFORCE(false, + "ERROR: check INF and NAN: %s", DebugStringEx(&exec_scope).c_str()); } #else @@ -1960,7 +1964,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); - } else if (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) { + } else if (!paddle::platform::is_xpu_support_op(type_, + expected_kernel_key)) { VLOG(3) << "fluid XPU not support kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; @@ -2441,13 +2446,15 @@ void OperatorWithKernel::ParseInputDataType( } } if (t != nullptr) { - PADDLE_ENFORCE_EQ( - t->IsInitialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); + if (FLAGS_enable_check_input_var) { + PADDLE_ENFORCE_EQ( + t->IsInitialized(), + true, + platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " + "contains 
uninitialized Tensor.", + Type(), + name)); + } *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } diff --git a/paddle/fluid/framework/program_utils.cc b/paddle/fluid/framework/program_utils.cc index a32350a569db0..09468273fd780 100644 --- a/paddle/fluid/framework/program_utils.cc +++ b/paddle/fluid/framework/program_utils.cc @@ -186,6 +186,30 @@ void ProgramProcessor::AddDepToBlockOp(const BlockDesc &block) { } ProgramProcessor::ProgramProcessor() {} +// write to file +void WriteToFile(const std::string &file_path, const std::string &msg) { + FILE *fp = fopen(file_path.c_str(), "w"); + if (fp == NULL) { + LOG(WARNING) << "open write file path=" << file_path << " failed"; + return; + } + fwrite(msg.c_str(), 1, msg.length(), fp); + fclose(fp); +} +void DumpProgramDescFile(const std::string &name, const ProgramDesc &program) { + ProgramDesc *new_prog = const_cast(&program); + std::string print_str; + google::protobuf::TextFormat::Printer printer; + printer.SetUseShortRepeatedPrimitives(true); + printer.SetSingleLineMode(false); + const ::google::protobuf::Message *message = + reinterpret_cast(new_prog->Proto()); + printer.PrintToString(*message, &print_str); + + char filename[512] = {0}; + snprintf(filename, sizeof(filename), "./%s_%lu.proto", name.c_str(), time(0)); + WriteToFile(filename, print_str); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/program_utils.h b/paddle/fluid/framework/program_utils.h index 4a276e80112b7..a609181b9cf51 100644 --- a/paddle/fluid/framework/program_utils.h +++ b/paddle/fluid/framework/program_utils.h @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/program_desc.h" +#include + +#include +#include "paddle/fluid/framework/program_desc.h" namespace paddle { namespace framework { @@ -32,6 +35,21 @@ class ProgramProcessor { void AddDepToBlockOp(const BlockDesc &block); }; - +void WriteToFile(const std::string &file_path, const std::string &msg); +void DumpProgramDescFile(const std::string &name, const ProgramDesc &program); +template +void DumpV( + const V &v, + const char *path, + std::function f = + [](typename V::value_type it) -> std::string { return it; }) { + std::ostringstream str_os; + for (auto it : v) { + str_os << f(it) << std::endl; + } + std::ofstream ofs(path); + ofs << str_os.str(); + ofs.close(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index d930a64c533f7..c515646c3ef84 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -399,10 +399,9 @@ class BoxPSTrainer : public TrainerBase { void InitDumpEnv() override; virtual std::string GetDumpPath(int tid); virtual void DumpWork(int tid); - - protected: - void CopyParameters(const Scope& root_scope, int device_id); - void DumpParameters(void); + virtual void FinalizeDumpEnv(); + void RemoveOtherDeviceVars(const ProgramDesc& main_program, + Scope* root_scope); protected: int thread_num_; @@ -412,6 +411,7 @@ class BoxPSTrainer : public TrainerBase { // std::vector worker_threads_; std::vector> wait_futures_; std::vector readers_; + std::vector> dump_futures_; std::shared_ptr> param_need_sync_; std::vector persistable_vars_; diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 907fd37e44205..4a8ded3e7acf0 100644 --- 
a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -23,6 +23,9 @@ limitations under the License. */ defined(PADDLE_WITH_MLU) || defined(PADDLE_WITH_ASCEND_CL) #define USE_DEVICE DECLARE_uint64(reallocate_gpu_memory_in_mb); +#elif defined(PADDLE_WITH_XPU) +#define USE_DEVICE +DECLARE_uint64(reallocate_xpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" @@ -55,8 +58,11 @@ BuddyAllocator::BuddyAllocator( }; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - init_allocate_size_func_ = &platform::GpuInitAllocSize; + init_allocate_size_func_ = &platform::GpuInitAllocSize; re_allocate_size_func_ = &platform::GpuReallocSize; +#elif defined(PADDLE_WITH_XPU) + init_allocate_size_func_ = &platform::XpuInitAllocSize; + re_allocate_size_func_ = &platform::XpuReallocSize; #elif defined(PADDLE_WITH_ASCEND_CL) init_allocate_size_func_ = &platform::NPUInitAllocSize; re_allocate_size_func_ = &platform::NPUReallocSize; @@ -97,7 +103,8 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { VLOG(10) << "alloc: " << unaligned_size << ", padding for desc: " << sizeof(MemoryBlock::Desc) << ", extra padding: " << extra_padding_size_ - << ", alignment: " << min_chunk_size_; + << ", alignment: " << min_chunk_size_ + << ", max_chunk_size: " << max_chunk_size_; // acquire the allocator lock std::lock_guard lock(mutex_); @@ -263,6 +270,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( #elif defined(PADDLE_WITH_MLU) allocate_bytes = DeviceAllocateSize( &platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes); +#elif defined(PADDLE_WITH_XPU) + allocate_bytes = DeviceAllocateSize(&platform::XpuInitAllocSize, &platform::XpuReallocSize, request_bytes); #endif #endif @@ -357,7 +366,12 @@ size_t BuddyAllocator::DeviceAllocateSize( } else { // Compute the re-allocation size, we store the re-allocation size when // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. 
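// Simplified sketch of the chunk-sizing policy touched in this hunk, under a
// stripped-down allocator state: the first refill uses an initial size, later
// refills reuse a cached re-allocation size unless the MB flag is 0 (which
// forces a recompute), and a larger request always wins. RefillPolicy and its
// callbacks are illustrative stand-ins for the BuddyAllocator members and
// platform::XpuInitAllocSize / platform::XpuReallocSize.
#include <algorithm>
#include <cstddef>
#include <functional>

struct RefillPolicy {
  std::function<std::size_t()> init_size;     // initial pool size in bytes
  std::function<std::size_t()> realloc_size;  // fraction/flag based refill size
  std::size_t total_allocated = 0;            // bytes already taken from device
  std::size_t cached_realloc = 0;             // cached across refills

  std::size_t NextChunkBytes(std::size_t request,
                             std::size_t flag_realloc_mb) {
    std::size_t chunk = 0;
    if (total_allocated == 0) {
      chunk = std::max(init_size(), request);
    } else {
      if (cached_realloc == 0 || flag_realloc_mb == 0) {
        cached_realloc = realloc_size();  // recompute when not pinned by flag
      }
      chunk = std::max(cached_realloc, request);
    }
    total_allocated += chunk;
    return chunk;
  }
};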
- if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { +#if defined(PADDLE_WITH_XPU) + auto flag_realloc_size = FLAGS_reallocate_xpu_memory_in_mb; +#else + auto flag_realloc_size = FLAGS_reallocate_gpu_memory_in_mb; +#endif + if (realloc_size_ == 0 || flag_realloc_size == 0ul) { realloc_size_ = re_allocate_size_func(); } allocate_bytes = std::max(realloc_size_, request_bytes); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0508bf211d832..fe8110219ca24 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -29,6 +29,14 @@ #include "paddle/phi/common/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" +#elif defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/xpu/xpu_info.h" + +DECLARE_bool(use_xpu_buddy_allocator); +DECLARE_double(fraction_of_xpu_memory_to_use); +DECLARE_uint64(initial_xpu_memory_in_mb); +DECLARE_uint64(reallocate_xpu_memory_in_mb); + #endif PADDLE_DEFINE_EXPORTED_bool( @@ -143,36 +151,132 @@ size_t Used(const platform::IPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// TODO@yaocheng05: Add xpu support GetXPUBuddyAllocator + +// For XPU +#if defined(PADDLE_WITH_XPU) +class XPUBuddyAllocatorList { + private: + XPUBuddyAllocatorList() : devices_(platform::GetXPUSelectedDevices()) { + auto xpu_num = devices_.size(); + allocators_.resize(xpu_num); + init_flags_.reserve(xpu_num); + for (size_t i = 0; i < xpu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static XPUBuddyAllocatorList *CreateNewInstance() { + return new XPUBuddyAllocatorList(); + } + + public: + static XPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int xpu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), xpu_id)); + PADDLE_ENFORCE_LT(pos, + devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), + pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetXPUDeviceId(devices_[pos]); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::XPUAllocator(devices_[pos])), + platform::XpuMinChunkSize(), + platform::XpuMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_xpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_xpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_xpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetXPUBuddyAllocator(int xpu_id) { + return XPUBuddyAllocatorList::Instance()->Get(xpu_id); +} +#endif + + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { -#ifdef PADDLE_WITH_XPU - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); - void *p = nullptr; - platform::XPUDeviceGuard gurad(place.device); - int ret = xpu_malloc(reinterpret_cast(&p), size); - if (ret != XPU_SUCCESS) { - VLOG(10) << "xpu memory malloc(" << size << ") failed, try again"; - xpu_wait(); - ret = xpu_malloc(reinterpret_cast(&p), size); - } - PADDLE_ENFORCE_EQ( - ret, - XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], no enough memory", ret)); - if (FLAGS_init_allocated_mem) { - PADDLE_THROW(platform::errors::Unimplemented( - "xpu memory FLAGS_init_allocated_mem is not implemented.")); +#ifdef PADDLE_WITH_XPU + if (FLAGS_use_xpu_buddy_allocator) { + auto *buddy_allocator = GetXPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::XPUMLHandler handler; + auto re = handler.getMemoryUsageTuple(place.device); + + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(std::get<2>(re)), + string::HumanReadableSize(std::get<0>(re)), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(std::get<1>(re)))); + } + return ptr; + } else { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void *p = nullptr; + + platform::XPUDeviceGuard gurad(place.device); + // platform::inc_malloc_cnt(place.device); + int ret = xpu_malloc(reinterpret_cast(&p), size); + if (ret != XPU_SUCCESS) { + VLOG(10) << "xpu memory malloc(" << size << ") failed, try again"; + xpu_wait(); + // platform::inc_malloc_cnt(place.device); + ret = xpu_malloc(reinterpret_cast(&p), size); + } + PADDLE_ENFORCE_EQ( + ret, + XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], no enough memory", ret)); + if (FLAGS_init_allocated_mem) { + PADDLE_THROW(platform::errors::Unimplemented( + "xpu memory FLAGS_init_allocated_mem is not implemented.")); + } + VLOG(10) << " pointer=" << p; + return p; } - VLOG(10) << " pointer=" << p; - return p; #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); - return nullptr; #endif + return nullptr; } template <> @@ -182,9 +286,12 @@ void Free(const platform::XPUPlace &place, #ifdef PADDLE_WITH_XPU VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - - platform::XPUDeviceGuard gurad(place.device); - xpu_free(p); + if (FLAGS_use_xpu_buddy_allocator) { + GetXPUBuddyAllocator(place.device)->Free(p); + } else { + platform::XPUDeviceGuard gurad(place.device); + xpu_free(p); + } #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); diff --git a/paddle/fluid/memory/allocation/system_allocator.cc 
b/paddle/fluid/memory/allocation/system_allocator.cc index fcfece978cb7f..91a6e3ffd4a62 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -39,6 +39,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif +#if defined(PADDLE_WITH_XPU) +DECLARE_double(fraction_of_xpu_memory_to_use); +#endif #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" @@ -289,6 +292,82 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#if defined(PADDLE_WITH_XPU) +void* XPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + // platform::inc_malloc_cnt(xpu_id_); + auto result = platform::RecordedXpuMalloc(&p, size, xpu_id_); + + if (result == XPU_SUCCESS) { + *index = 0; + xpu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedXpuMemGetInfo( + &avail, &total, &actual_avail, &actual_total, xpu_id_); + size_t allocated = total - avail; + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, + limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU %d. " + "Cannot allocate %s memory on XPU %d, %s memory has been allocated and " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. 
If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + xpu_id_, + string::HumanReadableSize(size), + xpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + xpu_id_, + FLAGS_fraction_of_xpu_memory_to_use, + err_msg)); + return nullptr; + } +} + +void XPUAllocator::Free(void* p, size_t size, size_t index) { + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(xpu_alloc_size_, + size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, + xpu_alloc_size_)); + xpu_alloc_size_ -= size; + + platform::RecordedXpuFree(p, size, xpu_id_); +} + +bool XPUAllocator::UseGpu() const { return true; } + +#endif + + #ifdef PADDLE_WITH_ASCEND_CL void* NPUAllocator::Alloc(size_t* index, size_t size) { if (size <= 0) return nullptr; diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index 18c2e278f99c5..8d84a130a1b23 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -68,6 +68,21 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#if defined(PADDLE_WITH_XPU) +class XPUAllocator : public SystemAllocator { + public: + explicit XPUAllocator(int xpu_id) : xpu_id_(xpu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t xpu_alloc_size_ = 0; + int xpu_id_; +}; +#endif + #ifdef PADDLE_WITH_ASCEND_CL class NPUAllocator : public SystemAllocator { diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index 9ee4bad1d73b7..7cc1844393b03 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -44,6 +44,61 @@ class BatchFCOp : public framework::OperatorWithKernel { auto w_dims = ctx->GetInputDim("W"); int batchcount = ctx->Attrs().Get("batchcount"); + int transpose_weight = ctx->Attrs().Get("transpose_weight"); + + if (transpose_weight) { + // Input_dim: [batch_count, ?, in_dim] + // W_dim: [in_dim, batch_count * out_dim] + // Bias_dim: [1, batch_count * out_dim] + // Out_dim: [batch_count, ?, out_dim] + PADDLE_ENFORCE_GT( + batchcount, + 0, + platform::errors::PreconditionNotMet( + "with transpose weight, batchcount should > 0")); + PADDLE_ENFORCE_EQ( + w_dims.size(), + 2, + platform::errors::InvalidArgument( + "W of BatchFCOp should have 2D.")); + + int out_dim = w_dims[1] / batchcount; + PADDLE_ENFORCE_EQ( + input_dims.size(), + 3, + platform::errors::InvalidArgument( + "Input of BatchFCOp should have 3D.")); + PADDLE_ENFORCE_EQ( + input_dims[2], + w_dims[0], + platform::errors::InvalidArgument( + "Input.dim[2] and w_dims[0] of BatchFCOp should be same.")); + PADDLE_ENFORCE_EQ( + input_dims[0], + batchcount, + platform::errors::InvalidArgument( + "Input.dim[0] and batchcount of BatchFCOp should be same.")); + PADDLE_ENFORCE_EQ( + input_dims[2], + w_dims[0], + platform::errors::InvalidArgument( + "Input.dim[2] and W.dim[1] of BatchFCOp should be same.")); + + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ( + bias_dims.size(), + 
2, + platform::errors::InvalidArgument("Bias of BatchFCOp should have 2D.")); + PADDLE_ENFORCE_EQ( + bias_dims[1], + w_dims[1], + platform::errors::InvalidArgument( + "Bias.dim[1] should be same as input.dim[2].")); + + ctx->SetOutputDim("Out", {input_dims[0], input_dims[1], out_dim}); + ctx->ShareLoD("Input", /*->*/ "Out"); + return; + } if (batchcount > 0) { int feature_dim = input_dims[1] / batchcount; PADDLE_ENFORCE_EQ(feature_dim, w_dims[0], @@ -139,6 +194,7 @@ class BatchFCOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Bias", "(Tensor) Input tensor of batch_fc_op operator."); AddOutput("Out", "Output tensor of batch_fc_op operator."); AddAttr("batchcount", "(int64_t) the batchcount").SetDefault(0); + AddAttr("transpose_weight", "(bool) the transpose_weight").SetDefault(false); AddComment(R"DOC( BatchFC Operator. Notice: It currently supports GPU device. diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index f9fac45ef6e5e..652eddb560099 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -171,11 +171,96 @@ void transpose_split_row(cudaStream_t stream, const unsigned int rown, stream>>>(rown, coln, num_block, source, dest); } +template +__global__ void transpose_weight_kernel(const T* source, T* dest, + const unsigned int rown, const unsigned int coln, const int64_t batch_count) { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + if (x < rown && y < coln) { + int dst_coln = coln / batch_count; + int dst_x = x + y / dst_coln * rown; + int dst_y = y % dst_coln; + dest[dst_x * dst_coln + dst_y] = source[x * coln + y]; + } +} + +template +void transpose_weight_impl(cudaStream_t stream, const T* source, T* dest, + const unsigned int rown, const unsigned int coln, const int64_t batch_count) { + dim3 grid((rown + 15) / 16, (coln + 15) / 16); + dim3 block(16, 16); + transpose_weight_kernel<<>>(source, dest, rown, coln, batch_count); +} + template class BatchFCCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { int batchcount = ctx.Attr("batchcount"); + auto transpose_weight = ctx.Attr("transpose_weight"); + if (transpose_weight) { + // Input_dim: [batch_count, ?, in_dim] + // W_dim: [in_dim, batch_count * out_dim] + // Bias_dim: [1, batch_count * out_dim] + // Out_dim: [batch_count, ?, out_dim] + auto* input = ctx.Input("Input"); + auto* w = ctx.Input("W"); + auto* bias = ctx.Input("Bias"); + auto* output = ctx.Output("Out"); + auto input_dims = input->dims(); + auto w_dims = w->dims(); + auto slot_pairs_num = input_dims[0]; + auto ins_num = input_dims[1]; + auto in_dim = input_dims[2]; + auto out_dim = w_dims[1] / batchcount; + + // get data ptr + const T* in_data = input->data(); + const T* w_data = w->data(); + const T* bias_data = bias->data(); + + output->Resize({slot_pairs_num, ins_num, out_dim}); + T* out_data = output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + + Tensor w_help; + w_help = + ctx.AllocateTmpTensor({batchcount, w_dims[0], w_dims[1] / batchcount}, dev_ctx); + T* w_help_data = w_help.data(); + + transpose_weight_impl(ctx.cuda_device_context().stream(), w_data, w_help_data, w_dims[0], w_dims[1], batchcount); + + CBLAS_TRANSPOSE transA = CblasNoTrans; + CBLAS_TRANSPOSE transB = CblasNoTrans; + + T alpha = 1; + T beta = 0; + int64_t strideA = ins_num * in_dim; + int64_t strideB = in_dim * out_dim; + + auto blas = 
phi::funcs::GetBlas(dev_ctx); + blas.BatchedGEMM(transA, + transB, + ins_num, + out_dim, + in_dim, + alpha, + in_data, + w_help_data, + beta, + out_data, + slot_pairs_num, + strideA, + strideB); + add_bias(ctx.cuda_device_context().stream(), + out_data, + slot_pairs_num, + ins_num, + out_dim, + bias_data); + return; + } if (batchcount > 0) { auto* input = ctx.Input("Input"); auto* w = ctx.Input("W"); diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index 8425dcb521ab6..4dc83c9717ae7 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -36,25 +36,33 @@ using framework::ConvSearchCache; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -// As the basic for SearchAlgorithm struct. -template -struct SearchAlgorithm {}; - // As the container of searchAlgorithm::Find() result. template struct SearchResult { SearchResult() {} + explicit SearchResult(AlgoT a) : algo(a) {} + explicit SearchResult(AlgoT a, float t, size_t size) + : algo(a), time(t), workspace_size(size) {} AlgoT algo = static_cast(0); float time = -1.f; size_t workspace_size = 0; + bool exhaustive_search = false; }; template static std::ostream& operator<<(std::ostream& out, const std::vector& v) { out << "["; - for (auto const& tmp : v) out << tmp << ","; + bool is_first = true; + for (auto const& tmp : v) { + if (is_first) { + out << tmp; + is_first = false; + } else { + out << ", " << tmp; + } + } out << "]"; return out; } @@ -76,28 +84,50 @@ struct ConvArgsBase { // dilations std::vector d; + // groups + int group; + + // data foramt + DataLayout data_layout; + ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w, const framework::Tensor* o, const std::vector s, const std::vector p, const std::vector d, - DataT dtype) - : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {} + DataT dtype, + int g, + DataLayout layout) + : x(x), + w(w), + o(o), + s(s), + p(p), + d(d), + cudnn_dtype(dtype), + group(g), + data_layout(layout) {} template - size_t GetCacheKey() const { + phi::autotune::ConvCacheKey Convert2ConvCacheKey() const { auto x_shape = phi::vectorize(x->dims()); auto w_shape = phi::vectorize(w->dims()); VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape - << ", strides=" << s << ", paddings=" << p << ", dilations=" << d; - return phi::autotune::ConvKey( + << ", strides=" << s << ", paddings=" << p << ", dilations=" << d + << ", data=" << paddle::experimental::CppTypeToDataType::Type() + << ", group=" << group + << ", data layout=" << static_cast(data_layout); + + return phi::autotune::ConvCacheKey( x_shape, w_shape, p, s, d, - paddle::experimental::CppTypeToDataType::Type()); + paddle::experimental::CppTypeToDataType::Type(), + group, + static_cast(data_layout)); } }; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 1b8d421d133f1..2fa1683833c33 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -146,79 +146,21 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -static void SetConvMathType(const phi::GPUContext& ctx, - cudnnDataType_t dtype, - const platform::ConvolutionDescriptor& cdesc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), 
CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#if CUDA_VERSION >= 11000 -#if CUDNN_VERSION_MIN(8, 1, 0) - } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); -#endif // CUDNN_VERSION_MIN(8, 1, 0) - } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; - } -#endif -} +template +struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching // modes, namely: deterministic, heuristic and exhaustive_search mode. // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvForward; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. - size_t key = args.GetCacheKey(); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvForward"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionFwdAlgo_t algo) { @@ -235,9 +177,10 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(static_cast(1)); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = GetWorkspaceSize(args, static_cast(1)); + return SearchResult(static_cast(1), -1.0, workspace_size); } // Heuristic search mode, calling the cudnnGetXxxAlgorithm. 
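// Illustrative sketch of the workspace-limited selection that
// ChooseAlgoByWorkspace performs: among the perf results reported by cuDNN,
// keep the fastest candidate whose workspace fits under the limit. PerfEntry
// and PickAlgoUnderLimit are hypothetical stand-ins for
// cudnnConvolutionFwdAlgoPerf_t and the real helper.
#include <cstddef>
#include <vector>

struct PerfEntry {
  int algo = 0;               // algorithm enum value
  float time = -1.f;          // negative time marks an unusable result
  std::size_t workspace = 0;  // bytes required by this algorithm
};

// Returns the index of the chosen entry, or -1 if nothing fits.
int PickAlgoUnderLimit(const std::vector<PerfEntry>& results,
                       std::size_t workspace_limit) {
  int best = -1;
  float best_time = 0.f;
  for (std::size_t i = 0; i < results.size(); ++i) {
    const PerfEntry& r = results[i];
    if (r.time < 0.f || r.workspace > workspace_limit) {
      continue;
    }
    if (best < 0 || r.time < best_time) {
      best = static_cast<int>(i);
      best_time = r.time;
    }
  }
  return best;
}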
@@ -266,6 +209,10 @@ struct SearchAlgorithm { if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 + VLOG(4) << GetPerfResultString("[Heuristic] FwdAlgo Perf result", + perf_results, + actual_perf_count, + workspace_size_limit); // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); @@ -298,6 +245,7 @@ struct SearchAlgorithm { workspace_size_limit, &(result.algo))); #endif + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -343,6 +291,7 @@ struct SearchAlgorithm { ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -380,49 +329,13 @@ struct SearchAlgorithm { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardData; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
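// Small sketch of the SearchResult change visible in this refactor: the
// deterministic path now also reports the workspace required by its fixed
// algorithm instead of leaving workspace_size at 0, so a complete
// (algo, workspace) pair can be cached. SearchResultSketch, DeterministicResult
// and query_workspace are illustrative names; the real code queries
// cudnnGetConvolution*WorkspaceSize.
#include <cstddef>

template <typename AlgoT>
struct SearchResultSketch {
  AlgoT algo{};
  float time = -1.f;              // unmeasured in deterministic mode
  std::size_t workspace_size = 0;
};

template <typename AlgoT, typename WorkspaceFn>
SearchResultSketch<AlgoT> DeterministicResult(AlgoT fixed_algo,
                                              WorkspaceFn query_workspace) {
  SearchResultSketch<AlgoT> result;
  result.algo = fixed_algo;
  result.workspace_size = query_workspace(fixed_algo);  // bytes for this algo
  return result;
}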
- size_t key = args.GetCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardData"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdDataAlgo_t algo) { @@ -439,9 +352,12 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = + GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); + return SearchResult( + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size); } static SearchResult FindAlgoHeuristic(const ConvArgs& args, @@ -513,7 +429,7 @@ struct SearchAlgorithm { workspace_size_limit, &(result.algo))); #endif - + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -559,6 +475,7 @@ struct SearchAlgorithm { ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -594,50 +511,13 @@ struct SearchAlgorithm { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardFilter; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - platform::CUDAGraphCaptureModeGuard guard; - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
- size_t key = args.GetCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); - if (cache.Find(key)) { - result.algo = static_cast(cache.Get(key)); - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - cache.Set(key, static_cast(result.algo)); - } else { - result = FindAlgoHeuristic(args, ctx); - } - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo << ", workspace=" - << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardFilter"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { @@ -655,9 +535,12 @@ struct SearchAlgorithm { return workspace_size; } - private: - static SearchResult FindAlgoDeterministic() { - return SearchResult(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + protected: + static SearchResult FindAlgoDeterministic(const ConvArgs& args) { + auto workspace_size = + GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); + return SearchResult( + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size); } static SearchResult FindAlgoHeuristic(const ConvArgs& args, @@ -718,6 +601,7 @@ struct SearchAlgorithm { &(result.algo))); #endif + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -786,6 +670,7 @@ struct SearchAlgorithm { ChooseAlgo(perf_results, workspace_size_limit, &result); } + result.workspace_size = GetWorkspaceSize(args, result.algo); return result; } @@ -867,5 +752,103 @@ struct SearchAlgorithm { } }; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; + + template + static SearchResult Find(const phi::GPUContext& ctx, + const ConvArgs& args, + bool exhaustive_search, + bool deterministic, + bool enable_autotune = true) { + SearchResult result; + bool use_autotune = false; + auto dtype = platform::CudnnDataType::type; + SetConvMathType(ctx, dtype, args.cdesc); + + if (deterministic) { + result = SearchAlgorithmBase::FindAlgoDeterministic(args); + } else { + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, run heuristic (default) before + // auto-tune process, run exhaustive_search during mentioned process. + // Auto tune is only enabled between specified range. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + auto key = args.Convert2ConvCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( + SearchAlgorithmBase::kAlgoType); + bool find_in_cache = cache.Find(key); + if (find_in_cache) { + auto t = cache.Get(key); + result.algo = static_cast(t.algo); + result.workspace_size = t.workspace_size; + result.exhaustive_search = t.exhaustive_search; + } + if (!result.exhaustive_search) { + bool need_update_cache = false; + // In conv2d_tranpose, enable_autotune is set to false because some + // algorithm picked by exhaustive search method produce wrong result. + use_autotune = enable_autotune && + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + // Once autotune is enabled, the autotuned result can rewrite the + // previous result in cache found by heuristic method. 
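// Condensed sketch of the caching policy in the unified Find() above, assuming
// a plain std::map keyed by a string in place of phi::autotune::ConvCacheKey:
// a heuristic result is cached but may later be overwritten by an
// exhaustive/autotuned one, while an exhaustive entry is treated as final.
// CachedAlgo and FindWithCache are hypothetical names.
#include <cstddef>
#include <functional>
#include <map>
#include <string>

struct CachedAlgo {
  int algo = 0;
  std::size_t workspace_size = 0;
  bool exhaustive_search = false;  // true once an exhaustive result is stored
};

CachedAlgo FindWithCache(const std::string& key, bool exhaustive_or_autotune,
                         std::map<std::string, CachedAlgo>* cache,
                         const std::function<CachedAlgo()>& exhaustive_search,
                         const std::function<CachedAlgo()>& heuristic_search) {
  CachedAlgo result;
  auto it = cache->find(key);
  const bool found = (it != cache->end());
  if (found) {
    result = it->second;
  }
  if (!result.exhaustive_search) {   // exhaustive entries are never redone
    if (exhaustive_or_autotune) {
      result = exhaustive_search();
      result.exhaustive_search = true;
      (*cache)[key] = result;        // may overwrite a heuristic entry
    } else if (!found) {
      result = heuristic_search();   // cached so later calls skip the search
      (*cache)[key] = result;
    }
  }
  return result;
}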
+ result = + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + args, ctx); + need_update_cache = true; + } else if (!find_in_cache) { + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + need_update_cache = true; + } + if (need_update_cache) { + phi::autotune::ConvAutoTuneResult node( + static_cast(result.algo), + result.workspace_size, + exhaustive_search || use_autotune); + cache.Set(key, node); + } + } + } + VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() + << "] exhaustive_search=" << exhaustive_search + << ", use_autotune=" << use_autotune + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo + << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; + return result; + } + + static void SetConvMathType(const phi::GPUContext& ctx, + cudnnDataType_t dtype, + const platform::ConvolutionDescriptor& cdesc) { +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "Enable Tensor Core for FLOAT16"; +#if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + VLOG(5) << "Enable Tensor Core for BFLOAT16"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { + VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_FMA_MATH)); +#endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_DEFAULT_MATH)); + } +#endif + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cross_norm_hadamard_op.cu b/paddle/fluid/operators/cross_norm_hadamard_op.cu index df643de164ffe..4594421565770 100644 --- a/paddle/fluid/operators/cross_norm_hadamard_op.cu +++ b/paddle/fluid/operators/cross_norm_hadamard_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
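The SetConvMathType helper added above selects the cuDNN math mode from the device compute capability, the data type, and the TF32 flag. A minimal stand-alone restatement of that decision follows, assuming simplified enums in place of cudnnDataType_t / cudnnMathType_t and with no actual cudnnSetConvolutionMathType call; it is a sketch of the branch order only.

#include <cstdio>

enum class DType { kFloat32, kFloat16, kBFloat16 };
enum class MathMode { kDefault, kTensorOp, kFmaOnly };

// Mirrors the branches above: FP16 on SM70+ and BF16 on SM80+ get Tensor Core
// math; FP32 with TF32 disabled is pinned to FMA; everything else uses the
// default math mode.
MathMode PickConvMathMode(int compute_capability, DType dtype, bool allow_tf32) {
  if (compute_capability >= 70 && dtype == DType::kFloat16) return MathMode::kTensorOp;
  if (compute_capability >= 80 && dtype == DType::kBFloat16) return MathMode::kTensorOp;
  if (dtype == DType::kFloat32 && !allow_tf32) return MathMode::kFmaOnly;
  return MathMode::kDefault;
}

int main() {
  std::printf("%d\n", static_cast<int>(PickConvMathMode(80, DType::kBFloat16, true)));
  return 0;
}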
*/ -#include #include #include #include "paddle/fluid/framework/eigen.h" diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.cc b/paddle/fluid/operators/fused/fused_seq_tensor_op.cc new file mode 100644 index 0000000000000..7430d0d32ca37 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.cc @@ -0,0 +1,132 @@ +#include "paddle/fluid/operators/fused/fused_seq_tensor_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include + +namespace paddle { +namespace operators { + +class FusedSeqTensorOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasInput("ADInput"), "ADInput", "ADInput", "FusedSeqTensorOp"); + + OP_INOUT_CHECK(ctx->HasOutput("DINOut"), "DINOut", "DINOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("MaskOut"), "MaskOut", "MaskOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("SideInfoOut"), "SideInfoOut", "SideInfoOut", "FusedSeqTensorOp"); + OP_INOUT_CHECK(ctx->HasOutput("ADSlotSessionOut"), "ADSlotSessionOut", "ADSlotSessionOut", "FusedSeqTensorOp"); + + const framework::DDim input_dims = ctx->GetInputDim("Input"); + const framework::DDim ad_input_dims = ctx->GetInputDim("ADInput"); + + auto ad_slot_num = ctx->Attrs().Get("ad_slot_num"); + auto batch_count = ctx->Attrs().Get("batch_count"); + auto max_length = ctx->Attrs().Get("max_length"); + auto slot_num = ctx->Attrs().Get("slot_num"); + auto fea_emb_dim = ctx->Attrs().Get("fea_emb_dim"); + auto ad_slot_offset = ctx->Attrs().Get("ad_slot_offset"); + + int64_t one_ins_dim = batch_count * max_length * slot_num * fea_emb_dim; + PADDLE_ENFORCE_EQ( + input_dims[1], one_ins_dim, + platform::errors::InvalidArgument( + "input dims error, %ld != %ld", input_dims[1], one_ins_dim)); + + int64_t one_ins_ad_dim = batch_count * 1 * ad_slot_num * fea_emb_dim; + PADDLE_ENFORCE_EQ( + ad_input_dims[1], one_ins_ad_dim, + platform::errors::InvalidArgument( + "input dims error, %ld != %ld", ad_input_dims[1], one_ins_ad_dim)); + PADDLE_ENFORCE_LT( + ad_slot_num, slot_num, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] > slot_num [%ld]", ad_slot_num, slot_num)); + PADDLE_ENFORCE_GT( + ad_slot_num, 0, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] <= 0", ad_slot_num)); + PADDLE_ENFORCE_LE( + ad_slot_offset, slot_num - 1, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] > slot_num - 1 [%ld]", ad_slot_offset, slot_num)); + PADDLE_ENFORCE_GE( + ad_slot_offset, 0, + platform::errors::InvalidArgument( + "ad_slot_offset [%ld] < 0", ad_slot_offset)); + if (ad_slot_offset != 0) { + PADDLE_ENFORCE_EQ( + ad_slot_num + ad_slot_offset, slot_num, + platform::errors::InvalidArgument( + "ad_slot_num [%ld] + ad_slot_offset [%ld] != slot_num [%ld]", ad_slot_num, ad_slot_offset, slot_num)); + } + + auto ins_num = input_dims[0]; + if (batch_count > 1) { + ctx->SetOutputDim("DINOut", {batch_count, ins_num * max_length, ad_slot_num * fea_emb_dim * 4}); + ctx->SetOutputDim("MaskOut", {batch_count, ins_num, max_length}); + ctx->SetOutputDim("SideInfoOut", {batch_count, ins_num * max_length, (slot_num - ad_slot_num) * fea_emb_dim}); + ctx->SetOutputDim("ADSlotSessionOut", {batch_count, ins_num * max_length, ad_slot_num, fea_emb_dim}); + } else { + ctx->SetOutputDim("DINOut", {ins_num, max_length, ad_slot_num * fea_emb_dim * 4}); + ctx->SetOutputDim("MaskOut", 
{ins_num, max_length}); + ctx->SetOutputDim("SideInfoOut", {ins_num, max_length, (slot_num - ad_slot_num) * fea_emb_dim}); + ctx->SetOutputDim("ADSlotSessionOut", {ins_num, max_length, ad_slot_num * fea_emb_dim}); + } + ctx->ShareLoD("Input", "DINOut"); + ctx->ShareLoD("Input", "MaskOut"); + ctx->ShareLoD("Input", "SideInfoOut"); + ctx->ShareLoD("Input", "ADSlotSessionOut"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class FusedSeqTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "The input tensors of operator."); + AddInput("ADInput", + "The input ad tensors of operator. "); + AddOutput("DINOut", + "DINOut"); + AddOutput("MaskOut", + "MaskOut"); + AddOutput("SideInfoOut", + "SideInfoOut"); + AddOutput("ADSlotSessionOut", + "ADSlotSessionOut"); + + AddAttr("batch_count", "(int, default 1)"); + AddAttr("max_length", "(int, default 1)"); + AddAttr("slot_num", "(int, default 1)"); + AddAttr("fea_emb_dim", "(int, default 1)"); + AddAttr("ad_slot_num", "(int, default 1)"); + AddAttr("ad_slot_offset", "(int, default 1)"); + + AddComment(R"DOC( +Fuse seq tensor. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(fused_seq_tensor, + ops::FusedSeqTensorOp, ops::FusedSeqTensorOpMaker); + +REGISTER_OP_CPU_KERNEL( + fused_seq_tensor, + ops::FusedSeqTensorCPUKernel); diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.cu b/paddle/fluid/operators/fused/fused_seq_tensor_op.cu new file mode 100644 index 0000000000000..8210cd43808c3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.cu @@ -0,0 +1,290 @@ +#include +#include +#include +#include "paddle/fluid/operators/fused/fused_seq_tensor_op.h" // don't remove this +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +template +__global__ void cal_ad_slot_session_kernel(const T* input, + const T* ad_input, + T* din_output, + T* ad_slot_session_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t ad_slot_num, + const size_t ad_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + const size_t one_slot_dim = max_length * fea_emb_dim; + const size_t one_seq_dim = slot_num * one_slot_dim; + const size_t ad_seq_dim = ad_slot_num * one_slot_dim; + + const size_t piece_of_ad_seq_dim = ad_slot_num * fea_emb_dim; + for (size_t idx = threadIdx.x; idx < piece_of_ad_seq_dim; idx += blockDim.x) { + size_t slot_idx = idx / fea_emb_dim + ad_slot_offset; + size_t out_slot_idx = idx / fea_emb_dim; + size_t fea_dim_idx = idx % fea_emb_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * one_seq_dim) + batch_idx * one_seq_dim + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t ad_fea_begin_idx = + ins_idx * (1 * batch_num * piece_of_ad_seq_dim) + batch_idx * piece_of_ad_seq_dim + + out_slot_idx * fea_emb_dim; + + const T input_val = input[input_fea_begin_idx + 
fea_dim_idx]; + const T ad_val = ad_input[ad_fea_begin_idx + fea_dim_idx]; + + size_t fea_concat_start_idx = + batch_idx * (ins_num * ad_seq_dim * 4) + ins_idx * (ad_seq_dim * 4) + + fea_idx * (piece_of_ad_seq_dim * 4) + out_slot_idx * fea_emb_dim; + + din_output[fea_concat_start_idx + fea_dim_idx] = input_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim] = ad_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim * 2] = input_val - ad_val; + din_output[fea_concat_start_idx + fea_dim_idx + piece_of_ad_seq_dim * 3] = input_val * ad_val; + + size_t ad_slot_session_out_start_idx = + batch_idx * (ins_num * ad_seq_dim) + ins_idx * ad_seq_dim + + fea_idx * piece_of_ad_seq_dim + out_slot_idx * fea_emb_dim; + ad_slot_session_output[ad_slot_session_out_start_idx + fea_dim_idx] = input_val; + } +} + +template +__global__ void cal_sideinfo_kernel(const T* input, + T* side_info_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t sideinfo_slot_num, + const size_t sideinfo_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + const size_t one_slot_dim = max_length * fea_emb_dim; + const size_t input_one_seq_dim = slot_num * one_slot_dim; + const size_t sideinfo_seq_dim = sideinfo_slot_num * one_slot_dim; + + const size_t piece_of_sideinfo_seq_dim = sideinfo_slot_num * fea_emb_dim; + for (size_t idx = threadIdx.x; idx < piece_of_sideinfo_seq_dim; idx += blockDim.x) { + size_t out_slot_idx = idx / fea_emb_dim; + size_t slot_idx = out_slot_idx + sideinfo_slot_offset; + size_t fea_dim_idx = idx % fea_emb_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * input_one_seq_dim) + batch_idx * input_one_seq_dim + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t fea_transpose_start_idx = + batch_idx * (ins_num * sideinfo_seq_dim) + ins_idx * sideinfo_seq_dim + + fea_idx * (sideinfo_slot_num * fea_emb_dim) + out_slot_idx * fea_emb_dim; + + side_info_output[fea_transpose_start_idx + fea_dim_idx] = input[input_fea_begin_idx + fea_dim_idx]; + } +} + +template +__global__ void cal_sideinfo_kernel_without_loop(const T* input, + T* side_info_output, + const size_t batch_num, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim, + const size_t sideinfo_slot_num, + const size_t sideinfo_slot_offset) { + + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + size_t slot_idx = threadIdx.y + sideinfo_slot_offset; + size_t out_slot_idx = threadIdx.y; + size_t fea_dim_idx = threadIdx.x; + + const size_t one_slot_dim = max_length * fea_emb_dim; + size_t input_one_seq_dim = slot_num * one_slot_dim; + size_t out_one_seq_dim = sideinfo_slot_num * one_slot_dim; + + size_t input_fea_begin_idx = ins_idx * (batch_num * input_one_seq_dim) + batch_idx * (input_one_seq_dim) + + slot_idx * one_slot_dim + fea_idx * fea_emb_dim; + + size_t fea_transpose_start_idx = + batch_idx * (ins_num * out_one_seq_dim) + ins_idx * out_one_seq_dim + + fea_idx * (sideinfo_slot_num * fea_emb_dim) + out_slot_idx * fea_emb_dim; + + side_info_output[fea_transpose_start_idx + fea_dim_idx] = input[input_fea_begin_idx + fea_dim_idx]; +} + +template +__device__ void warpReduce(volatile T* cache, int tid) { + cache[tid] += cache[tid+32]; + cache[tid] += cache[tid+16]; + cache[tid] += cache[tid+8]; + cache[tid] += cache[tid+4]; + cache[tid] += 
cache[tid+2]; + cache[tid] += cache[tid+1]; +} + +#define THREAD_PER_BLOCK 128 +template +__global__ void reduce_sum_max_length(const T* input, + T* mask_output, + const size_t batch_count, + const size_t ins_num, + const size_t slot_num, + const size_t max_length, + const size_t fea_emb_dim) { + size_t batch_idx = blockIdx.x; + size_t ins_idx = blockIdx.y; + size_t fea_idx = blockIdx.z; + + size_t data_len_per_block = slot_num * fea_emb_dim; + + __shared__ T sdata[THREAD_PER_BLOCK]; + //each thread loads one element from global memory to shared mem + size_t input_start_idx = ins_idx * (batch_count * slot_num * max_length * fea_emb_dim) + + batch_idx * (slot_num * max_length * fea_emb_dim); + + size_t tid = threadIdx.x; + // memset shared mem + sdata[tid] = 0; + for (size_t idx = tid; idx < data_len_per_block; idx += blockDim.x) { + size_t slot_idx = idx / fea_emb_dim; + size_t fea_dim_idx = idx % fea_emb_dim; + size_t offset = slot_idx * (max_length * fea_emb_dim) + fea_idx * fea_emb_dim + fea_dim_idx; + sdata[tid] += input[input_start_idx + offset]; + } + __syncthreads(); + + for(size_t s = blockDim.x / 2; s > 32; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + // When s < 32, we have only one warp left, no need to sync threads, no need to if (tid < s) + if(tid < 32) { + warpReduce(sdata, tid); + } + + if(tid == 0) { + // [batch_count, ins_num, max_length] + size_t out_idx = batch_idx * (ins_num * max_length) + + ins_idx * (max_length) + + fea_idx; + if (fabs(sdata[tid]) > 1e-8) { + mask_output[out_idx] = 1; + } else { + mask_output[out_idx] = 0; + } + } +} + +template +class FusedSeqTensorCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + PADDLE_ENFORCE_NOT_NULL(input, platform::errors::NotFound("Input not found")); + auto ad_input = ctx.Input("ADInput"); + PADDLE_ENFORCE_NOT_NULL(ad_input, platform::errors::NotFound("Input not found")); + + auto din_output = ctx.Output("DINOut"); + PADDLE_ENFORCE_NOT_NULL(din_output, + platform::errors::NotFound("DINOut not found")); + T* din_output_data = din_output->mutable_data(ctx.GetPlace()); + auto mask_output = ctx.Output("MaskOut"); + PADDLE_ENFORCE_NOT_NULL(mask_output, + platform::errors::NotFound("MaskOut not found")); + T* mask_output_output_data = mask_output->mutable_data(ctx.GetPlace()); + auto side_info_output = ctx.Output("SideInfoOut"); + PADDLE_ENFORCE_NOT_NULL(side_info_output, + platform::errors::NotFound("Output not found")); + T* side_info_output_data = + side_info_output->mutable_data(ctx.GetPlace()); + auto ad_slot_session_output = + ctx.Output("ADSlotSessionOut"); + PADDLE_ENFORCE_NOT_NULL(ad_slot_session_output, + platform::errors::NotFound("Output not found")); + T* ad_slot_session_output_data = + ad_slot_session_output->mutable_data(ctx.GetPlace()); + + auto batch_count = ctx.Attr("batch_count"); + auto max_length = ctx.Attr("max_length"); + auto slot_num = ctx.Attr("slot_num"); + auto fea_emb_dim = ctx.Attr("fea_emb_dim"); + auto ad_slot_num = ctx.Attr("ad_slot_num"); + auto ad_slot_offset = ctx.Attr("ad_slot_offset"); + + auto& dev_ctx = ctx.template device_context(); + auto stream = ctx.cuda_device_context().stream(); + + auto input_dims = input->dims(); + size_t ins_num = input_dims[0]; + + dim3 ad_grid(batch_count, ins_num, max_length); + dim3 ad_block(std::min(static_cast(1024), static_cast(ad_slot_num * fea_emb_dim))); + + cal_ad_slot_session_kernel<<>>( + 
input->data(), ad_input->data(), din_output_data, + ad_slot_session_output_data, + batch_count, ins_num, slot_num, max_length, fea_emb_dim, + ad_slot_num, ad_slot_offset); + + size_t sideinfo_slot_offset = 0; + if (ad_slot_offset == 0) { + sideinfo_slot_offset = ad_slot_num; + } + size_t fea_padding_dim = ((fea_emb_dim + 31) / 32) * 32; + size_t sideinfo_slot_num = slot_num - ad_slot_num; + + if (sideinfo_slot_num * fea_emb_dim < 1024) { + dim3 sideinfo_grid(batch_count, ins_num, max_length); + dim3 sideinfo_block(fea_emb_dim, sideinfo_slot_num); + cal_sideinfo_kernel_without_loop<<>>( + input->data(), side_info_output_data, batch_count, ins_num, + slot_num, max_length, fea_emb_dim, + sideinfo_slot_num, sideinfo_slot_offset); + } else { + dim3 sideinfo_grid(batch_count, ins_num, max_length); + dim3 sideinfo_block(sideinfo_slot_num * fea_emb_dim); + cal_sideinfo_kernel<<>>( + input->data(), side_info_output_data, batch_count, ins_num, + slot_num, max_length, fea_emb_dim, + sideinfo_slot_num, sideinfo_slot_offset); + } + + dim3 reduce_grid(batch_count, ins_num, max_length); + dim3 reduce_block(THREAD_PER_BLOCK); + reduce_sum_max_length<<>>( + input->data(), mask_output_output_data, batch_count, + ins_num, slot_num, max_length, fea_emb_dim); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + fused_seq_tensor, + ops::FusedSeqTensorCUDAKernel); \ No newline at end of file diff --git a/paddle/fluid/operators/fused/fused_seq_tensor_op.h b/paddle/fluid/operators/fused/fused_seq_tensor_op.h new file mode 100644 index 0000000000000..d7bbadd72e3b5 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_seq_tensor_op.h @@ -0,0 +1,16 @@ +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class FusedSeqTensorCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext&) const override { + PADDLE_THROW(platform::errors::Unimplemented("fused_seq_tensor supports only GPU")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc index 474863cad18b9..4b1eda15dfcfc 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cc @@ -48,6 +48,7 @@ class FusedSeqpoolCVMOp : public framework::OperatorWithKernel { bool clk_filter = ctx->Attrs().Get("clk_filter"); const int embed_thres_size = ctx->Attrs().Get("embed_thres_size"); const int embedx_concate_size = ctx->Attrs().Get("embedx_concate_size"); + //const bool fill_zero = ctx->Attrs().Get("fill_zero"); // need filter quant_ratio more than zero if (ctx->Attrs().Get("need_filter")) { @@ -142,6 +143,7 @@ class FusedSeqpoolCVMOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("embed_thres_size", "(int, default 0)").SetDefault(0); AddAttr("embedx_concate_size", "(int, default 1)").SetDefault(1); AddAttr("embedx_concate_filter", "(bool, default false)").SetDefault(false); + AddAttr("fill_zero", "(bool, default true)").SetDefault(true); AddAttr("fix_ctr_to_click", "(bool, default false)").SetDefault(false); AddComment(R"DOC( diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index d7ba888aa1dd5..76c02b4a6c93f 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ 
b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -177,7 +177,7 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( size_t **lods_values, const int batch_size, const int embedding_size, const float pad_value, const int cvm_offset, const float show_coeff, const float clk_coeff, const float threshold, const int quant_ratio, - const float embed_threshold, const int embedx_concate_size, bool embedx_concate_filter) { + const float embed_threshold, const int embedx_concate_size, bool embedx_concate_filter, bool fill_zero) { CUDA_KERNEL_LOOP(i, N) { int key = i / embedding_size; int offset = i % embedding_size; // embedx id @@ -188,11 +188,17 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( double val = pad_value; int concate_index = 0; + bool val_use_zero = false; for (auto k = start; k < end; ++k) { + val_use_zero = false; T &show = *(input_values[x] + k * embedding_size); T &click = *(input_values[x] + k * embedding_size + 1); - if (!embedx_concate_filter&&(show - click) * show_coeff + click * clk_coeff < threshold) { - continue; + if (embedx_concate_filter && (show - click) * show_coeff + click * clk_coeff < threshold) { + if (fill_zero) { + val_use_zero = true; + } else { + continue; + } } T &embedw = *(input_values[x] + k * embedding_size + cvm_offset); T embedx_weight_score = 0.0; @@ -202,16 +208,28 @@ __global__ void FusedSeqpoolKernelEmbedQuantFilterEmbedxConcate( } embedx_weight_score = std::sqrt(embedx_weight_score) + std::abs(embedw); if (embedx_concate_filter && embedx_weight_score < embed_threshold) { - continue; + if (fill_zero) { + val_use_zero = true; + } else { + continue; + } } if (offset < cvm_offset) { // show & click - val = *(input_values[x] + k * embedding_size + offset); + if (val_use_zero) { + val = pad_value; + } else { + val = *(input_values[x] + k * embedding_size + offset); + } } else { - val = ((static_cast( + if (val_use_zero) { + val = pad_value; + } else { + val = ((static_cast( *(input_values[x] + k * embedding_size + offset) * quant_ratio + 0.5)) / static_cast(quant_ratio)); + } } if (concate_index == embedx_concate_size) { *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + (embedx_concate_size-1) * embedding_size + offset) += val; @@ -356,7 +374,9 @@ void FusedSeqpoolCVM(const paddle::platform::Place &place, float clk_coeff, float threshold, float embed_threshold, const int quant_ratio, const bool clk_filter, const int embed_thres_size, const int embedx_concate_size, - bool embedx_concate_filter, bool fix_ctr_to_click) { + bool embedx_concate_filter, + bool fill_zero, + bool fix_ctr_to_click) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -399,7 +419,7 @@ void FusedSeqpoolCVM(const paddle::platform::Place &place, 0, stream>>>( N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, embedding_size, padding_value, cvm_offset, show_coeff, clk_coeff, - threshold, quant_ratio, embed_threshold, embedx_concate_size, embedx_concate_filter); + threshold, quant_ratio, embed_threshold, embedx_concate_size, embedx_concate_filter, fill_zero); } } else if (need_filter) { // quant need filter FusedSeqpoolKernelQuantFilter<< { const int embed_thres_size = ctx.Attr("embed_thres_size"); const int embedx_concate_size = ctx.Attr("embedx_concate_size"); bool embedx_concate_filter = ctx.Attr("embedx_concate_filter"); + bool fill_zero = ctx.Attr("fill_zero"); bool fix_ctr_to_click = ctx.Attr("fix_ctr_to_click"); framework::GPULodVector 
gpu_lods[slot_size]; @@ -742,8 +763,8 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { embedding_size, padding_value, use_cvm, cvm_offset, need_filter, embed_threshold_filter, show_coeff, clk_coeff, threshold, embed_threshold, quant_ratio, clk_filter, - embed_thres_size, embedx_concate_size, embedx_concate_filter, - fix_ctr_to_click); + embed_thres_size, embedx_concate_size, embedx_concate_filter, + fill_zero, fix_ctr_to_click); } }; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu index e552d60a24c8b..0e01eb1785132 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_with_conv_op.cu @@ -52,6 +52,74 @@ __global__ void FusedSeqpoolWithConvKernelNormal(const size_t N, T **input_value *(seqpool_output_values[x] + y * embedding_size + offset) = val; } } + +// Filter +template +__global__ void FusedSeqpoolWithConvKernelFilter(const size_t N, T **input_values, + T **seqpool_output_values, + size_t **lods_values, + const int batch_size, + const int embedding_size, + const float pad_value, + const float show_coeff, + const float clk_coeff, + const float threshold) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + + double val = pad_value; + for (auto k = start; k < end; ++k) { + T &show = *(input_values[x] + k * embedding_size); + T &click = *(input_values[x] + k * embedding_size + 1); + if ((show - click) * show_coeff + click * clk_coeff < threshold) { + continue; + } + val += *(input_values[x] + k * embedding_size + offset); + } + *(seqpool_output_values[x] + y * embedding_size + offset) = val; + } +} + +// normal & expand slot's feasign +template +__global__ void FusedSeqpoolWithConvKernelNormalEmbedxConcate(const size_t N, T **input_values, + T **seqpool_output_values, + size_t **lods_values, + const int batch_size, + const int embedding_size, + const float pad_value, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + + double val = pad_value; + for (auto k = start; k < end; ++k) { + val = *(input_values[x] + k * embedding_size + offset); + if (concate_index == embedx_concate_size) { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + (embedx_concate_size-1) * embedding_size + offset) += val; + } else { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + concate_index * embedding_size + offset) = val; + concate_index += 1; + } + } + while (concate_index < embedx_concate_size) { + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + concate_index * embedding_size + offset) = pad_value; + concate_index += 1; + } + } +} + // join only need show input template __global__ void FusedCVMWithConvKernelNormal(const size_t N, T **output_values, @@ -81,6 +149,38 @@ __global__ void FusedCVMWithConvKernelNormal(const size_t N, T **output_values, } } +// join only need show input, and expand slot's feasign +template +__global__ void FusedCVMWithConvKernelNormalConcate(const size_t N, T **output_values, + T 
**seqpool_output_values, + const int batch_size, + const int embedding_size, + const int noclk_embedding_size, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / noclk_embedding_size; + int offset = i % noclk_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + for (int k=0; k < embedx_concate_size; k++) { + if (offset == 0) { // show + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size) + 1); + } else if (offset == 1) { // click + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) + 1); + } else if (offset == 2) { // conv + *(output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 2) = + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 2) + 1) - + log(*(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + 1) + 1); + } else { // filter show, offset - 1 + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * embedding_size + offset) = + *(seqpool_output_values[x] + y * embedding_size * embedx_concate_size + k * embedding_size + offset); + } + } + } +} + // join only need show input template __global__ void FusedCVMWithConvKernelWithOutShow(const size_t N, T **output_values, @@ -109,6 +209,37 @@ __global__ void FusedCVMWithConvKernelWithOutShow(const size_t N, T **output_val } } +// join only need show input, and expand slot's feasign +template +__global__ void FusedCVMWithConvKernelWithOutShowConcate(const size_t N, T **output_values, + T **seqpool_output_values, + const int batch_size, + const int embedding_size, + const int noclk_embedding_size, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / noclk_embedding_size; + int offset = i % noclk_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + for (int k=0; k < embedx_concate_size; k++) { + if (offset == 0) { // show + // do nothing + } else if (offset == 1) { // click + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size) = + log(*(seqpool_output_values[x] + y * embedding_size + 1) + 1); + } else if (offset == 2) { // conv + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size + 1) = + log(*(seqpool_output_values[x] + y * embedding_size + 2) + 1) - + log(*(seqpool_output_values[x] + y * embedding_size + 1) + 1); + } else { // filter show, offset - 1 + *(output_values[x] + y * noclk_embedding_size * embedx_concate_size + k * noclk_embedding_size + offset) = + *(seqpool_output_values[x] + y * embedding_size + offset); + } + } + } +} + // update not need show click input template __global__ void FusedCVMWithConvKernelNoCVM(const size_t N, T **output_values, @@ -128,6 +259,28 @@ __global__ void FusedCVMWithConvKernelNoCVM(const size_t N, T **output_values, } } +// update not need show click input, expand slot's feasign +template +__global__ void FusedCVMWithConvKernelNoCVMConcate(const size_t N, T **output_values, + T **seqpool_output_values, + const int batch_size, + const int no_cvm_embedding_size, + const int cvm_offset, + const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / no_cvm_embedding_size; + 
int offset = i % no_cvm_embedding_size; + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + // no cvm + for (int k=0; k < embedx_concate_size; k++) { + *(output_values[x] + y * no_cvm_embedding_size * embedx_concate_size + k * no_cvm_embedding_size + offset) = + *(seqpool_output_values[x] + y * (no_cvm_embedding_size + cvm_offset) * embedx_concate_size + + k * (no_cvm_embedding_size + cvm_offset) + offset + cvm_offset); + } + } +} + template void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, const std::vector &input_data, @@ -136,7 +289,10 @@ void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, std::vector lods, const int batch_size, const int slot_num, const int embedding_size, const float padding_value, const bool use_cvm, - const int cvm_offset, bool show_filter) { + float need_filter, float show_coeff, + float clk_coeff, float threshold, + const int cvm_offset, bool show_filter, + const int embedx_concate_size) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -167,81 +323,169 @@ void FusedSeqpoolCVMWithConv(const paddle::platform::Place &place, size_t N = static_cast(batch_size * slot_num * embedding_size); // first sum pool - FusedSeqpoolWithConvKernelNormal<<>>( - N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, - embedding_size, padding_value); + if (embedx_concate_size == 1){ + if (need_filter) { //filter + FusedSeqpoolWithConvKernelFilter<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value, show_coeff, clk_coeff, threshold); + } else { //normal + FusedSeqpoolWithConvKernelNormal<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value); + } + } else { + FusedSeqpoolWithConvKernelNormalEmbedxConcate<<>>( + N, gpu_input_values, gpu_seqpool_output_values, lods_values, batch_size, + embedding_size, padding_value, embedx_concate_size); + } // second log if (use_cvm) { if (show_filter) { N = static_cast(batch_size * slot_num * (embedding_size - 1)); - FusedCVMWithConvKernelWithOutShow<<>>(N, gpu_output_values, - gpu_seqpool_output_values, batch_size, - embedding_size, embedding_size - 1); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelWithOutShow<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size - 1); + } else { + FusedCVMWithConvKernelWithOutShowConcate<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size - 1, embedx_concate_size); + } } else { - FusedCVMWithConvKernelNormal<<>>(N, gpu_output_values, - gpu_seqpool_output_values, batch_size, - embedding_size, embedding_size); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelNormal<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size); + } else { + FusedCVMWithConvKernelNormalConcate<<>>(N, gpu_output_values, + gpu_seqpool_output_values, batch_size, + embedding_size, embedding_size, embedx_concate_size); + } } } else { // not need show click input N = static_cast(batch_size * slot_num * (embedding_size - cvm_offset)); - FusedCVMWithConvKernelNoCVM<<>>( - N, gpu_output_values, gpu_seqpool_output_values, batch_size, - (embedding_size - cvm_offset), cvm_offset); + if (embedx_concate_size == 1) { + FusedCVMWithConvKernelNoCVM<<>>( + N, gpu_output_values, gpu_seqpool_output_values, batch_size, + (embedding_size - cvm_offset), cvm_offset); + } 
else { + FusedCVMWithConvKernelNoCVMConcate<<>>( + N, gpu_output_values, gpu_seqpool_output_values, batch_size, + (embedding_size - cvm_offset), cvm_offset, embedx_concate_size); + } + } +} + +// join grad +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithCVM( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * embedding_size + offset); + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + *(in_grads_values[x] + k * embedding_size + offset) = val; + } + } +} + +// join grad, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithCVMConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + int concate_index = 0; + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * embedding_size * embedx_concate_size + + embedding_size * concate_index + offset); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? concate_index : concate_index + 1; + } } } - // join grad - template - __global__ void FusedSeqpoolCVMWithConvGradKernelWithCVM( - const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, - size_t **lods_values, const int batch_size, const int embedding_size, - const int cvm_offset) { - CUDA_KERNEL_LOOP(i, N) { - int key = i / embedding_size; - int offset = i % embedding_size; // embedx offset - int x = key / batch_size; // slot id - int y = key % batch_size; // ins id - - T &val = (offset < cvm_offset) - ? *(cvm_values[x] + y * cvm_offset + offset) - : *(out_grads_values[x] + y * embedding_size + offset); - - auto &start = *(lods_values[x] + y); - auto &end = *(lods_values[x] + y + 1); - for (auto k = start; k < end; ++k) { - *(in_grads_values[x] + k * embedding_size + offset) = val; - } - } - } - - // join only show not has click - template - __global__ void FusedSeqpoolCVMWithConvGradKernelWithShow( - const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, - size_t **lods_values, const int batch_size, const int embedding_size, - const int cvm_offset) { - CUDA_KERNEL_LOOP(i, N) { - int key = i / embedding_size; - int offset = i % embedding_size; // embedx offset - int x = key / batch_size; // slot id - int y = key % batch_size; // ins id - - T &val = - (offset < cvm_offset) - ? 
*(cvm_values[x] + y * cvm_offset + offset) - : *(out_grads_values[x] + y * (embedding_size - 1) + offset - 1); - auto &start = *(lods_values[x] + y); - auto &end = *(lods_values[x] + y + 1); - for (auto k = start; k < end; ++k) { - *(in_grads_values[x] + k * embedding_size + offset) = val; - } - } - } +// join only show not has click +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithShow( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + T &val = + (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - 1) + offset - 1); + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + for (auto k = start; k < end; ++k) { + *(in_grads_values[x] + k * embedding_size + offset) = val; + } + } +} + +// join only show not has click, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelWithShowConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + for (auto k = start; k < end; ++k) { + T &val = + (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - 1) * embedx_concate_size + + (embedding_size - 1) * concate_index + offset - 1); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? concate_index : concate_index + 1; + } + } +} // update grad template @@ -267,6 +511,33 @@ __global__ void FusedSeqpoolCVMWithConvGradKernelNoCVM( } } } + +// update grad, expand slot's feasign +template +__global__ void FusedSeqpoolCVMWithConvGradKernelNoCVMConcate( + const size_t N, T **out_grads_values, T **in_grads_values, T **cvm_values, + size_t **lods_values, const int batch_size, const int embedding_size, + const int cvm_offset, const int embedx_concate_size) { + CUDA_KERNEL_LOOP(i, N) { + int key = i / embedding_size; + int offset = i % embedding_size; // embedx offset + int x = key / batch_size; // slot id + int y = key % batch_size; // ins id + + auto &start = *(lods_values[x] + y); + auto &end = *(lods_values[x] + y + 1); + int concate_index = 0; + for (auto k = start; k < end; ++k) { + T &val = (offset < cvm_offset) + ? *(cvm_values[x] + y * cvm_offset + offset) + : *(out_grads_values[x] + y * (embedding_size - cvm_offset) * embedx_concate_size + + (embedding_size - cvm_offset) * concate_index + offset - cvm_offset); + *(in_grads_values[x] + k * embedding_size + offset) = val; + concate_index = concate_index == (embedx_concate_size - 1) ? 
concate_index : concate_index + 1; + } + } +} + template void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, const std::vector &out_grads_data, @@ -275,7 +546,8 @@ void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, const std::vector &lods, const int batch_size, const int slot_num, const int embedding_size, const bool use_cvm, - const int cvm_offset, bool show_filter) { + const int cvm_offset, bool show_filter, + const int embedx_concate_size) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -307,23 +579,43 @@ void FusedSeqpoolCVMGradWithConv(const paddle::platform::Place &place, size_t N = static_cast(batch_size * slot_num * embedding_size); if (use_cvm) { if (show_filter) { + if (embedx_concate_size == 1) { FusedSeqpoolCVMWithConvGradKernelWithShow<<>>( N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, lods_values, batch_size, embedding_size, cvm_offset); - + } else { + FusedSeqpoolCVMWithConvGradKernelWithShowConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } else { + if (embedx_concate_size == 1) { FusedSeqpoolCVMWithConvGradKernelWithCVM<<>>( N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, lods_values, batch_size, embedding_size, cvm_offset); + } else { + FusedSeqpoolCVMWithConvGradKernelWithCVMConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } } else { // update grad - FusedSeqpoolCVMWithConvGradKernelNoCVM<<>>( - N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, - lods_values, batch_size, embedding_size, cvm_offset); + if (embedx_concate_size == 1) { + FusedSeqpoolCVMWithConvGradKernelNoCVM<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset); + } else { + FusedSeqpoolCVMWithConvGradKernelNoCVMConcate<<>>( + N, gpu_out_grads_values, gpu_in_grads_values, gpu_cvm_values, + lods_values, batch_size, embedding_size, cvm_offset, embedx_concate_size); + } } } @@ -344,9 +636,14 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { auto padding_value = ctx.Attr("pad_value"); auto use_cvm = ctx.Attr("use_cvm"); + bool need_filter = ctx.Attr("need_filter"); + float show_coeff = ctx.Attr("show_coeff"); + float clk_coeff = ctx.Attr("clk_coeff"); + float threshold = ctx.Attr("threshold"); const int cvm_offset = ctx.Attr("cvm_offset"); bool show_filter = ctx.Attr("show_filter"); - + const int embedx_concate_size = ctx.Attr("embedx_concate_size"); + framework::GPULodVector gpu_lods[slot_size]; auto place = ctx.GetPlace(); @@ -368,13 +665,13 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { if (use_cvm) { if (show_filter) { // show will filtered - output->Resize({batch_size, embedding_size - 1}); + output->Resize({batch_size, (embedding_size - 1) * embedx_concate_size}); } else { // show will filtered - output->Resize({batch_size, embedding_size}); + output->Resize({batch_size, embedding_size * embedx_concate_size}); } } else { - output->Resize({batch_size, embedding_size - cvm_offset}); + output->Resize({batch_size, (embedding_size - cvm_offset) * embedx_concate_size}); } output_data[i] = reinterpret_cast(output->mutable_data(ctx.GetPlace())); @@ -382,11 +679,13 @@ class FusedSeqpoolCVMWithConvCUDAKernel : public framework::OpKernel { 
seqpool_output_data[i] = reinterpret_cast(seqpool_outputs[i].mutable_data( - {batch_size, embedding_size}, ctx.GetPlace())); + {batch_size, embedding_size * embedx_concate_size}, ctx.GetPlace())); } FusedSeqpoolCVMWithConv(ctx.GetPlace(), input_data, output_data, seqpool_output_data, lods_data, batch_size, slot_size, - embedding_size, padding_value, use_cvm, cvm_offset, show_filter); + embedding_size, padding_value, use_cvm, + need_filter, show_coeff, clk_coeff, threshold, + cvm_offset, show_filter, embedx_concate_size); } }; @@ -402,6 +701,7 @@ class FusedSeqpoolCVMWithConvGradCUDAKernel : public framework::OpKernel { auto use_cvm = ctx.Attr("use_cvm"); const int cvm_offset = ctx.Attr("cvm_offset"); bool show_filter = ctx.Attr("show_filter"); + const int embedx_concate_size = ctx.Attr("embedx_concate_size"); const auto slot_size = in_grads.size(); std::vector out_grads_data(slot_size); @@ -436,7 +736,7 @@ class FusedSeqpoolCVMWithConvGradCUDAKernel : public framework::OpKernel { } FusedSeqpoolCVMGradWithConv(ctx.GetPlace(), out_grads_data, in_grads_data, cvm_data, lods_data, batch_size, slot_size, embedding_size, - use_cvm, cvm_offset, show_filter); + use_cvm, cvm_offset, show_filter, embedx_concate_size); } }; diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index c6bed95e83dc5..25ed83844a079 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -104,8 +104,11 @@ class IndexSelectGradMaker : public framework::SingleGradOpMaker { } }; +#ifdef PADDLE_ON_INFERENCE DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, "X"); +#endif + } // namespace operators } // namespace paddle @@ -119,6 +122,12 @@ REGISTER_OPERATOR(index_select, ops::IndexSelectGradMaker, ops::IndexSelectGradMaker, IndexSelectInferShapeFunctor); + +#ifdef PADDLE_ON_INFERENCE REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); +#else +REGISTER_OPERATOR(index_select_grad, + ops::IndexSelectGradOp); +#endif diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f18053e297e55..50a7a3414dc52 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -13,9 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
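The Resize() calls in FusedSeqpoolCVMWithConvCUDAKernel above scale the output's last dimension by embedx_concate_size, with the base width depending on use_cvm and show_filter. A small self-contained helper restating that arithmetic follows; the function name and signature are illustrative, not part of the Paddle API.

#include <cassert>

// Last-dimension calculation implied by the Resize() calls above.
int OutputLastDim(int embedding_size, int cvm_offset, bool use_cvm,
                  bool show_filter, int embedx_concate_size) {
  int base = 0;
  if (use_cvm) {
    base = show_filter ? embedding_size - 1   // show column is dropped
                       : embedding_size;      // show/click/conv columns kept
  } else {
    base = embedding_size - cvm_offset;       // CVM columns stripped entirely
  }
  return base * embedx_concate_size;          // each slot expands into N concatenated segments
}

int main() {
  assert(OutputLastDim(11, 3, true, true, 1) == 10);
  assert(OutputLastDim(11, 3, false, false, 2) == 16);
  return 0;
}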
*/ #pragma once -#include -#include - #include #include diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 73d1131655aca..defea15e022c8 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -326,6 +326,7 @@ class RankAttentionGradOpMaker : public framework::SingleGradOpMaker { op->SetAttrMap(this->Attrs()); } }; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(RankAttentionGradOpNoNeedBufferVarsInference, "X", "RankOffset", diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 76999815e8eaf..99a954673da16 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -264,9 +264,9 @@ __global__ void kernel_rank_back_propagate(const int para_row, // rank offset 2:1:46:2:44:3:45 CUDA_KERNEL_LOOP(idx, ins_num * ins_col * para_col * max_rank) { int ins_id = idx / para_col / ins_col / max_rank; - int para_col_id = (idx / ins_col / max_rank) % para_col; + int para_col_id = (idx / ins_col / ins_num) % para_col; int ins_col_id = (idx / para_col / max_rank) % ins_col; - int k = (idx / para_col / ins_col) % max_rank; + int k = idx % max_rank; int lower = rank_offset[ins_id * rank_cols] - 1; if (lower < 0) { diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index a9a44861c11e1..eeb284d167883 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -37,12 +37,9 @@ template class SaveCombineOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); auto filename = ctx.Attr("file_path"); auto overwrite = ctx.Attr("overwrite"); - auto save_as_fp16 = ctx.Attr("save_as_fp16"); auto save_to_memory = ctx.Attr("save_to_memory"); - auto output = ctx.Output("Y"); bool is_present = FileExists(filename); if (is_present && !overwrite) { @@ -52,8 +49,31 @@ class SaveCombineOpKernel : public framework::OpKernel { filename, overwrite)); } + if (save_to_memory) { + auto output = ctx.Output("Y"); + PADDLE_ENFORCE_NE(output, + nullptr, + platform::errors::InvalidArgument( + "Cannot find variable Y for save_combine_op")); + std::ostringstream ss; + SaveCombineVars(ctx, reinterpret_cast(&ss)); + *output = ss.str(); + } else { + MkDirRecursively(DirName(filename).c_str()); + std::ofstream fout(filename, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), + true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", filename)); + SaveCombineVars(ctx, reinterpret_cast(&fout)); + fout.close(); + } + } - std::ostringstream ss; + protected: + void SaveCombineVars(const framework::ExecutionContext &ctx, std::ostream *os) const { + auto place = ctx.GetPlace(); + auto save_as_fp16 = ctx.Attr("save_as_fp16"); auto inp_var_names = ctx.InputNames("X"); auto &inp_vars = ctx.MultiInputVar("X"); PADDLE_ENFORCE_GT(inp_var_names.size(), @@ -102,9 +122,9 @@ class SaveCombineOpKernel : public framework::OpKernel { out.set_lod(tensor.lod()); framework::TransDataType( in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(ss, out, dev_ctx); + framework::SerializeToStream(*os, out, dev_ctx); } else { - framework::SerializeToStream(ss, tensor, dev_ctx); + framework::SerializeToStream(*os, tensor, dev_ctx); } } else { auto &tensor = inp_vars[i]->Get(); @@ -114,25 +134,9 @@ class SaveCombineOpKernel : public 
framework::OpKernel { framework::ConvertWstrToStr(it->first, &t); data.emplace(t, it->second); } - framework::StringMapToStream(ss, data); + framework::StringMapToStream(*os, data); } } - if (save_to_memory) { - PADDLE_ENFORCE_NE(output, - nullptr, - platform::errors::InvalidArgument( - "Cannot find variable Y for save_combine_op")); - *output = ss.str(); - } else { - MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), - true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - fout << ss.str(); - fout.close(); - } } }; diff --git a/paddle/fluid/operators/scaled_fc_op.cu b/paddle/fluid/operators/scaled_fc_op.cu index 20bd9dbf07361..bf920093ff794 100644 --- a/paddle/fluid/operators/scaled_fc_op.cu +++ b/paddle/fluid/operators/scaled_fc_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/scaled_fc_op.h" diff --git a/paddle/fluid/operators/scaled_int8fc_op.cu b/paddle/fluid/operators/scaled_int8fc_op.cu index c03bbf61d67fb..347640fadd68f 100644 --- a/paddle/fluid/operators/scaled_int8fc_op.cu +++ b/paddle/fluid/operators/scaled_int8fc_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/scaled_int8fc_op.h" diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index 7185d2356aae5..4ff874c3e89f5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" namespace paddle { namespace platform { @@ -70,11 +71,6 @@ namespace platform { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) - class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 291dd6c7ce1c7..a49d9013fb6d0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -17,9 +17,10 @@ namespace paddle { namespace platform { -void CudaProfilerInit(std::string output_file, - std::string output_mode, - std::string config_file) { +void CudaProfilerInit(const std::string& output_file, + const std::string& output_mode, + const std::string& config_file) { +#if CUDA_VERSION < 11000 PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", platform::errors::InvalidArgument( "Unsupported cuda profiler output mode, expect `kvp` or " @@ -28,6 +29,7 @@ void CudaProfilerInit(std::string output_file, cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_GPU_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +#endif } void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } @@ -35,8 +37,16 @@ void CudaProfilerStart() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStart()); } void CudaProfilerStop() { PADDLE_ENFORCE_GPU_SUCCESS(cudaProfilerStop()); } #ifndef _WIN32 -void CudaNvtxRangePush(std::string name) { - dynload::nvtxRangePushA(name.c_str()); +void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { + nvtxEventAttributes_t eventAttrib; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = static_cast(color); + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.message.ascii = name.c_str(); + + dynload::nvtxRangePushEx(&eventAttrib); } void CudaNvtxRangePop() { dynload::nvtxRangePop(); } diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h index 6c7cf0fd8dd94..555a83a0210f2 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h @@ -23,16 +23,26 @@ limitations under the License. */ namespace paddle { namespace platform { -void CudaProfilerInit(std::string output_file, - std::string output_mode, - std::string config_file); +void CudaProfilerInit(const std::string& output_file, + const std::string& output_mode, + const std::string& config_file); void CudaProfilerStart(); void CudaProfilerStop(); #ifndef _WIN32 -void CudaNvtxRangePush(std::string name); +enum class NvtxRangeColor : uint32_t { + Black = 0x00000000, + Red = 0x00ff0000, + Green = 0x0000ff00, + Blue = 0x000000ff, + White = 0x00ffffff, + Yellow = 0x00ffff00, +}; + +void CudaNvtxRangePush(const std::string& name, + const NvtxRangeColor color = NvtxRangeColor::Green); void CudaNvtxRangePop(); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index c6f367228e114..edcd29b88d0c2 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -21,6 +21,25 @@ limitations under the License. 
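The CudaNvtxRangePush change above switches from nvtxRangePushA to nvtxRangePushEx so a color can be attached to each range. A minimal RAII wrapper over the same NVTX calls is sketched below; ScopedNvtxRange is our own illustrative name, not a Paddle type, and the example assumes the usual nvToolsExt header and -lnvToolsExt link flag.

#include <nvToolsExt.h>
#include <cstdint>
#include <string>

class ScopedNvtxRange {
 public:
  explicit ScopedNvtxRange(const std::string& name,
                           uint32_t argb = 0x0000ff00 /* green, as in the default above */) {
    nvtxEventAttributes_t attr = {};  // zero-init so unused fields are well defined
    attr.version = NVTX_VERSION;
    attr.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
    attr.colorType = NVTX_COLOR_ARGB;
    attr.color = argb;
    attr.messageType = NVTX_MESSAGE_TYPE_ASCII;
    attr.message.ascii = name.c_str();
    nvtxRangePushEx(&attr);
  }
  ~ScopedNvtxRange() { nvtxRangePop(); }
};

int main() {
  ScopedNvtxRange range("forward_pass", 0x00ff0000 /* red */);
  // ... profiled work goes here ...
  return 0;
}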
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/memory/stats.h" + +DECLARE_uint64(xpu_memory_limit_mb); +DECLARE_double(fraction_of_xpu_memory_to_use); +DECLARE_uint64(initial_xpu_memory_in_mb); +DECLARE_uint64(reallocate_xpu_memory_in_mb); + +constexpr static float fraction_reserve_xpu_memory = 0.05f; + +PADDLE_DEFINE_EXPORTED_bool(enable_xpu_memory_usage_log, + false, + "Whether to print the message of xpu memory usage " + "at exit, mainly used for UT and CI."); +PADDLE_DEFINE_EXPORTED_bool(enable_xpu_memory_usage_log_mb, + true, + "Whether to print the message of xpu memory usage " + "MB as a unit of measurement."); namespace paddle { namespace platform { @@ -199,5 +218,333 @@ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { return phi::backends::xpu::get_xpu_version(dev_id); } +std::once_flag XPUMLHandler::init_flag_; + +XPUMLHandler::XPUMLHandler() { + std::call_once(XPUMLHandler::init_flag_, &XPUMLHandler::init_ml); + xpumlDeviceGetCount(&device_nums_); + device_handlers_.resize(device_nums_); + mem_infos_.resize(device_nums_); + for (unsigned int i = 0; i < device_nums_; ++i) { + xpumlDeviceGetHandleByIndex(i, &device_handlers_[i]); + } +} + + +/**************************** Memory Management **************************/ +// == Memory monitor == +void XPUMLHandler::init_ml() { + xpumlInit(); +} + +bool XPUMLHandler::getMemoryUsageInfo(int dev_id, unsigned long long *total, + unsigned long long* used, unsigned long long *free) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return false; + } + *total = mem_infos_[dev_id].totalGlobalMemory; + *free = mem_infos_[dev_id].freeGlobalMemory; + *used = mem_infos_[dev_id].usedGlobalMemory; + return true; +} + +bool XPUMLHandler::getL3UsageInfo(int dev_id, unsigned long long *total, + unsigned long long *used, unsigned long long *free) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return false; + } + *total = mem_infos_[dev_id].totalL3Memory; + *free = mem_infos_[dev_id].freeL3Memory; + *used = mem_infos_[dev_id].usedL3Memory; + return true; +} + +std::tuple XPUMLHandler::getMemoryUsageTuple(int dev_id) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return {0, 0, 0}; + } + return {mem_infos_[dev_id].totalGlobalMemory, + mem_infos_[dev_id].usedGlobalMemory, + mem_infos_[dev_id].freeGlobalMemory}; + +} + +std::tuple XPUMLHandler::getL3UsageTuple(int dev_id) { + if(xpumlDeviceGetMemoryInfo(device_handlers_[dev_id], &mem_infos_[dev_id]) != xpumlReturn_enum::XPUML_SUCCESS) { + return {0, 0, 0}; + } + return {mem_infos_[dev_id].totalL3Memory, + mem_infos_[dev_id].usedL3Memory, + mem_infos_[dev_id].freeL3Memory}; +} + + +// == Memory malloc & free == + + + +class RecordedXpuMallocHelper { + private: + explicit RecordedXpuMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + + if (FLAGS_enable_xpu_memory_usage_log) { + // A fake UPDATE to trigger the construction of memory stat instances, + // make sure that they are destructed after RecordedXpuMallocHelper. 
+ DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedXpuMallocHelper); + + public: + ~RecordedXpuMallocHelper() { + if (FLAGS_enable_xpu_memory_usage_log) { + if (FLAGS_enable_xpu_memory_usage_log_mb) { + std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 + << std::endl; + } else { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << ", Allocated = " + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; + } + } + } + + static RecordedXpuMallocHelper *Instance(int dev_id) { + static std::vector> instances_; + + std::call_once(once_flag_, [] { + int dev_cnt = GetXPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + instances_.emplace_back( + new RecordedXpuMallocHelper(i, FLAGS_xpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, + 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, + instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", + dev_id, + instances_.size())); + return instances_[dev_id].get(); + } + + XPUError_t Malloc(void **ptr, size_t size, bool malloc_managed_memory = false) { + // CHECK(malloc_managed_memory == false) << "xpu not supported yet"; + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { + return XPUERR_NOMEM; + } + + XPUDeviceGuard guard(dev_id_); + + int result = xpu_malloc(ptr, size); + VLOG(10) << "[xpu_malloc] size=" << static_cast(size) / (1 << 20) + << " MB, result=" << result; + + if (result == 0) { + if (UNLIKELY(NeedRecord())) { + cur_size_.fetch_add(size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent(ptr, + XPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); + } + + return XPU_SUCCESS; + } else { + return XPUERR_NOMEM; + } + } + + void Free(void *ptr, size_t size) { + XPUDeviceGuard guard(dev_id_); + xpu_free(ptr); + if (UNLIKELY(NeedRecord())) { + cur_size_.fetch_sub(size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + XPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); + } + } + + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total) { + unsigned long long uint64_total = 0, used = 0, free = 0; + + CHECK(ml_handler.getMemoryUsageInfo(dev_id_, &uint64_total, &used, &free) == true) << "get mem usage info failed"; + *actual_avail = uint64_total - free; + *actual_total = uint64_total; + + if (UNLIKELY(NeedRecord())) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { return cur_size_.load(); } + + uint64_t LimitSize() const { return limit_size_; } + + + private: + const int dev_id_; + const uint64_t limit_size_; + std::atomic cur_size_{0}; + + mutable std::unique_ptr mtx_; + static std::once_flag once_flag_; + + XPUMLHandler 
ml_handler; +}; + +std::once_flag RecordedXpuMallocHelper::once_flag_; + +XPUError_t RecordedXpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory) { + return RecordedXpuMallocHelper::Instance(dev_id)->Malloc(ptr, size, malloc_managed_memory); +} + +void RecordedXpuFree(void *p, size_t size, int dev_id) { + return RecordedXpuMallocHelper::Instance(dev_id)->Free(p, size); +} + +bool RecordedXpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { + return RecordedXpuMallocHelper::Instance(dev_id)->GetMemInfo(avail, total, actual_avail, actual_total); +} + +size_t XpuAvailableMemToAlloc() { + XPUMLHandler handler; + unsigned long long total = 0; + unsigned long long used = 0; + unsigned long long free = 0; + bool re = handler.getMemoryUsageInfo(GetXPUCurrentDeviceId(), &total, &used, &free); + CHECK(re == true) << "query mem info failed"; + + size_t reserving = static_cast(fraction_reserve_xpu_memory * free); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = free - reserving; + size_t min_chunk_size = XpuMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + return available_to_alloc; +} + +static size_t XpuAllocSize(bool realloc) { + size_t available_to_alloc = XpuAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, + 0, + platform::errors::ResourceExhausted("Not enough available XPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_xpu_memory_in_mb + : FLAGS_initial_xpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul + ? flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_xpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, + alloc_bytes, + platform::errors::ResourceExhausted("Not enough available GPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + + +size_t XpuInitAllocSize() { + return XpuAllocSize(false); +} + +size_t XpuReallocSize() { + return XpuAllocSize(true); +} + +size_t XpuMaxAllocSize() { + return std::max(XpuInitAllocSize(), XpuReallocSize()); +} + +size_t XpuMinChunkSize() { + return 1 << 8; +} + + +size_t XpuMaxChunkSize() { + + size_t max_chunk_size = XpuMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; + +} + +// for test +class MallocCnter { +public: + static MallocCnter & getInstance() { + static MallocCnter instance; + return instance; + } + + void inc_malloc_cnt(int dev_id) { + CHECK(dev_id >= 0 && dev_id < 8); + malloc_cnts[dev_id]++; + } + + int get_malloc_cnt(int dev_id) { + CHECK(dev_id >= 0 && dev_id < 8); + return malloc_cnts[dev_id].load(); + } + +private: + MallocCnter() {} + std::atomic malloc_cnts[8]; +}; + +int get_malloc_cnt(int dev_id) { + return MallocCnter::getInstance().get_malloc_cnt(dev_id); +} + +int inc_malloc_cnt(int dev_id) { + MallocCnter::getInstance().inc_malloc_cnt(dev_id); + return 0; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 85445df0bd762..9e25e82677e10 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -12,10 +12,12 @@ limitations under the License. 
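As a hedged usage note for the recorded allocation path above: the limit is only enforced when FLAGS_xpu_memory_limit_mb is non-zero, in which case the helper also updates the Reserved memory stats. A minimal sketch of the intended call pattern (the wrapper functions and error handling are illustrative, not part of the patch):

#include <cstddef>
#include "paddle/fluid/platform/device/xpu/xpu_info.h"

// Sketch only, assuming PADDLE_WITH_XPU: allocate and release device memory
// through the recorded helpers so the per-device counters stay balanced.
void* AllocOnXpu(int dev_id, size_t bytes) {
  void* ptr = nullptr;
  // Fails with XPUERR_NOMEM either when xpu_malloc itself fails or when the
  // recorded usage would exceed FLAGS_xpu_memory_limit_mb (if that flag is set).
  if (paddle::platform::RecordedXpuMalloc(&ptr, bytes, dev_id) != XPU_SUCCESS) {
    return nullptr;
  }
  return ptr;
}

void FreeOnXpu(void* ptr, size_t bytes, int dev_id) {
  // Pass the original request size so the recorded usage and the Reserved
  // stat are decremented by the same amount.
  paddle::platform::RecordedXpuFree(ptr, bytes, dev_id);
}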
*/ #ifdef PADDLE_WITH_XPU #include +#include #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" #include "xpu/runtime.h" +#include "xpu/xpuml.h" namespace paddle { @@ -106,6 +108,48 @@ using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); +class XPUMLHandler { +public: + XPUMLHandler(); + // total, used, free + bool getMemoryUsageInfo(int dev_id, unsigned long long *total, unsigned long long* used, unsigned long long *free); + bool getL3UsageInfo(int dev_id, unsigned long long *total, unsigned long long *used, unsigned long long *free); + + // (total, used, free) + std::tuple getMemoryUsageTuple(int dev_id); + std::tuple getL3UsageTuple(int dev_id); + +private: + static void init_ml(); + + static std::once_flag init_flag_; + + std::vector device_handlers_; + std::vector mem_infos_; + unsigned int device_nums_; +}; + +XPUError_t RecordedXpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory = false); + +void RecordedXpuFree(void *p, size_t size, int dev_id); + +bool RecordedXpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id); + +size_t XpuMinChunkSize(); +size_t XpuMaxChunkSize(); + +size_t XpuInitAllocSize(); +size_t XpuReallocSize(); +size_t XpuMaxAllocSize(); + +// for calculate malloc times +int get_malloc_cnt(int dev_id); +int inc_malloc_cnt(int dev_id); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index 3a1d28072c591..5bf92876f4fd0 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -39,7 +39,34 @@ namespace dynload { extern DynLoad__##__name __name // APIs available after CUDA 10.1 -// #if CUDA_VERSION >= 10100 +#if CUDA_VERSION >= 11010 +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ + __macro(cublasLtMatmulAlgoCheck); \ + __macro(cublasLtGetCudartVersion); +#else #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ __macro(cublasLtDestroy); \ @@ -60,6 +87,7 @@ namespace dynload { __macro(cublasLtMatrixTransformDescCreate); \ __macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescSetAttribute); +#endif CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/fluid/platform/dynload/nvtx.h b/paddle/fluid/platform/dynload/nvtx.h index c3dc9e31df354..e5816e240e6d2 100644 --- 
a/paddle/fluid/platform/dynload/nvtx.h +++ b/paddle/fluid/platform/dynload/nvtx.h @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifndef _WIN32 -#include -#include - -#include // NOLINT - #include "paddle/phi/backends/dynload/nvtx.h" namespace paddle { @@ -28,11 +23,12 @@ namespace dynload { using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ extern DynLoad__##__name __name -#define NVTX_ROUTINE_EACH(__macro) \ - __macro(nvtxRangePushA); \ +#define PLATFORM_NVTX_ROUTINE_EACH(__macro) \ + __macro(nvtxRangePushA); \ + __macro(nvtxRangePushEx); \ __macro(nvtxRangePop); -NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); +PLATFORM_NVTX_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP); #undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVTX_WRAP } // namespace dynload diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index e78abbf80a8d0..d507837ec915f 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -630,6 +630,56 @@ PADDLE_DEFINE_EXPORTED_uint64( #endif +#if defined(PADDLE_WITH_XPU) + +PADDLE_DEFINE_EXPORTED_bool( + use_xpu_buddy_allocator, + true, + "If set to true, use the buddy allocator to manage xpu memory allocation"); + +constexpr static float fraction_of_xpu_memory_to_use = 0.1f; +PADDLE_DEFINE_EXPORTED_double( + fraction_of_xpu_memory_to_use, + fraction_of_xpu_memory_to_use, + "Allocate a trunk of xpu memory that is this fraction of the " + "total xpu memory size. Future memory usage will be allocated " + "from the trunk. If the trunk doesn't have enough xpu memory, " + "additional trunks of the same size will be requested from xpu " + "until the xpu has no memory left for another trunk."); + +PADDLE_DEFINE_EXPORTED_uint64( + initial_xpu_memory_in_mb, + 0ul, + "Allocate a trunk of xpu memory whose byte size is specified by " + "the flag. Future memory usage will be allocated from the " + "trunk. If the trunk doesn't have enough xpu memory, additional " + "trunks of xpu memory will be requested from xpu with size " + "specified by FLAGS_reallocate_xpu_memory_in_mb until the xpu has " + "no memory left for the additional trunk. Note: if you set this " + "flag, the memory size set by " + "FLAGS_fraction_of_xpu_memory_to_use will be overridden by this " + "flag. If you don't set this flag, PaddlePaddle will use " + "FLAGS_fraction_of_xpu_memory_to_use to allocate xpu memory"); + +PADDLE_DEFINE_EXPORTED_uint64( + reallocate_xpu_memory_in_mb, + 0ul, + "If this flag is set, Paddle will reallocate the xpu memory with " + "size specified by this flag. Else Paddle will reallocate by " + "FLAGS_fraction_of_xpu_memory_to_use"); + +PADDLE_DEFINE_EXPORTED_uint64( + xpu_memory_limit_mb, + 0UL, + "The maximum xpu memory limit that the process can allocate. " + "If it is equal to 0, there would be no limit and all xpu memory " + "would be available to the process. If it is larger than 0, " + "the process would raise out of memory error if the allocated " + "memory exceeds the limit even though there is available " + "memory on the xpu card.
The unit is MB and default value is 0."); + +#endif + /** * Scope related FLAG * Name: local_exe_sub_scope_limit @@ -1027,6 +1077,10 @@ PADDLE_DEFINE_EXPORTED_bool(gpugraph_load_node_list_into_hbm, true, "enable load_node_list_into_hbm, default true"); +PADDLE_DEFINE_EXPORTED_bool(enable_dump_main_program, + false, + "enable dump main program, default false"); + /** * ProcessGroupNCCL related FLAG * Name: nccl_blocking_wait diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index ae5e5696a80f7..f6a89a0090e79 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -132,7 +132,8 @@ set(PYBIND_SRCS op_function5.cc op_function6.cc op_function7.cc - op_function8.cc) + op_function8.cc + ) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) @@ -206,6 +207,10 @@ if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() +if(WITH_XPU) + list(APPEND PYBIND_SRCS xpu_info_py.cc) +endif() + if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. if(WITH_ASCEND_CL) diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 89a3904d0003f..5503c47197a26 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,7 +49,8 @@ void BindConstValue(pybind11::module* m) { .value("Loss", framework::OpRole::kLoss) .value("RPC", framework::OpRole::kRPC) .value("Dist", framework::OpRole::kDist) - .value("LRSched", framework::OpRole::kLRSched); + .value("LRSched", framework::OpRole::kLRSched) + .value("ScaleLr", framework::OpRole::kScaleLr); op_proto_and_checker_maker.def( "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2649165eb1d3d..cdb193b928490 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -147,8 +147,9 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/device/xpu/xpu_info.h" -#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +// #include "paddle/fluid/platform/device/xpu/xpu_info.h" +// #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#include "paddle/fluid/pybind/xpu_info_py.h" #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -531,6 +532,10 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif +#ifdef PADDLE_WITH_XPU + BindXPUInfo(&m); +#endif + BindImperative(&m); BindEager(&m); BindEagerStringTensor(&m); @@ -2420,7 +2425,10 @@ All parameter, weight, gradient are variables in Paddle. BindNeighborSampleResult(&m); BindGraphGpuWrapper(&m); #endif + #endif } + + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index ba646816126c4..7e226b48a1d08 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -719,6 +719,18 @@ void BindTensor(pybind11::module &m) { // NOLINT } return dst; }) + .def("copy_from", + [](framework::Tensor &self, const framework::Tensor &src) { + // follow fetch_op's inplementation + if (src.IsInitialized() && src.numel() > 0) { + TensorCopySync(src, src.place(), &self); + } else { + // Not copy, if the src tensor is empty. 
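+                // Mirroring fetch_op: an empty source leaves the destination
+                // as an empty tensor with shape {0} instead of copying; the
+                // LoD is still taken from the source below.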
+ self.clear(); + self.Resize({0}); + } + self.set_lod(src.lod()); + }) .def("_copy", [](const framework::Tensor &self, const platform::Place &place) { // follow fetch_op's inplementation diff --git a/paddle/fluid/pybind/xpu_info_py.cc b/paddle/fluid/pybind/xpu_info_py.cc new file mode 100644 index 0000000000000..5c19fb8e7baee --- /dev/null +++ b/paddle/fluid/pybind/xpu_info_py.cc @@ -0,0 +1,17 @@ +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/pybind/xpu_info_py.h" +#include "tuple" + +namespace paddle { +namespace pybind { + +void BindXPUInfo(py::module* m) { + py::class_(*m, "XPUMLHandler") + .def(py::init<>()) + .def("getMemoryUsageTuple", &platform::XPUMLHandler::getMemoryUsageTuple) + .def("getL3UsageTuple", &platform::XPUMLHandler::getL3UsageTuple); +} + +} // namespace pybind +} // namespace paddle +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/pybind/xpu_info_py.h b/paddle/fluid/pybind/xpu_info_py.h new file mode 100644 index 0000000000000..ffa716b7b9d0f --- /dev/null +++ b/paddle/fluid/pybind/xpu_info_py.h @@ -0,0 +1,18 @@ +#pragma once +#ifdef PADDLE_WITH_XPU +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindXPUInfo(py::module* m); + +} // namespace pybind +} // namespace paddle +#endif // PADDLE_WITH_XPU diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h index 1e2a20ebdf440..8a005cb93b7d4 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -54,6 +54,34 @@ extern void *cublasLt_dso_handle; // APIs available after CUDA 10.1 // #if CUDA_VERSION >= 10100 +#if CUDA_VERSION >= 11010 +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ + __macro(cublasLtMatmulAlgoCheck); \ + __macro(cublasLtGetCudartVersion); +#else #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasLtCreate); \ __macro(cublasLtDestroy); \ @@ -74,6 +102,7 @@ extern void *cublasLt_dso_handle; __macro(cublasLtMatrixTransformDescCreate); \ __macro(cublasLtMatrixTransformDescDestroy); \ __macro(cublasLtMatrixTransformDescSetAttribute); +#endif CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/phi/backends/dynload/cuda_driver.cc b/paddle/phi/backends/dynload/cuda_driver.cc index 2bd0a7bfea5c1..d9fd89a0c65a6 100644 --- a/paddle/phi/backends/dynload/cuda_driver.cc +++ b/paddle/phi/backends/dynload/cuda_driver.cc @@ 
-24,6 +24,7 @@ void* cuda_dso_handle = nullptr; #if CUDA_VERSION >= 10020 CUDA_ROUTINE_EACH_VVM(DEFINE_WRAP); +CUDA_ROUTINE_EACH_CUDA_GRAPH(DEFINE_WRAP); #endif CUDA_ROUTINE_EACH(DEFINE_WRAP); diff --git a/paddle/phi/backends/dynload/cuda_driver.h b/paddle/phi/backends/dynload/cuda_driver.h index f743a33a1866f..ba771afe09023 100644 --- a/paddle/phi/backends/dynload/cuda_driver.h +++ b/paddle/phi/backends/dynload/cuda_driver.h @@ -72,7 +72,13 @@ extern bool HasCUDADriver(); __macro(cuMemRelease); \ __macro(cuMemAddressFree) +#define CUDA_ROUTINE_EACH_CUDA_GRAPH(__macro) \ + __macro(cuGraphNodeGetType); \ + __macro(cuGraphKernelNodeGetParams); \ + __macro(cuGraphExecKernelNodeSetParams) + CUDA_ROUTINE_EACH_VVM(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); +CUDA_ROUTINE_EACH_CUDA_GRAPH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); #endif CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 8aa3b623273d7..9bd38a89ab177 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index 7b9004308e95b..3292beb037110 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -194,6 +194,19 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnGetStream); \ + __macro(cudnnReorderFilterAndBias); +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/nvtx.h b/paddle/phi/backends/dynload/nvtx.h index a9a166b289e33..e51bbf2154a17 100644 --- a/paddle/phi/backends/dynload/nvtx.h +++ b/paddle/phi/backends/dynload/nvtx.h @@ -42,6 +42,7 @@ extern void *nvtx_dso_handle; #define NVTX_ROUTINE_EACH(__macro) \ __macro(nvtxRangePushA); \ + __macro(nvtxRangePushEx); \ __macro(nvtxRangePop); NVTX_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVTX_WRAP); diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index c62addfd257ab..2d527dd526a0e 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -14,6 +14,13 @@ #pragma once +#include // NOLINT + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/enforce.h" + namespace phi { namespace backends { namespace gpu { @@ -24,7 +31,7 @@ namespace gpu { * [ Why need this macro? 
] * * The original looping in CUDA kernel is: - * + *p * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ * i += blockDim.x * gridDim.x)` * @@ -62,10 +69,37 @@ namespace gpu { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) + +template +cudaDataType_t ToCudaDataType() { + if (std::is_same::value) { + return CUDA_R_32F; + } else if (std::is_same::value) { + return CUDA_R_64F; + } else if (std::is_same::value) { + return CUDA_R_16F; +#if CUDA_VERSION >= 11000 + } else if (std::is_same::value) { + return CUDA_R_16BF; +#endif +#if CUDA_VERSION >= 11040 + } else if (std::is_same::value) { + return CUDA_R_8I; + } else if (std::is_same::value) { + return CUDA_R_32I; +#endif + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "DataType %d is unsupported for CUDA.", + paddle::experimental::CppTypeToDataType::Type())); + } +} } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 87d779f9194db..62082beac13a3 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -57,6 +57,17 @@ limitations under the License. */ // TODO(phi): remove fluid header. #include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_ON_INFERENCE +PADDLE_DEFINE_EXPORTED_bool(enable_cublas_tf32_op_math, + false, + "enable tf32 for cublas."); +#else +PADDLE_DEFINE_EXPORTED_bool(enable_cublas_tf32_op_math, + true, + "enable tf32 for cublas."); +#endif +DECLARE_bool(enable_cublas_tensor_op_math); + namespace phi { namespace internal { @@ -216,6 +227,8 @@ struct GPUContext::Impl { stream_ = new CUDAStream(place_); InitEigenDevice(); InitDnnWorkspace(); + GetDnnHandle(); + GetBlasHandle(); } void PartialInitWithoutAllocator() { @@ -231,6 +244,8 @@ struct GPUContext::Impl { &max_threads_per_block_, &max_grid_dim_size_); stream_ = new CUDAStream(place_); + GetDnnHandle(); + GetBlasHandle(); } void PartialInitWithAllocator() { @@ -238,6 +253,8 @@ struct GPUContext::Impl { stream_owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); InitDnnWorkspace(); + GetDnnHandle(); + GetBlasHandle(); } explicit Impl(const GPUPlace& place) : place_(place) {} @@ -369,7 +386,7 @@ struct GPUContext::Impl { } #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { + if (FLAGS_enable_cublas_tensor_op_math && !blas_tensor_core_handle_) { if (!blas_tensor_core_handle_creator_) { phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); } else { @@ -380,7 +397,7 @@ struct GPUContext::Impl { } #endif #if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { + if (FLAGS_enable_cublas_tf32_op_math && !blas_tf32_tensor_core_handle_) { if (!blas_tf32_tensor_core_handle_creator_) { phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); } else { @@ -561,40 +578,6 @@ struct GPUContext::Impl { } inline void CublasCall(const std::function& callback) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) { - if (!blas_handle_creator_) { - phi::InitBlasHandle(&blas_handle_, stream()); - } else { - 
blas_handle_ = blas_handle_creator_(); - } - } -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { - if (!blas_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } else { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif -#if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { - if (!blas_tf32_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); - } else { - blas_tf32_tensor_core_handle_ = - blas_tf32_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); - } -#endif -#endif - }); if (blas_tf32_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tf32_mtx_); callback(blas_tf32_tensor_core_handle_); @@ -606,40 +589,6 @@ struct GPUContext::Impl { inline void TensorCoreCublasCallIfAvailable( const std::function& callback) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) { - if (!blas_handle_creator_) { - phi::InitBlasHandle(&blas_handle_, stream()); - } else { - blas_handle_ = blas_handle_creator_(); - } - } -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - if (!blas_tensor_core_handle_) { - if (!blas_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tensor_core_handle_, stream()); - } else { - blas_tensor_core_handle_ = blas_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif -#if CUDA_VERSION >= 11000 - if (!blas_tf32_tensor_core_handle_) { - if (!blas_tf32_tensor_core_handle_creator_) { - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream()); - } else { - blas_tf32_tensor_core_handle_ = - blas_tf32_tensor_core_handle_creator_(); - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); - } -#endif -#endif - }); if (blas_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tensor_core_mtx_); callback(blas_tensor_core_handle_); @@ -716,6 +665,14 @@ struct GPUContext::Impl { } } } + // get workspace ptr + void* GetWorkSpacePtr(const size_t& len) { + if (workspace_ptr_ == nullptr || len > workspace_ptr_->size()) { + workspace_ptr_.reset(); + workspace_ptr_ = allocator_->Allocate(len); + } + return workspace_ptr_->ptr(); + } // use one flag for all handles? // they should be accessed consistently @@ -780,6 +737,8 @@ struct GPUContext::Impl { Allocator* allocator_{nullptr}; // external resource. // A internal resouce to initinalize eigen_device. 
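A short usage sketch for the new GetWorkSpacePtr above: the context keeps a single grow-only allocation, so the returned pointer should be re-fetched for every launch rather than cached, and callers on the same context share (and may reallocate) the same buffer. The kernel call below is a placeholder, not part of the patch:

#include "paddle/phi/backends/gpu/gpu_context.h"

// Sketch only: borrow context-level scratch space for one launch.
void RunWithScratch(const phi::GPUContext& dev_ctx, size_t num_elems) {
  size_t workspace_bytes = num_elems * sizeof(float);
  // Reuses the cached allocation when it is already large enough, otherwise
  // drops it and allocates a bigger one from the context allocator.
  void* workspace = dev_ctx.GetWorkSpacePtr(workspace_bytes);
  // MyScratchKernel<<<grid, block, 0, dev_ctx.stream()>>>(workspace, ...);
  (void)workspace;  // placeholder: a real kernel launch would consume this
}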
std::unique_ptr eigen_stream_{nullptr}; + // work space + phi::Allocator::AllocationPtr workspace_ptr_{nullptr}; }; GPUContext::GPUContext(GPUContext&&) = default; @@ -1000,4 +959,9 @@ void GPUContext::SetDriverVersion(int val) { impl_->driver_version_ = val; } void GPUContext::SetRuntimeVersion(int val) { impl_->runtime_version_ = val; } +// Get Work Space +void* GPUContext::GetWorkSpacePtr(const size_t& len) const { + return impl_->GetWorkSpacePtr(len); +} + } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 989bbbcbbf5f8..c76d8549c284c 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -199,6 +199,9 @@ class PADDLE_API GPUContext : public DeviceContext { // clear: whether clear the original CUDAStream or not void SetCUDAStream(CUDAStream*, bool clear = true); + // Get Work Space + void* GetWorkSpacePtr(const size_t& len) const; + protected: // NOTE: External users manage resources. Used in inference scenarios. // The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 552f60783c8b2..fd712baf75480 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -34,18 +34,16 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" -#ifdef __HIPCC__ -// HIP results in error or nan if > 256 -#define PREDEFINED_BLOCK_SIZE 256 -#else // CUDA performs better when thread_per_block is between [64, 512] #define PREDEFINED_BLOCK_SIZE 512 -#endif namespace phi { namespace backends { namespace gpu { +// Limitation of the setting in one dimension of cuda grid. +constexpr int kMultiDimslimit = 65536; + template inline T DivUp(T a, T b) { return (a + b - 1) / b; @@ -53,20 +51,21 @@ inline T DivUp(T a, T b) { // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 // for round integer value into next highest power of 2. -inline int64_t RoundToPowerOfTwo(int64_t n) { +inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val = 1) { n--; n |= (n >> 1); n |= (n >> 2); n |= (n >> 4); n |= (n >> 8); n |= (n >> 16); - int64_t min_val = 32; -#ifdef __HIPCC__ - int64_t max_val = 256; -#else + return std::max(min_val, (n + 1)); +} + +inline int64_t RoundToPowerOfTwo(int64_t n) { + constexpr int64_t min_val = 32; + int64_t num = RoundToNextHighPowOfTwo(n, min_val); int64_t max_val = 1024; -#endif - return std::min(max_val, std::max(min_val, (n + 1))); + return std::min(max_val, num); } #ifdef WITH_NV_JETSON @@ -162,8 +161,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, } inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, - int x_dim, - int y_dim) { + int64_t x_dim, + int64_t y_dim) { PADDLE_ENFORCE_GT( x_dim, 0, @@ -178,7 +177,7 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, y_dim)); const int kThreadsPerBlock = 256; - int block_cols = std::min(x_dim, kThreadsPerBlock); + int block_cols = std::min(x_dim, kThreadsPerBlock); int block_rows = std::max(kThreadsPerBlock / block_cols, 1); int max_physical_threads = context.GetMaxPhysicalThreadCount(); @@ -188,8 +187,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, // Noticed, block size is not align to 32, if needed do it yourself. 
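To make the new rounding helpers in gpu_launch_config.h below concrete, a small worked check; the values follow from the bit trick plus the 32/1024 clamps in RoundToPowerOfTwo (sketch only, not part of the patch):

#include <cassert>
#include "paddle/phi/backends/gpu/gpu_launch_config.h"

void CheckRounding() {
  using phi::backends::gpu::RoundToNextHighPowOfTwo;
  using phi::backends::gpu::RoundToPowerOfTwo;
  assert(RoundToNextHighPowOfTwo(1) == 1);     // already a power of two
  assert(RoundToNextHighPowOfTwo(33) == 64);   // next power of two above 33
  assert(RoundToNextHighPowOfTwo(3, 8) == 8);  // min_val clamps small inputs
  assert(RoundToPowerOfTwo(5) == 32);          // lower bound of 32 threads
  assert(RoundToPowerOfTwo(5000) == 1024);     // capped at 1024 threads
}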
config.thread_per_block = dim3(block_cols, block_rows, 1); - int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks); - int grid_y = std::min(max_blocks / grid_x, std::max(y_dim / block_rows, 1)); + int grid_x = std::min(DivUp(x_dim, block_cols), max_blocks); + int grid_y = std::min(max_blocks / grid_x, + std::max(y_dim / block_rows, 1)); config.block_per_grid = dim3(grid_x, grid_y, 1); return config; @@ -229,6 +229,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig3D(const phi::GPUContext& context, return config; } +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; + grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2]; +} } // namespace gpu } // namespace backends } // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 0257139914384..2a8dbb85e8035 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -58,16 +58,20 @@ void InitGpuProperties(Place place, *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); // TODO(wilber): glog may be replaced in the future? - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " - << static_cast(place.device) - << ", GPU Compute Capability: " - << *compute_capability / 10 << "." - << *compute_capability % 10 - << ", Driver API Version: " << *driver_version / 1000 - << "." << (*driver_version % 100) / 10 - << ", Runtime API Version: " - << *runtime_version / 1000 << "." - << (*runtime_version % 100) / 10; + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place.device) + << ", GPU Compute Capability: " << *compute_capability / 10 << "." + << *compute_capability % 10 + << ", Driver API Version: " << *driver_version / 1000 << "." + << (*driver_version % 100) / 10 + << ", Runtime API Version: " << *runtime_version / 1000 << "." + << (*runtime_version % 100) / 10 << ", Build Date " +#ifdef PADDLE_BRANCH_NAME + << __DATE__ << " Time " << __TIME__ + << ", Git Version: " PADDLE_BRANCH_NAME ":" PADDLE_COMMIT_HASH; +#else + << __DATE__ << " Time " << __TIME__; +#endif #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h new file mode 100644 index 0000000000000..045fdf9daa568 --- /dev/null +++ b/paddle/phi/common/memory_utils.h @@ -0,0 +1,107 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
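For the LimitGridDim helper added to gpu_launch_config.h above, a hedged sketch of a typical launch site; the kernel is a placeholder, and because the grid is clamped the kernel is expected to cover the remaining work with a grid-stride loop:

#include "paddle/phi/backends/gpu/gpu_launch_config.h"

// Sketch only: clamp a 2-D grid to the device limits before launching.
void LaunchClamped(const phi::GPUContext& ctx, int rows, int cols) {
  dim3 block(256, 1, 1);
  dim3 grid((cols + block.x - 1) / block.x, rows, 1);
  // Without the clamp a very large `rows` can exceed the hardware limit on
  // grid.y (cf. kMultiDimslimit above) and the launch would fail.
  phi::backends::gpu::LimitGridDim(ctx, &grid);
  // MyElementwiseKernel<<<grid, block, 0, ctx.stream()>>>(rows, cols, ...);
}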
+ +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/stream.h" + +namespace phi { + +/* + NOTE(YuanRisheng) Why should we add the following code? + We need this because MemoryUtils::instance() is a singleton object and we + don't recommend using singleton object in kernels. So, we wrap it using a + function and if we delete this singleton object in future, it will be easy to + change code. +*/ + +namespace memory_utils { +class Buffer { + public: + explicit Buffer(const phi::Place& place) : place_(place) {} + + template + T* Alloc(size_t size) { + using AllocT = typename std:: + conditional::value, uint8_t, T>::type; + if (UNLIKELY(size == 0)) return nullptr; + size *= sizeof(AllocT); + if (allocation_ == nullptr || allocation_->size() < size) { + allocation_ = paddle::memory::Alloc(place_, size); + } + return reinterpret_cast(allocation_->ptr()); + } + + template + const T* Get() const { + return reinterpret_cast( + allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr); + } + + template + T* GetMutable() { + return reinterpret_cast( + allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr); + } + + size_t Size() const { return allocation_ ? allocation_->size() : 0; } + + phi::Place GetPlace() const { return place_; } + + private: + Allocator::AllocationPtr allocation_; + phi::Place place_; +}; + +template +struct ThrustAllocator { + typedef char value_type; + ThrustAllocator(phi::Place place, StreamType stream) { + place_ = place; + stream_ = stream; + } + ~ThrustAllocator() {} + char* allocate(std::ptrdiff_t num_bytes) { + auto storage = + paddle::memory::AllocShared(place_, + num_bytes, + phi::Stream(reinterpret_cast(stream_))); + char* ptr = reinterpret_cast(storage->ptr()); + busy_allocation_.emplace(std::make_pair(ptr, storage)); + return ptr; + } + void deallocate(char* ptr, size_t) { + allocation_map_type::iterator iter = busy_allocation_.find(ptr); + // CHECK(iter != busy_allocation_.end()); + busy_allocation_.erase(iter); + } + + private: + typedef std::unordered_map> + allocation_map_type; + allocation_map_type busy_allocation_; + phi::Place place_; + StreamType stream_; +}; + +} // namespace memory_utils + +} // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index 838f2dd265eb3..ad7a2b134a20c 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -21,21 +21,6 @@ namespace phi { namespace autotune { -// Define the cache key of operator -size_t ConvKey(const std::vector& x_dims, - const std::vector& w_dims, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - phi::DataType dtype) { - return GetKey(x_dims, - w_dims, - strides, - paddings, - dilations, - static_cast(dtype)); -} - size_t TransposeKey(const std::vector& x_dims, const std::vector& perm, phi::DataType dtype) { @@ -73,6 +58,19 @@ void AutoTuneCache::UpdateStatus() { cache_hits += v.second.CacheHits(); cache_misses += v.second.CacheMisses(); } + + for (auto& v : cudnn_auto_tune_map_) { + VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width) + << AlgorithmTypeString(v.first) + << " Cache Size: " << v.second.Size() + << " Hits: " << v.second.CacheHits() + << " Misses: " << v.second.CacheMisses() + << " Hit Rate: " << 
v.second.CacheHitRate(); + size += v.second.Size(); + cache_hits += v.second.CacheHits(); + cache_misses += v.second.CacheMisses(); + } + total_size_ = size; total_cache_hits_ = cache_hits; total_cache_misses_ = cache_misses; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 1263cf40e567e..54c9508571c69 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -32,6 +32,7 @@ template inline void HashCombine(std::size_t* seed, const T& v, Rest... rest) { std::hash hasher; *seed ^= hasher(v) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); + *seed *= 0x00000100000001B3; HashCombine(seed, rest...); } @@ -41,7 +42,7 @@ namespace std { template struct hash> { std::size_t operator()(std::vector const& vec) const noexcept { - std::size_t seed = 0; + std::size_t seed = 0xcbf29ce484222325; for (auto val : vec) { HashCombine(&seed, val); } @@ -53,6 +54,16 @@ struct hash> { namespace phi { namespace autotune { +struct ConvAutoTuneResult { + ConvAutoTuneResult() {} + ConvAutoTuneResult(int64_t a, size_t size, bool search) + : algo(a), workspace_size(size), exhaustive_search(search) {} + + int64_t algo; + size_t workspace_size = 0; + bool exhaustive_search = false; +}; + template size_t GetKey(Args&&... args) { size_t seed = 0; @@ -60,24 +71,147 @@ size_t GetKey(Args&&... args) { return seed; } -// Define the cache key of operator -size_t ConvKey(const std::vector& x_dims, - const std::vector& w_dims, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - phi::DataType dtype); +struct ConvCacheKey { + ConvCacheKey() {} + ConvCacheKey(const std::vector& arg_x_dims, + const std::vector& arg_w_dims, + const std::vector& arg_strides, + const std::vector& arg_paddings, + const std::vector& arg_dilations, + phi::DataType arg_dtype, + int arg_groups, + int64_t arg_data_layout) + : x_dims(arg_x_dims), + w_dims(arg_w_dims), + strides(arg_strides), + paddings(arg_paddings), + dilations(arg_dilations), + dtype(arg_dtype), + groups(arg_groups), + data_layout(arg_data_layout) {} + size_t hash_value() const { + return GetKey(x_dims, + w_dims, + strides, + paddings, + dilations, + static_cast(dtype), + groups, + data_layout); + } + + std::vector x_dims; + std::vector w_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + phi::DataType dtype; + int groups; + int64_t data_layout; +}; + +struct ConvCacheKeyHash { + size_t operator()(const ConvCacheKey& cache) const { + return cache.hash_value(); + } +}; + +struct ConvCacheKeyEqual { + size_t operator()(const ConvCacheKey& first, + const ConvCacheKey& second) const { + if (first.x_dims != second.x_dims) return false; + if (first.w_dims != second.w_dims) return false; + if (first.strides != second.strides) return false; + if (first.paddings != second.paddings) return false; + if (first.dilations != second.dilations) return false; + if (first.dtype != second.dtype) return false; + if (first.groups != second.groups) return false; + if (first.data_layout != second.data_layout) return false; + + return true; + } +}; + +class CudnnAlgorithmsCacheMap { + public: + CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); } + + ConvAutoTuneResult Get(const ConvCacheKey& key) { + std::lock_guard lock(*cache_mutex_); + PADDLE_ENFORCE_NE( + hash_.find(key), + hash_.end(), + phi::errors::PreconditionNotMet("The key does not exist.")); + return hash_[key]; + } + + bool Find(const ConvCacheKey& key) { + bool ret = false; + 
std::lock_guard lock(*cache_mutex_); + if (hash_.find(key) != hash_.end()) { + cache_hits_++; + ret = true; + } else { + cache_misses_++; + } + return ret; + } + + void Clean() { + std::lock_guard lock(*cache_mutex_); + hash_.clear(); + cache_hits_ = 0; + cache_misses_ = 0; + } + + void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) { + std::lock_guard lock(*cache_mutex_); + if (hash_.size() > static_cast(1000000)) { + hash_.clear(); + } + hash_[key] = algo; + } + + int64_t CacheMisses() const { return cache_misses_; } + + int64_t CacheHits() const { return cache_hits_; } + + float CacheHitRate() const { + int64_t num_accesses = cache_hits_ + cache_misses_; + float cache_hit_rate = 0.; + if (num_accesses != 0) { + cache_hit_rate = + static_cast(cache_hits_) / static_cast(num_accesses); + } + return cache_hit_rate; + } + + int64_t Size() const { return hash_.size(); } + + private: + std::unordered_map + hash_; + std::shared_ptr cache_mutex_; + + int64_t cache_hits_{0}; + int64_t cache_misses_{0}; +}; size_t TransposeKey(const std::vector& x_dims, const std::vector& perm, phi::DataType dtype); - -template +template , + typename KeyEqualT = std::equal_to> class AlgorithmsCache { public: - AlgorithmsCache() : cache_mutex_(new std::mutex()) { hash_.clear(); } + AlgorithmsCache() : cache_mutex_(new std::mutex()) {} - AlgorithmT Get(size_t key) { + AlgorithmT Get(const KeyT& key) { std::lock_guard lock(*cache_mutex_); PADDLE_ENFORCE_NE( hash_.find(key), @@ -86,7 +220,7 @@ class AlgorithmsCache { return hash_[key]; } - bool Find(size_t key) { + bool Find(const KeyT& key) { bool ret = false; std::lock_guard lock(*cache_mutex_); if (hash_.find(key) != hash_.end()) { @@ -105,7 +239,7 @@ class AlgorithmsCache { cache_misses_ = 0; } - void Set(size_t key, AlgorithmT algo) { + void Set(const KeyT& key, AlgorithmT algo) { std::lock_guard lock(*cache_mutex_); hash_[key] = algo; } @@ -126,14 +260,43 @@ class AlgorithmsCache { int64_t Size() const { return hash_.size(); } - private: - std::unordered_map hash_; + protected: + std::unordered_map hash_; std::shared_ptr cache_mutex_; int64_t cache_hits_{0}; int64_t cache_misses_{0}; }; +template +class MatmulAlgorithmsCache : public AlgorithmsCache { + public: + MatmulAlgorithmsCache() : AlgorithmsCache() {} + + bool FindSubKey(const KeyT& sub_key) { + std::lock_guard lock(*(this->cache_mutex_)); + bool ret = (sub_hash_.find(sub_key) != sub_hash_.end()) ? true : false; + return ret; + } + + void SetSubKey(const KeyT& sub_key, void* algo) { + std::lock_guard lock(*(this->cache_mutex_)); + sub_hash_[sub_key] = algo; + } + + void* GetSubKey(const KeyT& sub_key) { + std::lock_guard lock(*(this->cache_mutex_)); + PADDLE_ENFORCE_NE( + sub_hash_.find(sub_key), + sub_hash_.end(), + phi::errors::PreconditionNotMet("The key does not exist.")); + return sub_hash_[sub_key]; + } + + private: + std::unordered_map sub_hash_; +}; + enum class AlgorithmType { kConvForward = 1, kConvBackwardData = 2, @@ -143,9 +306,13 @@ enum class AlgorithmType { }; // AlgorithmsConfigKey -> AlgorithmsID -using AlgorithmsCacheMap = AlgorithmsCache; +// (todo. 
hong) use cudnnConvolutionFwdAlgo_t +using AlgorithmsCacheMap = AlgorithmsCache; // AlgorithmType -> AlgorithmsCache using AlgorithmsTypeMap = std::unordered_map; +using CudnnAlgorithmsTypeMap = + std::unordered_map; +using MatmulAlgorithmsCacheMap = MatmulAlgorithmsCache; class AutoTuneCache { public: @@ -158,24 +325,22 @@ class AutoTuneCache { return auto_tune_map_[static_cast(algo_type)]; } - AlgorithmsCacheMap& GetConvForward() { - return Get(AlgorithmType::kConvForward); - } - - AlgorithmsCacheMap& GetConvBackwardData() { - return Get(AlgorithmType::kConvBackwardData); - } - - AlgorithmsCacheMap& GetConvBackwardFilter() { - return Get(AlgorithmType::kConvBackwardFilter); + CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) { + return cudnn_auto_tune_map_[static_cast(algo_type)]; } AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } + MatmulAlgorithmsCacheMap& GetMatmul() { return matmul_auto_tune_map_; } + void Clean() { for (auto& v : auto_tune_map_) { v.second.Clean(); } + + for (auto& v : cudnn_auto_tune_map_) { + v.second.Clean(); + } } void UpdateStatus(); @@ -206,14 +371,26 @@ class AutoTuneCache { void Register(const AlgorithmType& algo_type) { std::lock_guard lock(*autotune_cache_mutex_); - int64_t key = static_cast(algo_type); - if (auto_tune_map_.find(key) == auto_tune_map_.end()) { - AlgorithmsCacheMap cache; - auto_tune_map_[key] = cache; + if (algo_type == AlgorithmType::kConvForward || + algo_type == AlgorithmType::kConvBackwardData || + algo_type == AlgorithmType::kConvBackwardFilter) { + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + CudnnAlgorithmsCacheMap cache; + cudnn_auto_tune_map_[key] = cache; + } + } else { + int64_t key = static_cast(algo_type); + if (auto_tune_map_.find(key) == auto_tune_map_.end()) { + AlgorithmsCacheMap cache; + auto_tune_map_[key] = cache; + } } } AlgorithmsTypeMap auto_tune_map_; + CudnnAlgorithmsTypeMap cudnn_auto_tune_map_; + MatmulAlgorithmsCacheMap matmul_auto_tune_map_; std::shared_ptr autotune_cache_mutex_; int64_t total_cache_hits_{0}; int64_t total_cache_misses_{0}; diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 53574c3d0c9ac..18454ad3e1997 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.GetConvForward(); + auto& cache = + autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; @@ -34,20 +35,24 @@ TEST(AlgosCache, AlgosCache) { std::vector dilations = {1, 1}; phi::DataType dtype = paddle::experimental::CppTypeToDataType::Type(); - auto key = phi::autotune::ConvKey( - x_shape, w_shape, paddings, strides, dilations, dtype); + phi::autotune::ConvCacheKey key( + x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0); EXPECT_EQ(cache.Find(key), false); - cache.Set(key, ConvAlgos::GEMMKernel); + phi::autotune::ConvAutoTuneResult node( + static_cast(ConvAlgos::GEMMKernel), 0, false); + cache.Set(key, node); EXPECT_EQ(cache.Size(), 1); EXPECT_EQ(cache.Find(key), true); auto algo = cache.Get(key); - EXPECT_EQ(algo, ConvAlgos::GEMMKernel); + EXPECT_EQ(algo.algo, ConvAlgos::GEMMKernel); x_shape = {4, 128, 128, 3}; - key = 
phi::autotune::ConvKey( - x_shape, w_shape, paddings, strides, dilations, dtype); - EXPECT_EQ(cache.Find(key), false); - cache.Set(key, ConvAlgos::CuDNNKernel_1); + phi::autotune::ConvCacheKey key1( + x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1); + EXPECT_EQ(cache.Find(key1), false); + phi::autotune::ConvAutoTuneResult node1( + static_cast(ConvAlgos::CuDNNKernel_1), 0, false); + cache.Set(key1, node1); EXPECT_EQ(cache.Size(), 2); EXPECT_EQ(cache.CacheHits(), 1); EXPECT_EQ(cache.CacheMisses(), 2); diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 509d824ca0553..459a701b5115b 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -14,6 +14,9 @@ #pragma once +#if defined(__NVCC__) +#include +#endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -55,6 +58,17 @@ struct CUBlas { PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasSgemv(args...)); } + template + static void GEMM_BATCH(ARGS... args) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasSgemmBatched(args...)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "SgemmBatched is not supported on cuda <= 7.5")); +#endif + } + template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 @@ -181,6 +195,17 @@ struct CUBlas { PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasDgemv(args...)); } + template + static void GEMM_BATCH(ARGS... args) { +#if CUDA_VERSION >= 8000 + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasDgemmBatched(args...)); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "DgemmBatched is not supported on cuda <= 7.5")); +#endif + } + template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 @@ -235,40 +260,69 @@ struct CUBlas { }; template <> struct CUBlas { - //int8_t call func: - //CUBlas::GEMM_EX( - // &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, - // CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); + // int8_t call func: + // CUBlas::GEMM_EX( + // &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, + // CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode template static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, cublasOperation_t transb, int m, - int n, int k, const void *alpha, const void *A, - cudaDataType_t Atype, int lda, const void *B, - cudaDataType_t Btype, int ldb, const void *beta, void *C, - cudaDataType_t Ctype, int ldc, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, + const void *A, + cudaDataType_t Atype, + int lda, + const void *B, + cudaDataType_t Btype, + int ldb, + const void *beta, + void *C, + cudaDataType_t Ctype, + int ldc, cudaDataType_t computeType) { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { - //algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - //VLOG(5) << "2. CUBlas int8_t, algo is CUBLAS_GEMM_DFALT_TENSOR_OP."; - algo = CUBLAS_GEMM_DFALT; // only for int8 gemm + // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + // VLOG(5) << "2. 
CUBlas int8_t, algo is CUBLAS_GEMM_DFALT_TENSOR_OP."; + algo = CUBLAS_GEMM_DFALT; // only for int8 gemm } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - VLOG(5) << "3. use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + VLOG(5) << "3. use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); algo = CUBLAS_GEMM_DFALT; #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cublasGemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc, computeType, algo)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); }); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -311,6 +365,69 @@ struct CUBlas { ldc)); } +#if defined(__NVCC__) + static void GEMM_BATCH(phi::GPUContext *dev_ctx, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, + const float16 **A, + cudaDataType_t Atype, + int lda, + const float16 **B, + cudaDataType_t Btype, + int ldb, + const float *beta, + float16 **C, + cudaDataType_t Ctype, + int ldc, + int batchCount, + cudaDataType_t computeType) { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = dev_ctx->tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); +#else + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmBatchedEx is not supported on cuda <= 7.5")); +#endif + } +#endif + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, @@ -961,20 +1078,20 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } -//int8_t matmul +// int8_t matmul template <> template <> -inline void Blas::GEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const int8_t *A, - const int8_t *B, - float beta, - float *C, int flag) const { +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const int8_t *A, + const int8_t *B, + float beta, + float *C, + int flag) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -985,7 +1102,8 @@ inline void Blas::GEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE_GE( - context_.GetComputeCapability(), 53, + context_.GetComputeCapability(), + 53, phi::errors::InvalidArgument( "cublas int8_t gemm requires GPU compute capability >= 53," "but received %d", @@ -1001,17 +1119,32 @@ inline void Blas::GEMM( // using tensor cores in volta GPUs. auto &cuda_ctx = const_cast(context_); VLOG(3) << "1. call int8_t GEMM_EX."; - CUBlas::GEMM_EX( - &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_8I, ldb, A, - CUDA_R_8I, lda, &h_beta, C, CUDA_R_32F, N, CUDA_R_32F); + CUBlas::GEMM_EX(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_8I, + ldb, + A, + CUDA_R_8I, + lda, + &h_beta, + C, + CUDA_R_32F, + N, + CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - //context_.CublasCall([&](cublasHandle_t handle) { - // CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - // &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, - // N); - //}); + // context_.CublasCall([&](cublasHandle_t handle) { + // CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, + // &h_alpha, h_B, ldb, h_A, lda, &h_beta, + // h_C, N); + // }); #endif // CUDA_VERSION >= 8000 } @@ -1428,6 +1561,75 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -1708,6 +1910,97 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, } } +#if defined(__NVCC__) +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double **A, + const double **B, + double beta, + double **C, + int batchCount) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float **A, + const float **B, + float beta, + float **C, + int batchCount) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + + context_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1721,10 +2014,45 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::dtype::float16 beta, phi::dtype::float16 **C, int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 53, + phi::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); + float f_alpha = static_cast(alpha); + float f_beta = static_cast(beta); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_BATCH(&cuda_ctx, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B, + CUDA_R_16F, + ldb, + A, + CUDA_R_16F, + lda, + &f_beta, + C, + CUDA_R_16F, + ldc, + batchCount, + CUDA_R_32F); } template <> @@ -1740,11 +2068,69 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::dtype::bfloat16 beta, phi::dtype::bfloat16 **C, int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? 
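// ---------------------------------------------------------------------------
// Why every wrapper above passes B before A and the dimensions as (N, M, K):
// cuBLAS is column-major while phi tensors are row-major, and a row-major
// M x N matrix is bit-identical to a column-major N x M matrix. So row-major
// C = A * B is computed as column-major C^T = B^T * A^T with ldc = N.
// Minimal float sketch under that assumption (valid handle, row-major device
// buffers d_A of M x K, d_B of K x N, d_C of M x N; names are illustrative):
#include <cublas_v2.h>

inline void RowMajorSgemmSketch(cublasHandle_t handle, int M, int N, int K,
                                const float* d_A, const float* d_B,
                                float* d_C) {
  const float alpha = 1.f;
  const float beta = 0.f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              /*m=*/N, /*n=*/M, /*k=*/K,
              &alpha, d_B, /*lda=*/N, d_A, /*ldb=*/K,
              &beta, d_C, /*ldc=*/N);
}
// ---------------------------------------------------------------------------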
CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float f_alpha = static_cast(alpha); + float f_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + thrust::device_vector A_ptr(A, A + batchCount); + thrust::device_vector B_ptr(B, B + batchCount); + thrust::device_vector C_ptr(C, C + batchCount); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmBatchedEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 } +#endif template <> template diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index e322fba39a481..60d0b4ff3c0ef 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -999,6 +999,68 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = + transA ? rocblas_operation_none : rocblas_operation_transpose; + rocblas_operation cuTransB = + transB ? 
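// ---------------------------------------------------------------------------
// The fp16 batched path above insists on compute capability >= 53 and the
// bf16 paths on >= 80 plus a CUDA 11+ toolkit (CUDA_R_16BF). A small sketch
// of how such a guard can be derived straight from the device properties
// (assumes the current device is the one behind the GPU context; the function
// name is illustrative):
#include <cuda_runtime.h>

inline bool GpuSupportsBf16Gemm() {
  int dev = 0;
  cudaDeviceProp prop;
  if (cudaGetDevice(&dev) != cudaSuccess ||
      cudaGetDeviceProperties(&prop, dev) != cudaSuccess) {
    return false;
  }
  return prop.major >= 8;  // Ampere (SM 8.0) or newer.
}
// ---------------------------------------------------------------------------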
rocblas_operation_none : rocblas_operation_transpose; + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "rocblas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + C, + rocblas_datatype_bf16_r, + ldc, + rocblas_datatype_f32_r, + algo, + 0, + 0)); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { @@ -1128,6 +1190,159 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float16 alpha, + const float16 *A, + const float16 *B, + float16 beta, + float16 *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_hgemm_strided_batched( + handle, + cuTransB, + cuTransA, + N, + M, + K, + reinterpret_cast(&alpha), + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(&beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); + }); +} + +// note(wangran16): unknown bug. parameters dislocation when calling +// GEMM_STRIDED_BATCH and GEMM_STRIDED_BATCH +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + float alpha, + const float *A, + const float *B, + float beta, + float *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? 
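// ---------------------------------------------------------------------------
// Addressing used by the strided-batched rocBLAS/cuBLAS calls above: batch i
// reads A + i * strideA and B + i * strideB and writes C + i * strideC, which
// is why a contiguous row-major output simply uses strideC = M * N, e.g.
//
//   const float* A_i = A + static_cast<int64_t>(i) * strideA;
//   const float* B_i = B + static_cast<int64_t>(i) * strideB;
//   float*       C_i = C + static_cast<int64_t>(i) * M * N;
//
// (i, A, B, C and the strides refer to the surrounding BatchedGEMM arguments.)
// ---------------------------------------------------------------------------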
rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_sgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, + int M, + int N, + int K, + double alpha, + const double *A, + const double *B, + double beta, + double *C, + int batchCount, + int64_t strideA, + int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + const int64_t strideC = M * N; + context_.CublasCall([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::rocblas_dgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); + }); +} + template <> template <> inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h new file mode 100644 index 0000000000000..37229fc0daff1 --- /dev/null +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -0,0 +1,1149 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + +#include "glog/logging.h" + +#include // NOLINT +#include "cuda.h" // NOLINT +#include "paddle/phi/backends/dynload/cublasLt.h" +#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/gpu_timer.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" + +DECLARE_int64(cublaslt_exhaustive_search_times); +#endif + +namespace phi { +namespace funcs { + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + +// Set this enum according to +// https://docs.nvidia.com/cuda/cublas/index.html#cublasltepilogue-t +// While kMatmul, kMatmulGrad, kMatmulGradWithoutBias share the same +// enum value, but if all elements for MatmulPlanner->GetKey() is same, +// no matter forward or backward, they could share the same descriptor +// cache, in that the descriptor is for description of matmul operation. 
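// ---------------------------------------------------------------------------
// Sketch of the caching idea described above: the descriptor cache is indexed
// by a single size_t derived from every field that changes the matmul
// configuration (shapes, transposes, dtype, fused type, ...). In this header
// phi::autotune::GetKey plays that role; the hash-combine below only
// illustrates the concept and is not the exact implementation.
#include <cstdint>
#include <functional>
#include <vector>

inline size_t MatmulKeySketch(const std::vector<int64_t>& x_dims,
                              const std::vector<int64_t>& y_dims,
                              bool trans_x, bool trans_y,
                              int dtype, int fused_type) {
  size_t seed = 0;
  auto combine = [&seed](size_t v) {
    seed ^= v + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  };
  for (int64_t d : x_dims) combine(std::hash<int64_t>()(d));
  for (int64_t d : y_dims) combine(std::hash<int64_t>()(d));
  combine(static_cast<size_t>(trans_x));
  combine(static_cast<size_t>(trans_y));
  combine(static_cast<size_t>(dtype));
  combine(static_cast<size_t>(fused_type));
  return seed;
}
// ---------------------------------------------------------------------------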
+enum MatmulFusedType { + kMatmul = 0, + kMatmulGrad = 1, + kMatmulGradWithoutBias = 2, + kMatmulBias = 3, + kMatmulRelu = 4, + kMatmulGelu = 5, + kMatmulBiasRelu = 6, + kMatmulBiasGelu = 7, + kMatmulBiasReluWithReservedData = 8, + kMatmulBiasGeluWithReservedData = 9, + kMatmulReluGrad = 10, + kMatmulGeluGrad = 11, + kMatmulBiasGradToA = 12, + kMatmulBiasGradToB = 13, +}; + +static cublasLtEpilogue_t ConvertFusedType(MatmulFusedType fused_type) { + static std::map fused_type_map = { + {MatmulFusedType::kMatmul, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulGrad, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulGradWithoutBias, CUBLASLT_EPILOGUE_DEFAULT}, + {MatmulFusedType::kMatmulBias, CUBLASLT_EPILOGUE_BIAS}, + {MatmulFusedType::kMatmulRelu, CUBLASLT_EPILOGUE_RELU}, + {MatmulFusedType::kMatmulGelu, CUBLASLT_EPILOGUE_GELU}, + {MatmulFusedType::kMatmulBiasRelu, CUBLASLT_EPILOGUE_RELU_BIAS}, + {MatmulFusedType::kMatmulBiasGelu, CUBLASLT_EPILOGUE_GELU_BIAS}, + {MatmulFusedType::kMatmulBiasReluWithReservedData, + CUBLASLT_EPILOGUE_RELU_AUX_BIAS}, + {MatmulFusedType::kMatmulBiasGeluWithReservedData, + CUBLASLT_EPILOGUE_GELU_AUX_BIAS}, +#if CUDA_VERSION >= 11060 + {MatmulFusedType::kMatmulReluGrad, CUBLASLT_EPILOGUE_DRELU}, + {MatmulFusedType::kMatmulGeluGrad, CUBLASLT_EPILOGUE_DGELU}, + {MatmulFusedType::kMatmulBiasGradToA, CUBLASLT_EPILOGUE_BGRADA}, + {MatmulFusedType::kMatmulBiasGradToB, CUBLASLT_EPILOGUE_BGRADB} +#endif + }; + + return fused_type_map[fused_type]; +} + +enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; + +template +struct FusedGEMMGradTrait; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDX; + static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradATrans = false; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradB = FusedGEMMGradInType::kDY; + static constexpr auto kXGradATrans = false; + static constexpr auto kXGradBTrans = false; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = false; +}; + +template <> +struct FusedGEMMGradTrait { + static constexpr auto kXGradA = FusedGEMMGradInType::kDY; + static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; + static constexpr auto kXGradATrans = true; + static constexpr auto kXGradBTrans = true; + + static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; + static constexpr auto kYGradB = FusedGEMMGradInType::kDX; + static constexpr auto kYGradATrans = true; + static constexpr auto kYGradBTrans = true; +}; + +// To tell any matmul or fused matmul operation from each 
other. +struct MatmulPlanner { + public: + const void* bias{nullptr}; + void* aux_data{nullptr}; + + MatmulPlanner() {} + MatmulPlanner(const std::vector& x_dims, + const std::vector& y_dims, + const bool trans_x, + const bool trans_y, + phi::DataType dtype, + MatmulFusedType fused_type, + const void* bias_data = nullptr, + void* reserve_data = nullptr, // Commonly for ReLu bit-mask. + bool use_addto = false, + bool no_exchange = true) + : bias(bias_data), aux_data(reserve_data), fused_type_(fused_type) { + use_addto_ = use_addto; + key_ = phi::autotune::GetKey(x_dims, + y_dims, + static_cast(trans_x), + static_cast(trans_y), + static_cast(dtype), + static_cast(fused_type_), + static_cast(use_addto_), + static_cast(no_exchange)); + } + + bool UseAddTo() const { return use_addto_; } + size_t GetKey() const { return key_; } + MatmulFusedType GetFusedType() const { return fused_type_; } + + size_t GenSubKey() const { return key_; } + + private: + MatmulFusedType fused_type_; + bool use_addto_; + size_t key_; +}; + +template +cublasComputeType_t GetCudaComputeType() { + if (std::is_same::value) { + return CUBLAS_COMPUTE_64F; + } else if (std::is_same::value) { + return CUBLAS_COMPUTE_32I; + } else { + return CUBLAS_COMPUTE_32F; + } +} + +struct MatmulDescriptor { + public: + cublasLtMatmulDesc_t op_desc{nullptr}; + cublasLtMatrixLayout_t x_desc{nullptr}; + cublasLtMatrixLayout_t y_desc{nullptr}; + cublasLtMatrixLayout_t out_desc{nullptr}; + cublasLtMatmulAlgo_t* algo{nullptr}; + bool is_cached{false}; + + MatmulDescriptor() {} + MatmulDescriptor(const MatmulDescriptor& obj) { + algo = obj.algo; + x_desc = obj.x_desc; + y_desc = obj.y_desc; + op_desc = obj.op_desc; + out_desc = obj.out_desc; + is_cached = obj.is_cached; + } + + MatmulDescriptor& operator=(const MatmulDescriptor& obj) { + algo = obj.algo; + x_desc = obj.x_desc; + y_desc = obj.y_desc; + op_desc = obj.op_desc; + out_desc = obj.out_desc; + is_cached = obj.is_cached; + + return *this; + } + + ~MatmulDescriptor() PADDLE_MAY_THROW { + if (!is_cached) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutDestroy(out_desc)); + delete algo; + + op_desc = nullptr; + x_desc = nullptr; + y_desc = nullptr; + out_desc = nullptr; + algo = nullptr; + } + } + + // x_desc, y_desc, op_desc are allocated in heap memory. 
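  // Lifetime note implied by is_cached: a descriptor starts out on the stack
  // inside DescriptorSetter; when auto-tuning promotes it, SetAlgo() marks it
  // is_cached = true and a shallow copy is stored in AutoTuneCache. Original
  // and copy then share the same cublasLt handles, so only descriptors that
  // were never cached release those handles in the destructor above.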
+ template + void Create(const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner, + const int batch_size = 1, + const int64_t stride_x = 0, + const int64_t stride_y = 0, + const int64_t stride_out = 0, + bool grad_for_dx = true) { + using MT = typename phi::dtype::MPTypeTrait::Type; + cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t out_mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); + cublasComputeType_t compute_type = GetCudaComputeType(); + + if (std::is_same::value) { + out_mat_type = phi::backends::gpu::ToCudaDataType(); + scale_type = phi::backends::gpu::ToCudaDataType(); + } + + // Create operation descriptor; see cublasLtMatmulDescAttributes_t for + // details about defaults; just need to set the transforms for A and B + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); + SetFusedEpilogueOpDescriptor(planner, trans_x, trans_y, N); + + // Create matrix descriptors + CreateMatrixLayout(&x_desc, mat_type, M, K, trans_x); + CreateMatrixLayout(&y_desc, mat_type, K, N, trans_y); + CreateMatrixLayout(&out_desc, out_mat_type, M, N, false); + + // Config batch size and stride. + if (batch_size > 1) { + SetBatchAndStride(x_desc, batch_size, stride_x); + SetBatchAndStride(y_desc, batch_size, stride_y); + SetBatchAndStride(out_desc, batch_size, stride_out); + } + } + + cublasLtMatmulAlgo_t* SetAlgo() { + // while entering this function, the desc shall be cached. + is_cached = true; + algo = new cublasLtMatmulAlgo_t; + return algo; + } + + template + void SetFusedEpiloguePtr(phi::funcs::MatmulPlanner* planner) { + if (planner->bias != nullptr) { + const T* bias_data = static_cast(planner->bias); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias_data, + sizeof(bias_data))); + } + if (planner->aux_data != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &(planner->aux_data), + sizeof(planner->aux_data))); + } + } + + std::string GetDescResultString(std::string prefix, + bool has_algo = true) const { + std::ostringstream out; + out << prefix << " \n"; +#define GET_DESC_DATA_STRING(src) \ + do { \ + out << " " << #src << " = ["; \ + int num = sizeof((*src)) / sizeof(src->data[0]); \ + for (int i = 0; i < num; ++i) { \ + if (i == 0) { \ + out << src->data[i]; \ + } else { \ + out << ", " << src->data[i]; \ + } \ + } \ + out << "]\n"; \ + } while (0); + + if (has_algo) { + GET_DESC_DATA_STRING(algo); + } + GET_DESC_DATA_STRING(x_desc); + GET_DESC_DATA_STRING(y_desc); + GET_DESC_DATA_STRING(out_desc); + GET_DESC_DATA_STRING(op_desc); +#undef GET_DESC_DATA_STRING + return out.str(); + } + + void ExchangeXYDesc(bool no_exchange) {} + + protected: + void SetFusedEpilogueOpDescriptor(phi::funcs::MatmulPlanner* planner, + const bool trans_x, + const bool trans_y, + int64_t lead_dim) { + cublasOperation_t cublas_trans_x = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cublas_trans_y = trans_y ? 
CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &cublas_trans_x, + sizeof(cublas_trans_x))); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_TRANSA, + &cublas_trans_y, + sizeof(cublas_trans_y))); + MatmulFusedType fused_type = planner->GetFusedType(); + if (fused_type != MatmulFusedType::kMatmul) { + cublasLtEpilogue_t cublaslt_fused_type = ConvertFusedType(fused_type); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescSetAttribute(op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE, + &cublaslt_fused_type, + sizeof(fused_type))); + } + if (planner->aux_data) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( + op_desc, + CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, + &lead_dim, + sizeof(lead_dim))); + } + } + + void CreateMatrixLayout(cublasLtMatrixLayout_t* desc, + cudaDataType type, + uint64_t rows, + uint64_t cols, + bool trans) { + if (trans) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutCreate(desc, type, rows, cols, rows)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatrixLayoutCreate(desc, type, cols, rows, cols)); + } + } + + void SetBatchAndStride(cublasLtMatrixLayout_t desc, + int batch_size, + int64_t stride) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, + &batch_size, + sizeof(batch_size))); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( + desc, + CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + &stride, + sizeof(stride))); + } +}; + +struct MatmulGradDescriptor : MatmulDescriptor { + public: + MatmulGradDescriptor() {} + + template + void Create(const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner, + const int batch_size = 1, + int64_t stride_x = 0, + int64_t stride_y = 0, + int64_t stride_out = 0, + bool grad_for_dx = true) { + using MT = typename phi::dtype::MPTypeTrait::Type; + cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); + cublasComputeType_t compute_type = GetCudaComputeType(); + + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); + this->SetFusedEpilogueOpDescriptor( + planner, trans_x, trans_y, TransX ? 
M : K); + + // Create operation desciriptor; see cublasLtMatmulDescAttributes_t for + // details about defaults; just need to set the transforms for A and B + this->CreateMatrixLayout(&x_desc, mat_type, N, M, true); + if (grad_for_dx) { + this->CreateMatrixLayout(&y_desc, mat_type, K, N, TransY); + this->CreateMatrixLayout( + &out_desc, phi::backends::gpu::ToCudaDataType(), M, K, TransX); + } else { + this->CreateMatrixLayout(&y_desc, mat_type, M, K, TransX); + this->CreateMatrixLayout( + &out_desc, phi::backends::gpu::ToCudaDataType(), K, N, TransY); + } + } + + void ExchangeXYDesc(bool no_exchange) { + if (no_exchange) { + return; + } + auto* temp = y_desc; + y_desc = x_desc; + x_desc = temp; + } +}; + +template +struct CublasLtBase { + public: + using MT = typename phi::dtype::MPTypeTrait::Type; + static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, + size_t workspace_size) { + return paddle::memory::Alloc( + ctx.GetPlace(), + workspace_size, + phi::Stream(reinterpret_cast(ctx.stream()))); + } + + static void RunImpl(const phi::GPUContext& ctx, + MatmulDescT* desc, + const size_t sub_key, + const T* x_ptr, + const T* y_ptr, + OutT* out_ptr, + phi::funcs::MatmulPlanner* planner) { + MT alpha = static_cast(1); + MT beta = planner->UseAddTo() ? static_cast(1) : static_cast(0); + cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); + + // NOTE(limingshu): As workspace_size varies from different DL framework, + // I wonder is there any smarter idea for workspace setting, currently I + // just followed the settings from the NVIDIA colleague`s setting. + size_t workspace_size = static_cast(4) * 1024 * 1024; + // phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, + // workspace_size); + void* workspace_ptr = ctx.GetWorkSpacePtr(workspace_size); + + if (planner != nullptr) { + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && + (!desc->is_cached)) { + SearchBestAlgo(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace_ptr, + workspace_size); + MatmulDescT* best_desc = new MatmulDescT(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } + } + + VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmul(cublaslt_handle, + desc->op_desc, + static_cast(&alpha), + y_ptr, + desc->y_desc, + x_ptr, + desc->x_desc, + static_cast(&beta), + out_ptr, + desc->out_desc, + out_ptr, + desc->out_desc, + desc->algo, + workspace_ptr, + workspace_size, + ctx.stream())); + } + + static void SearchBestAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescT* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size) { + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( + preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, + sizeof(workspace_size))); + + int returned_results = 0; + constexpr int requested_algo_count = 10; + std::vector heuristic_results( + requested_algo_count); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, + desc->op_desc, + 
desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + preference, + requested_algo_count, + heuristic_results.data(), + &returned_results)); + PADDLE_ENFORCE_GT(returned_results, + 0, + phi::errors::Unavailable("No GEMM algorithm avaliable.")); + int best_algo_idx = -1; + if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { + best_algo_idx = 0; + } else { + float min_time_cost = std::numeric_limits::max(); + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float cur_time_cost = + RunAndMeasureAlgo(ctx, + lt_handle, + desc, + alpha, + beta, + y_data, + x_data, + out_data, + workspace_ptr, + workspace_size, + &(heuristic_results[algo_idx].algo)); + VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx + << "] time: " << cur_time_cost << " s"; + + if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || + (cur_time_cost < min_time_cost)) { + best_algo_idx = algo_idx; + min_time_cost = cur_time_cost; + } + } + } + VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; + + cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); + *best_algo = heuristic_results[best_algo_idx].algo; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceDestroy(preference)); + } + + static float RunAndMeasureAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescT* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size, + cublasLtMatmulAlgo_t* algo) { + int repeats = FLAGS_cublaslt_exhaustive_search_times; + if (repeats <= 0) { + return std::numeric_limits::max(); + } + + phi::GpuTimer timer; + float time_cost = 0.f; + const auto& stream = ctx.stream(); + + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, + desc->op_desc, + alpha, + y_data, + desc->y_desc, + x_data, + desc->x_desc, + beta, + out_data, + desc->out_desc, + out_data, + desc->out_desc, + algo, + workspace_ptr, + workspace_size, + stream)); + timer.Stop(stream); + ctx.Wait(); + auto time = timer.ElapsedTime(); + if (i > 0) { + // Exclude the warmup runtime. + time_cost += time; + } + } + return (time_cost / (repeats - 1)); + } +}; + +template <> +struct CublasLtBase { + public: + static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, + size_t workspace_size) { + return paddle::memory::Alloc( + ctx.GetPlace(), + workspace_size, + phi::Stream(reinterpret_cast(ctx.stream()))); + } + + static void RunImpl(const phi::GPUContext& ctx, + MatmulDescriptor* desc, + const size_t sub_key, + const int8_t* x_ptr, + const int8_t* y_ptr, + int32_t* out_ptr, + phi::funcs::MatmulPlanner* planner) { + int32_t alpha = 1; + int32_t beta = + planner->UseAddTo() ? 
static_cast(1) : static_cast(0); + cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); + + size_t workspace_size = static_cast(4) * 1024 * 1024; + // phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, + // workspace_size); + void* workspace_ptr = ctx.GetWorkSpacePtr(workspace_size); + + if (planner != nullptr) { + if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && + (!desc->is_cached)) { + SearchBestAlgo(ctx, + cublaslt_handle, + desc, + static_cast(&alpha), + static_cast(&beta), + y_ptr, + x_ptr, + out_ptr, + workspace_ptr, + workspace_size); + MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); + VLOG(6) << best_desc->GetDescResultString( + "[Searched CublasltDescriptor] "); + + auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); + } + } + + VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmul(cublaslt_handle, + desc->op_desc, + static_cast(&alpha), + y_ptr, + desc->y_desc, + x_ptr, + desc->x_desc, + static_cast(&beta), + out_ptr, + desc->out_desc, + out_ptr, + desc->out_desc, + desc->algo, + workspace_ptr, + workspace_size, + ctx.stream())); + } + + static void SearchBestAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescriptor* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + void* workspace_ptr, + size_t workspace_size) { + cublasLtMatmulPreference_t preference; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceCreate(&preference)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( + preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspace_size, + sizeof(workspace_size))); + + int returned_results = 0; + constexpr int requested_algo_count = 10; + std::vector heuristic_results( + requested_algo_count); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, + desc->op_desc, + desc->y_desc, + desc->x_desc, + desc->out_desc, + desc->out_desc, + preference, + requested_algo_count, + heuristic_results.data(), + &returned_results)); + PADDLE_ENFORCE_GT(returned_results, + 0, + phi::errors::Unavailable("No GEMM algorithm avaliable.")); + int best_algo_idx = -1; + if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { + best_algo_idx = 0; + } else { + float min_time_cost = std::numeric_limits::max(); + for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { + float cur_time_cost = + RunAndMeasureAlgo(ctx, + lt_handle, + desc, + alpha, + beta, + y_data, + x_data, + out_data, + workspace_ptr, + workspace_size, + &(heuristic_results[algo_idx].algo)); + VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx + << "] time: " << cur_time_cost << " s"; + + if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || + (cur_time_cost < min_time_cost)) { + best_algo_idx = algo_idx; + min_time_cost = cur_time_cost; + } + } + } + VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; + + cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); + *best_algo = heuristic_results[best_algo_idx].algo; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cublasLtMatmulPreferenceDestroy(preference)); + } + + static float RunAndMeasureAlgo(const phi::GPUContext& ctx, + const cublasLtHandle_t& lt_handle, + MatmulDescriptor* desc, + const void* alpha, + const void* beta, + const void* y_data, + const void* x_data, + void* out_data, + 
void* workspace_ptr, + size_t workspace_size, + cublasLtMatmulAlgo_t* algo) { + int repeats = FLAGS_cublaslt_exhaustive_search_times; + if (repeats <= 0) { + return std::numeric_limits::max(); + } + + phi::GpuTimer timer; + float time_cost = 0.f; + const auto& stream = ctx.stream(); + + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, + desc->op_desc, + alpha, + y_data, + desc->y_desc, + x_data, + desc->x_desc, + beta, + out_data, + desc->out_desc, + out_data, + desc->out_desc, + algo, + workspace_ptr, + workspace_size, + stream)); + timer.Stop(stream); + ctx.Wait(); + auto time = timer.ElapsedTime(); + if (i > 0) { + // Exclude the warmup runtime. + time_cost += time; + } + } + return (time_cost / (repeats - 1)); + } +}; + +// To judge if desc is cached or not. +template +struct DescriptorSetter { + public: + DescT desc; + size_t sub_key{std::numeric_limits::min()}; + + DescriptorSetter(phi::funcs::MatmulPlanner* planner, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + const int batch_size = 1, + int64_t stride_x = 0, + int64_t stride_y = 0, + int64_t stride_out = 0, + const bool no_exchange = true, + bool grad_for_dx = true) { + if (std::is_same::value) { + if (!trans_x && !trans_y) { + PADDLE_ENFORCE_EQ( + (N % 4 == 0 || N == 1), + true, + phi::errors::InvalidArgument( + "The dimension size N used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + N)); + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } else if (!trans_x && trans_y) { + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } else if (trans_x && !trans_y) { + PADDLE_ENFORCE_EQ( + (M % 4 == 0 || M == 1), + true, + phi::errors::InvalidArgument( + "The dimension size M used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + M)); + PADDLE_ENFORCE_EQ( + (N % 4 == 0 || N == 1), + true, + phi::errors::InvalidArgument( + "The dimension size N used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + N)); + } else { + PADDLE_ENFORCE_EQ( + (M % 4 == 0 || M == 1), + true, + phi::errors::InvalidArgument( + "The dimension size M used in int8 matmul must be 1 or a " + "multiple of 4 does not " + "match the size (%d) currently contained in the container.", + M)); + PADDLE_ENFORCE_EQ( + (K % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 matmul must be a multiple " + "of 4 does not " + "match the size (%d) currently contained in the container.", + K)); + } + } + + if (planner != nullptr) { + sub_key = planner->GenSubKey(); + } + + auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); + if (mamtul_cache.FindSubKey(sub_key)) { + desc = *(reinterpret_cast(mamtul_cache.GetSubKey(sub_key))); + desc.template SetFusedEpiloguePtr(planner); + VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] "); + } else { + desc.template Create(M, + N, + K, + trans_x, + trans_y, + planner, + 
batch_size, + stride_x, + stride_y, + stride_out, + grad_for_dx); + desc.ExchangeXYDesc(no_exchange); + if (planner != nullptr) { + desc.template SetFusedEpiloguePtr(planner); + } + VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false); + } + } +}; + +// For matmul with kernels autotune +template +struct MatmulWithCublasLt : public CublasLtBase { + public: + static void Run(const phi::GPUContext& ctx, + const T* x_data, + const T* y_data, + OutT* out_data, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + phi::funcs::MatmulPlanner* planner = nullptr) { + auto setter = DescriptorSetter( + planner, M, N, K, trans_x, trans_y); + CublasLtBase::RunImpl( + ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + } + + static void RunWithBatch(const phi::GPUContext& ctx, + const T* x_data, + const T* y_data, + OutT* out_data, + const int64_t M, + const int64_t N, + const int64_t K, + bool trans_x, + bool trans_y, + int batch_size, + int64_t stride_x, + int64_t stride_y, + int64_t stride_out, + phi::funcs::MatmulPlanner* planner = nullptr) { + auto setter = DescriptorSetter(planner, + M, + N, + K, + trans_x, + trans_y, + batch_size, + stride_x, + stride_y, + stride_out); + CublasLtBase::RunImpl( + ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); + } + + static void RunWithBatch(const phi::GPUContext& ctx, + const T** x_data, + const T** y_data, + OutT** out_data, + const int64_t M, + const int64_t N, + const int64_t K, + bool trans_x, + bool trans_y, + int batch_size, + phi::funcs::MatmulPlanner* planner = nullptr) { + for (int i = 0; i < batch_size; ++i) { + Run(ctx, + x_data[i], + y_data[i], + out_data[i], + M, + N, + K, + trans_x, + trans_y, + planner); + } + } +}; + +// As for just Linear fused ephilogue below: out = matmul(x, y) + bias. +template +struct LinearWithCublasLt : public CublasLtBase { + static void Run(const phi::GPUContext& ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* out, + const void* bias_data, + void* reserve_data, + const int64_t M, + const int64_t N, + const int64_t K, + const bool trans_x, + const bool trans_y, + const MatmulFusedType fused_type) { + auto planner = phi::funcs::MatmulPlanner( + vectorize(x->dims()), + vectorize(y->dims()), + trans_x, + trans_y, + paddle::experimental::CppTypeToDataType::Type(), + fused_type, + bias_data, + reserve_data); + auto setter = DescriptorSetter( + &planner, M, N, K, trans_x, trans_y); + CublasLtBase::RunImpl(ctx, + &setter.desc, + setter.sub_key, + x->data(), + y->data(), + out->data(), + &planner); + } +}; + +template +struct LinearGradWithCublasLt : public CublasLtBase { + static void Run( + const phi::GPUContext& ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* y, + phi::DenseTensor* out, + const void* bias_data, + void* reserve_data, + const int64_t M, + const int64_t N, + const int64_t K, + const MatmulFusedType fused_type, + const bool trans_x, + const bool trans_y, + const bool use_addto, + const bool no_exchange, // exchange x_desc and y_desc for grad. 
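// ---------------------------------------------------------------------------
// Hedged usage sketch for the MatmulWithCublasLt::Run entry point defined
// above. dev_ctx, x, y, out and M/N/K are assumed to exist in the calling
// kernel, and the explicit template argument and FLOAT16 dtype are
// illustrative; the argument order follows the signatures in this header.
//
//   phi::funcs::MatmulPlanner planner(phi::vectorize(x.dims()),
//                                     phi::vectorize(y.dims()),
//                                     /*trans_x=*/false, /*trans_y=*/false,
//                                     phi::DataType::FLOAT16,
//                                     phi::funcs::MatmulFusedType::kMatmul);
//   phi::funcs::MatmulWithCublasLt<phi::dtype::float16>::Run(
//       dev_ctx,
//       x.data<phi::dtype::float16>(),
//       y.data<phi::dtype::float16>(),
//       out->data<phi::dtype::float16>(),
//       M, N, K, /*trans_x=*/false, /*trans_y=*/false, &planner);
// ---------------------------------------------------------------------------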
+ bool grad_for_dx = true) { + auto planner = phi::funcs::MatmulPlanner( + vectorize(x->dims()), + vectorize(y->dims()), + trans_x, + trans_y, + paddle::experimental::CppTypeToDataType::Type(), + fused_type, + bias_data, + reserve_data, + use_addto, + no_exchange); + auto setter = + DescriptorSetter( + &planner, + M, + N, + K, + trans_x, + trans_y, + /*batch_size=*/1, + /*stride_x=*/0, + /*stride_y=*/0, + /*stride_out=*/0, + /*exchange_x_y_desc=*/no_exchange, + /*grad_for_dx=*/grad_for_dx); + + // To setting data type for different kinda out_data. + if (grad_for_dx) { + CublasLtBase::RunImpl( + ctx, + &setter.desc, + setter.sub_key, + no_exchange ? x->data() : y->data(), + no_exchange ? y->data() : x->data(), + out->data(), + &planner); + } else { + CublasLtBase::RunImpl( + ctx, + &setter.desc, + setter.sub_key, + no_exchange ? x->data() : y->data(), + no_exchange ? y->data() : x->data(), + out->data(), + &planner); + } + } +}; +#else +// A void structure just for successfully compile. +struct MatmulPlanner {}; +#endif // (PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 0458f0d83ed1a..1b1814ec0ae2b 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -101,7 +101,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, gpu_type); }); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCsrSetStridedBatch( *descriptor, batch_size, M + 1, batch_nnz); @@ -109,7 +109,7 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x, #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseCsrSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } } @@ -155,7 +155,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, }); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseCooSetStridedBatch( *descriptor, batch_size, batch_nnz); @@ -163,7 +163,7 @@ inline void CreateCooDescriptor(const phi::SparseCooTensor& x, #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } } @@ -241,7 +241,7 @@ class CuSparseDnMatDescriptor { PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N); if (batch_size > 1) { -#if CUDA_VERSION >= 11070 +#if CUDA_VERSION >= 11080 dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseDnMatSetStridedBatch( descriptor_, batch_size, M * N); @@ -249,7 +249,7 @@ class CuSparseDnMatDescriptor { #else PADDLE_THROW(phi::errors::Unimplemented( "Batch Sparse matmul use 'cusparseDnMatSetStridedBatch', which is " - "supported from CUDA 11.7")); + "supported from CUDA 11.8")); #endif } VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_; @@ -379,7 +379,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else CUSPARSE_MV_ALG_DEFAULT, +#endif &buffer_size); }); @@ -395,7 +399,11 @@ void SparseBlas::SPMV(bool transa, &beta, out_descriptor.descriptor(), gpu_type, +#if CUDA_VERSION >= 11040 + CUSPARSE_SPMV_ALG_DEFAULT, +#else 
CUSPARSE_MV_ALG_DEFAULT, +#endif tmp_buffer_ptr); }); } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index 7ecf352ffe996..700ce21caf2ba 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -14,9 +14,7 @@ #include "paddle/phi/kernels/graph_send_recv_kernel.h" -#include -#include - +#include "paddle/phi/kernels/funcs/math_function.h" #include #include @@ -59,17 +57,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, cudaMemset(p_output, 0, memset_bytes); #endif } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, - p_output_ptr, - p_output_ptr + memset_size, - std::numeric_limits::min()); + phi::funcs::set_constant(ctx, out, std::numeric_limits::min()); } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, - p_output_ptr, - p_output_ptr + memset_size, - std::numeric_limits::max()); + phi::funcs::set_constant(ctx, out, std::numeric_limits::max()); } if (index_size == 0) return; diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 2753937eb7142..b6c13360cd404 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/matmul_grad_kernel.h" - #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" +#include "paddle/phi/kernels/matmul_grad_kernel.h" PD_REGISTER_KERNEL(matmul_grad, GPU, diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index e5de7966c2ec4..32d70ae0763f0 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/matmul_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" +#include "paddle/phi/kernels/matmul_kernel.h" PD_REGISTER_KERNEL(matmul, GPU, diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index 3d44c9af03c07..c52555c38e5a3 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -21,12 +21,13 @@ #include #include #include - +#include #include #include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "cub/cub.cuh" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/unique_functor.h" @@ -194,22 +195,29 @@ static void UniqueFlattendCUDATensor(const Context& context, indices->Resize(phi::make_ddim({num_input})); auto* indices_data = context.template Alloc(indices); - thrust::sequence(thrust::device, indices_data, indices_data + num_input); +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif + + thrust::sequence(exec_policy, indices_data, indices_data + num_input); thrust::sort_by_key( - thrust::device, in_data_hat, in_data_hat + num_input, indices_data); + exec_policy, in_data_hat, in_data_hat + num_input, indices_data); // 1. Calculate op result: 'out' DenseTensor range; range.Resize(phi::make_ddim({num_input + 1})); auto* range_data_ptr = context.template Alloc(&range); - thrust::sequence( - thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + thrust::sequence(exec_policy, range_data_ptr, range_data_ptr + num_input + 1); phi::Copy(context, in_hat, context.GetPlace(), false, out); int num_out; auto out_data = context.template Alloc(out); num_out = thrust::unique_by_key( - thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + exec_policy, out_data, out_data + num_input, range_data_ptr, equal) .first - out_data; out->Resize(phi::make_ddim({num_out})); @@ -221,18 +229,32 @@ static void UniqueFlattendCUDATensor(const Context& context, DenseTensor inv_loc; inv_loc.Resize(phi::make_ddim({num_input})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, in_data_hat, in_data_hat + num_input, inv_loc_data_ptr, not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - inv_loc_data_ptr); - thrust::scatter(thrust::device, +#ifdef PADDLE_WITH_HIP + hipMemset(inv_loc_data_ptr, 0, sizeof(IndexT)); +#else + cudaMemsetAsync(inv_loc_data_ptr, 0, sizeof(IndexT), context.stream()); +#endif + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(NULL, + temp_storage_bytes, + inv_loc_data_ptr, + inv_loc_data_ptr, + num_input, + context.stream()); + auto d_temp_storage = + paddle::memory::Alloc(context.GetPlace(), temp_storage_bytes); + cub::DeviceScan::InclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + inv_loc_data_ptr, + 
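// ---------------------------------------------------------------------------
// The cub::DeviceScan call above follows CUB's two-phase protocol: call once
// with a null temp-storage pointer to obtain the workspace size, allocate
// that many bytes on the device, then repeat the identical call to run the
// scan. Standalone sketch (cudaMalloc stands in for paddle::memory::Alloc;
// the function name is illustrative):
#include <cuda_runtime.h>
#include <cub/cub.cuh>

inline void InclusiveSumSketch(int* d_data, int n, cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: only reports the required temporary-storage size.
  cub::DeviceScan::InclusiveSum(nullptr, temp_bytes, d_data, d_data, n, stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: the actual in-place inclusive scan on the stream.
  cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d_data, d_data, n, stream);
  cudaFree(d_temp);
}
// ---------------------------------------------------------------------------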
inv_loc_data_ptr, + num_input, + context.stream()); + thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + num_input, indices_data, @@ -244,11 +266,11 @@ static void UniqueFlattendCUDATensor(const Context& context, DenseTensor tmp_indices; tmp_indices.Resize(phi::make_ddim({num_input})); auto* tmp_indices_data_ptr = context.template Alloc(&tmp_indices); - thrust::copy(thrust::device, + thrust::copy(exec_policy, in_data_hat, in_data_hat + num_input, tmp_indices_data_ptr); - thrust::unique_by_key(thrust::device, + thrust::unique_by_key(exec_policy, tmp_indices_data_ptr, tmp_indices_data_ptr + num_input, indices_data, @@ -261,10 +283,10 @@ static void UniqueFlattendCUDATensor(const Context& context, counts->Resize(phi::make_ddim({num_out})); auto count_data = context.template Alloc(counts); // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::fill(exec_policy, count_data, count_data + num_out, 0); thrust::device_ptr range_data_ptr_dev(range_data_ptr); range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, range_data_ptr + 1, range_data_ptr + num_out + 1, count_data); @@ -290,24 +312,29 @@ static void ComputeUniqueDims(const Context& context, equal_T equal, not_equal_T not_equal, int64_t row) { +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif // 1. inverse indices: 'inverse' inverse->Resize(phi::make_ddim({row})); auto* inverse_data = context.template Alloc(inverse); DenseTensor inv_loc; inv_loc.Resize(phi::make_ddim({row})); auto inv_loc_data_ptr = context.template Alloc(&inv_loc); - thrust::adjacent_difference(thrust::device, + thrust::adjacent_difference(exec_policy, sorted_indices_data, sorted_indices_data + row, inv_loc_data_ptr, not_equal); thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - inv_loc_data_ptr); - thrust::scatter(thrust::device, + thrust::inclusive_scan( + exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + row, inv_loc_data_ptr); + thrust::scatter(exec_policy, inv_loc_data_ptr, inv_loc_data_ptr + row, sorted_indices_data, @@ -317,9 +344,9 @@ static void ComputeUniqueDims(const Context& context, DenseTensor range; range.Resize(phi::make_ddim({row + 1})); auto range_data_ptr = context.template Alloc(&range); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + thrust::sequence(exec_policy, range_data_ptr, range_data_ptr + row + 1); int num_out; - num_out = thrust::unique_by_key(thrust::device, + num_out = thrust::unique_by_key(exec_policy, sorted_indices_data, sorted_indices_data + row, range_data_ptr, @@ -333,9 +360,9 @@ static void ComputeUniqueDims(const Context& context, // 3. 
counts: 'counts' counts->Resize(phi::make_ddim({num_out})); auto* count_data = context.template Alloc(counts); - thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::fill(exec_policy, count_data, count_data + row, 0); thrust::adjacent_difference( - thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); + exec_policy, range_data_ptr + 1, range_data_ptr + row + 1, count_data); } // Calculate unique when 'axis' is set @@ -384,9 +411,15 @@ static void UniqueDimsCUDATensor(const Context& context, // 2. Calculate 'indices', 'inverse', 'counts' // Init index and sort - thrust::sequence( - thrust::device, sorted_indices_data, sorted_indices_data + row); - thrust::sort(thrust::device, +#ifdef PADDLE_WITH_CUDA + phi::memory_utils::ThrustAllocator allocator(context.GetPlace(), + context.stream()); + const auto& exec_policy = thrust::cuda::par(allocator).on(context.stream()); +#else + const auto& exec_policy = thrust::hip::par.on(context.stream()); +#endif + thrust::sequence(exec_policy, sorted_indices_data, sorted_indices_data + row); + thrust::sort(exec_policy, sorted_indices_data, sorted_indices_data + row, LessThan(col, in_trans_data)); diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index ef70907b59a61..e61f58450b34f 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -254,6 +254,8 @@ void ConvCudnnGradGradKernel( auto dtype = paddle::platform::CudnnDataType::type; auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat( + paddle::platform::DataLayout::kNCHW); paddle::operators::ConvArgs args1{&transformed_ddX, W, @@ -261,28 +263,36 @@ void ConvCudnnGradGradKernel( strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args2{&transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args3{&transformed_ddX, dW, &transformed_dO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; paddle::operators::ConvArgs args4{&transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + paddle::platform::DataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult fwd_result1; @@ -298,9 +308,6 @@ void ConvCudnnGradGradKernel( filter_result; #endif - auto layout = paddle::platform::GetCudnnTensorFormat( - paddle::platform::DataLayout::kNCHW); - // ddo = conv(ddI, W) + conv(I, ddW) size_t workspace_size = 0; @@ -329,7 +336,7 @@ void ConvCudnnGradGradKernel( #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -357,7 +364,7 @@ void ConvCudnnGradGradKernel( #else using search2 = paddle::operators::SearchAlgorithm; - fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif @@ -387,7 +394,7 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; filter_result = 
- search3::Find(args3, exhaustive_search, deterministic, ctx); + search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -417,7 +424,7 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; data_result = - search4::Find(args4, exhaustive_search, deterministic, ctx); + search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 4e9c37879c002..2d61ec6e62c9c 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -251,27 +251,33 @@ void ConvCudnnGradKernel(const Context& ctx, T* input_grad_data = nullptr; T* transformed_input_grad_data = nullptr; + paddle::platform::DataLayout layout = + compute_format == paddle::platform::DataLayout::kNHWC + ? paddle::platform::DataLayout::kNHWC + : paddle::platform::DataLayout::kNCHW; + paddle::operators::ConvArgs args1{&transformed_input_grad, &transformed_filter_channel, &transformed_output_grad_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + layout}; paddle::operators::ConvArgs args2{&transformed_input, &transformed_filter_grad_channel, &transformed_output_grad_channel, strides, padding_common, dilations, - dtype}; + dtype, + groups, + layout}; auto handle = ctx.cudnn_handle(); // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout - paddle::platform::DataLayout layout = - compute_format == paddle::platform::DataLayout::kNHWC - ? paddle::platform::DataLayout::kNHWC - : paddle::platform::DataLayout::kNCHW; + if (transformed_input.dims().size() == 5) { layout = compute_format == paddle::platform::DataLayout::kNHWC ? 
paddle::platform::DataLayout::kNDHWC @@ -367,9 +373,8 @@ void ConvCudnnGradKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max( - workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo)); + bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); + workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); #endif } @@ -397,11 +402,10 @@ void ConvCudnnGradKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; filter_result = - search2::Find(args2, exhaustive_search, deterministic, ctx); + search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " << filter_result.time; - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo)); + workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size); #endif } diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index bd95a32bc724f..80544025ff738 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -56,8 +56,7 @@ void ConvCudnnKernel(const Context& ctx, bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, false, phi::errors::InvalidArgument( "Cann't set exhaustive_search True and " @@ -213,7 +212,9 @@ void ConvCudnnKernel(const Context& ctx, strides, padding_common, dilations, - dtype}; + dtype, + groups, + compute_format}; auto handle = ctx.cudnn_handle(); auto workspace_handle = ctx.cudnn_workspace_handle(); @@ -313,8 +314,8 @@ void ConvCudnnKernel(const Context& ctx, paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, fwd_result.algo); + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); + workspace_size = fwd_result.workspace_size; #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 0ce16f66becfa..36a3caf97eb94 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -179,14 +179,18 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + layout}; paddle::operators::ConvArgs args2{&transformed_dout, &filter, &x_transpose, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + layout}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult fwd_result; @@ -226,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result = search1::Find(args1, false, deterministic, ctx); + fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif @@ -253,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else 
using search2 = paddle::operators::SearchAlgorithm; - filter_result = search2::Find(args2, false, deterministic, ctx); + filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif @@ -625,6 +629,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( auto dtype = paddle::platform::CudnnDataType::type; auto handle = ctx.cudnn_handle(); + auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); paddle::operators::ConvArgs args1{&transformed_ddout_channel, &filter, @@ -632,14 +637,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args2{&transformed_ddout_channel, &ddfilter, &transformed_x, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args3{&transformed_dout, dfilter, @@ -647,14 +656,18 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; paddle::operators::ConvArgs args4{&transformed_dout, &ddfilter, &transformed_dx_channel, strides, padding_common, dilations_, - dtype}; + dtype, + groups, + GPUDNNDataLayout::kNCHW}; #ifdef PADDLE_WITH_HIP paddle::operators::SearchResult bwd_result1; paddle::operators::SearchResult bwd_result2; @@ -669,8 +682,6 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( paddle::operators::SearchResult fwd_result; #endif - auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); - // ddo = conv(ddI, filter) + conv(I, ddfilter) size_t workspace_size = 0; @@ -699,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result1 = search1::Find(args1, false, deterministic, ctx); + bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -723,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search2 = paddle::operators::SearchAlgorithm; - bwd_result2 = search2::Find(args2, false, deterministic, ctx); + bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif @@ -750,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search3 = paddle::operators::SearchAlgorithm; - filter_result = search3::Find(args3, false, deterministic, ctx); + filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -778,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search4 = paddle::operators::SearchAlgorithm; - fwd_result = search4::Find(args4, false, deterministic, ctx); + fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 58ead4c3287f8..5aa7bd60a0aa8 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -205,7 +205,9 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, strides, padding_common, dilations_, - dtype}; + 
dtype, + groups, + data_layout}; args.handle = handle; args.idesc.set(transformed_out, iwo_groups); args.wdesc.set(filter, layout_tensor, iwo_groups); @@ -228,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - bwd_result = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index be32f85fe99a4..6c75ab86d7c4c 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -100,7 +100,7 @@ void MatMul(const Context& dev_ctx, const DenseTensor& b, bool trans_b, DenseTensor* out, - bool flag = false) { + bool flag) { dev_ctx.template Alloc(out); auto blas = phi::funcs::GetBlas(dev_ctx); auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -120,6 +120,32 @@ void MatMul(const Context& dev_ctx, dev_ctx.template Alloc(out), static_cast(flag)); } +template +void MatMul(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + const DenseTensor& b, + bool trans_b, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto blas = phi::funcs::GetBlas(dev_ctx); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); + if (a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a.data(), + mat_dim_a, + b.data(), + mat_dim_b, + static_cast(1), + dev_ctx.template Alloc(out), + static_cast(false)); +} /** * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index 99257ce4a6adf..6e2e8e3634c6e 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -478,12 +478,24 @@ void MatMulFunction(const Context& dev_ctx, DenseTensor* Out, bool trans_x, bool trans_y, - bool flag = false) { + bool flag) { const std::vector x_dims = vectorize(X.dims()); const std::vector y_dims = vectorize(Y.dims()); MatMulFunction( dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, flag); } +template +void MatMulFunction(const Context& dev_ctx, + const DenseTensor& X, + const DenseTensor& Y, + DenseTensor* Out, + bool trans_x, + bool trans_y) { + const std::vector x_dims = vectorize(X.dims()); + const std::vector y_dims = vectorize(Y.dims()); + MatMulFunction( + dev_ctx, X, Y, x_dims, y_dims, Out, trans_x, trans_y, false); +} template void MatmulKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index a8e88f351ccbc..389737037a38e 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/coalesce_kernel.h" - +#include +#include #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 1eae4be579aa7..9c35964587cd9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -27,6 +27,7 @@ from .lamb_optimizer import LambOptimizer from .fp16_allreduce_optimizer import FP16AllReduceOptimizer from .sharding_optimizer import ShardingOptimizer +from .sharding_optimizer import ThreadShardingOptimizer from .dygraph_optimizer import HybridParallelOptimizer from .dygraph_optimizer import HeterParallelOptimizer from .dygraph_optimizer import HybridParallelGradScaler diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py index 7002dfa2be514..33c4d01e4daea 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py @@ -51,6 +51,7 @@ def has_var(self, var_name): self._var_device_id(var_name) == self.worker_idx def _split_params(self, params_grads, worker_idx, worker_num): + """ param2device = {} total_param_mem = 0.0 param2mem = [] @@ -62,12 +63,29 @@ def _split_params(self, params_grads, worker_idx, worker_num): device_idx = 0 mem_accu = 0.0 for param_name, mem in param2mem: - if mem_accu > total_param_mem * 1.0 * (device_idx + 1) / worker_num: + if mem_accu > total_param_mem * (device_idx + 1) / worker_num: device_idx += 1 device2params[device_idx].append(param_name) param2device[param_name] = device_idx mem_accu += mem return param2device, device2params + """ + param2device = {} + device2params = {x: [] for x in range(worker_num)} + + sizes = [0] * worker_num + for param in [x[0] for x in params_grads]: + numel = get_var_size(param) + device_idx = sizes.index(min(sizes)) + device2params[device_idx].append(param.name) + param2device[param.name] = device_idx + sizes[device_idx] += numel + + for x in range(worker_num): + print("device id: %s, num: %s, mem: %s, names: %s" % ( + x, len(device2params[x]), sizes[x], device2params[x])) + + return param2device, device2params def _var_device_id(self, var_name): if var_name in self.global_param2device: diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 39f71be0cde76..605e94e94d9d6 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -754,7 +754,7 @@ def get_first_optimize_op_idx(block): return first_opt_op_idx -def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): +def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root,use_calc_stream=False): """ _add_broadcast_ops """ @@ -767,6 +767,7 @@ def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): attrs={ 'ring_id': ring_id, 'root': root_device, + 'use_calc_stream': use_calc_stream, OP_ROLE_KEY: op_role }) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index fcecc3a9a671e..7916218cbbc11 100755 --- 
a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -71,6 +71,9 @@ def __init__(self, optimizer): self._reduced_grads_to_param = {} self._shard = Shard() self._verbose = False + self._thread_mode = False + self._use_calc_stream = False + # use sharding as outer parallelism (e.g. inner:Megatron & outer sharding) self.mp_degree = 1 @@ -576,10 +579,12 @@ def _apply_optimize_offload_pass(self, params_grads): def _dump_program_for_debug(self): main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() - with open("start_sharding_%d" % self.role_maker._worker_index(), + startup_id = str(id(startup_block.program)) + with open(("start_sharding_%d_%s" % (self.role_maker._worker_index(), startup_id)), 'w') as f: f.writelines(str(startup_block.program)) - with open("main_sharding_%d" % self.role_maker._worker_index(), + main_id = str(id(main_block.program)) + with open(("main_sharding_%d_%s" % (self.role_maker._worker_index(), main_id)), 'w') as f: f.writelines(str(main_block.program)) @@ -819,7 +824,7 @@ def collect_segment(self, segment, op_idx, block): def _split_program(self, block): for op_idx, op in reversed(list(enumerate(block.ops))): - if int(op.attr('op_role')) != int(OpRole.Optimize): + if int(op.attr('op_role')) != int(OpRole.Optimize) and int(op.attr('op_role'))!= int(OpRole.ScaleLr): last_backward_op_idx = op_idx + 1 break @@ -829,6 +834,7 @@ def _split_program(self, block): for op_idx in reversed(range(last_backward_op_idx)): op = block.ops[op_idx] assert (int(op.attr('op_role')) != int(OpRole.Optimize)) + assert (int(op.attr('op_role')) != int(OpRole.ScaleLr)) if self._sharding_segment_strategy == "segment_broadcast_MB": if segment._param_mem >= self._broadcast_MB: segment = self.collect_segment(segment, op_idx, block) @@ -874,7 +880,8 @@ def _split_program(self, block): else: broadcast_var_name = unique_name.generate(input_name + "@BroadCast") - segment._fill_constant_vars.append(broadcast_var_name) + if not self._thread_mode: + segment._fill_constant_vars.append(broadcast_var_name) # (JZ-LIANG) should use Param base name ? 
broadcast_var_base_name = input_name @@ -1094,24 +1101,26 @@ def _add_broadcast_allreduce(self, block): if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, shard_allredue_vars) insert_allreduce_ops( block, self._segments[-1]._end_idx, self.dp_ring_id, shard_allredue_vars, - user_defined_strategy=self.user_defined_strategy) + user_defined_strategy=self.user_defined_strategy, + use_calc_stream=self._use_calc_stream) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: self.create_persistable_gradients_and_insert_merge_ops( block, self._startup_program.global_block(), self._segments[-1]._end_idx, shard_allredue_vars, self._shard) - - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.sharding_ring_id, - self._segments[-1]._allreduce_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.sharding_ring_id, + self._segments[-1]._allreduce_vars) # allreduce --> reduce insert_reduce_ops(block, self._segments[-1]._end_idx, @@ -1119,7 +1128,8 @@ def _add_broadcast_allreduce(self, block): self._segments[-1]._allreduce_vars, self._shard, op_role=OpRole.Backward, - use_calc_stream=False) + use_calc_stream=self._use_calc_stream, + ) for idx, segment in reversed(list(enumerate(self._segments))): allreduce_vars = self._segments[ @@ -1162,11 +1172,12 @@ def _add_broadcast_allreduce(self, block): if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, segment._end_idx, - self.dp_ring_id, shard_allredue_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._end_idx, + self.dp_ring_id, shard_allredue_vars) broad_cast_vars = [x[0] for x in broadcast_vars] - if len(broad_cast_vars) > 0: + if not self._use_calc_stream and len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) @@ -1174,14 +1185,14 @@ def _add_broadcast_allreduce(self, block): comm_dep_vars = allreduce_vars + [ x[0] for x in broadcast_vars ] - if len(comm_dep_vars) > 0: + if not self._use_calc_stream and len(comm_dep_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, comm_dep_vars) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: broad_cast_vars = [x[0] for x in broadcast_vars] - if len(broad_cast_vars) > 0: + if not self._use_calc_stream and len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) @@ -1189,7 +1200,7 @@ def _add_broadcast_allreduce(self, block): k for k, v in cast_ops.items() ] + self._segments[idx]._allreduce_vars - if len(calc_dep_vars) > 0: + if not self._use_calc_stream and len(calc_dep_vars) > 0: insert_sync_calc_op(block, segment._end_idx, [calc_dep_vars[-1]]) @@ -1208,7 +1219,7 @@ def _add_broadcast_allreduce(self, block): segment._start_idx, shard_allredue_vars, self._shard) insert_broadcast_ops(block, segment._start_idx, - self.sharding_ring_id, broadcast_vars) + self.sharding_ring_id, broadcast_vars, 
self._use_calc_stream) # step6: add all_reduce ops # dp @@ -1220,13 +1231,17 @@ def _add_broadcast_allreduce(self, block): segment._start_idx, self.dp_ring_id, shard_allredue_vars, - user_defined_strategy=self.user_defined_strategy) - insert_sync_comm_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + user_defined_strategy=self.user_defined_strategy, + use_calc_stream=self._use_calc_stream, + ) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) # gradient merge elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: - insert_sync_comm_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) # sharding # allreduce --> reduce # TODO temp change @@ -1237,17 +1252,19 @@ def _add_broadcast_allreduce(self, block): allreduce_vars, self._shard, op_role=OpRole.Backward, - use_calc_stream=False) + use_calc_stream=self._use_calc_stream) block._sync_with_cpp() if self._segments[0]._broadcast_vars: broadcast_vars = [x[0] for x in self._segments[0]._broadcast_vars] - insert_sync_comm_ops(block, self._segments[0]._start_idx, - self.sharding_ring_id, broadcast_vars) + if not self._use_calc_stream: + insert_sync_comm_ops(block, self._segments[0]._start_idx, + self.sharding_ring_id, broadcast_vars) insert_broadcast_ops(block, self._segments[0]._start_idx, self.sharding_ring_id, - self._segments[0]._broadcast_vars) + self._segments[0]._broadcast_vars, + self._use_calc_stream) fill_constant_vars = [] for x in self._segments[:2]: @@ -1260,7 +1277,7 @@ def _add_broadcast_allreduce(self, block): cast_ops[k] = v calc_deps_vars = fill_constant_vars + [k for k, v in cast_ops.items()] - if fill_constant_vars or cast_ops: + if not self._use_calc_stream and (fill_constant_vars or cast_ops): insert_sync_calc_op(block, self._segments[0]._start_idx, [calc_deps_vars[-1]]) @@ -1308,7 +1325,10 @@ def _build_groups(self): self.global_word_size = self.role_maker._worker_num() self.global_rank = self.role_maker._worker_index() self.global_endpoints = self.role_maker._get_trainer_endpoints() - self.current_endpoint = self.global_endpoints[self.global_rank] + if self._thread_mode: + self.current_endpoint = self.global_endpoints[self.role_maker._role_id()] + else: + self.current_endpoint = self.global_endpoints[self.global_rank] self._collective_helper = CollectiveHelper(self.role_maker, nrings=self._nrings_sharding) assert self.global_word_size % self.mp_degree == 0, \ @@ -1844,3 +1864,190 @@ def _sharding_gradient_merge(self): 'sub_block': cond_block, 'is_scalar_condition': True, }) +class ThreadShardingOptimizer(ShardingOptimizer): + """Thread-mode sharding optimizer.""" + def __init__(self, optimizer): + super().__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [ + "ParameterServerOptimizer", + "RecomputeOptimizer", + "AMPOptimizer", + "LarsOptimizer", + "LambOptimizer", + "ASPOptimizer", + # "ModelParallelOptimizer", + # "PipelineOptimizer", + ] + self._thread_mode = True + self._use_calc_stream = False + op_maker = core.op_proto_and_checker_maker + self.op_role_key = op_maker.kOpRoleAttrName() + + def _prune_main_program(self, block, shard, rings): + """ + rename @BroadCast vars back to their base param names + """ + var_names = set([]) + for idx, op in enumerate(block.ops): + for input_name in op.desc.input_arg_names(): + pos = input_name.find("@BroadCast") + if pos <= 0: + 
continue + new_name = input_name[0 : pos] + op.desc._rename_input( + input_name, new_name + ) + var_names.add(input_name) + for output_name in op.desc.output_arg_names(): + pos = output_name.find("@BroadCast") + if pos <= 0: + continue + new_name = output_name[0 : pos] + op.desc._rename_output( + output_name, new_name + ) + var_names.add(output_name) + + for var_name in var_names: + block._remove_var(var_name, sync=False) + + print("remove broadcast param count=", len(var_names)) + block._sync_with_cpp() + + def _prune_startup_program(self, block, shard): + """ + no pruning is needed in thread mode + """ + block._sync_with_cpp() + + def _insert_loss_grad_scale_op(self): + """ + PaddleBox gradients do not need to be scaled by the dp degree + """ + main_block = self._main_program.global_block() + # # step6: loss div dp_degree + # global_dp_degree = self.sharding_degree * self.dp_degree + # assert int(global_dp_degree) == global_dp_degree + # if global_dp_degree > 1: + # insert_scale_loss_grad_ops(main_block, scale=global_dp_degree) + main_block._sync_with_cpp() + + def minimize_impl( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): + """ + reset the startup program and main program after sharding + """ + sharding_configs = self.user_defined_strategy.sharding_configs + if "use_calc_stream" in sharding_configs: + self._use_calc_stream = sharding_configs["use_calc_stream"] + optimize_ops, params_grads = super().minimize_impl( + loss, startup_program, parameter_list, no_grad_set) + # main_block = self._main_program.global_block() + # startup_block = self._startup_program.global_block() + loss.block.program = self._main_program + from paddle import fluid + fluid.framework.switch_startup_program(self._startup_program) + return optimize_ops, params_grads + + def _init_comm(self): + # sync var + self.role_id = self.role_maker._role_id() + self.node_nums = self.role_maker._node_num() + startup_block = self._startup_program.global_block() + if self.node_nums > 1: + node_nums = len(self.global_endpoints) + assert ( + self.node_nums == node_nums + ), "number of endpoints does not equal node num" + self.current_endpoint = self.global_endpoints[self.role_id] + + # mp ring + if self.mp_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.mp_group_endpoints, + self.role_id, + self.mp_ring_id, + ) + + # sharding ring + if self.sharding_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.sharding_group_endpoints, + self.role_id, + self.sharding_ring_id, + ) + + # pure dp ring + if self.dp_degree > 1: + self._init_communicator( + self._startup_program, + self.current_endpoint, + self.dp_group_endpoints, + self.role_id, + self.dp_ring_id, + ) + + startup_block._sync_with_cpp() + + def _wait(self): + if self.node_nums <= 1: + return + endpoints = self.global_endpoints[:] + current_endpoint = endpoints[self.role_id] + if self.global_rank == 0: + from paddle.fluid.transpiler.details import wait_server_ready + endpoints.remove(current_endpoint) + wait_server_ready(endpoints) + + def _init_communicator( + self, + program, + current_endpoint, + endpoints, + role_id, + ring_id + ): + block = program.global_block() + # init multi-node nccl + if self.node_nums > 1: + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': role_id, + 
'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + self.op_role_key: OpRole.Forward, + }, + ) + block.append_op( + type='c_comm_init_multitrainer', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'ntrainers': self.node_nums, + 'trainer_id': role_id, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward, + }, + ) + else: + block.append_op( + type='c_comm_init_all', + attrs={'ring_id': ring_id} + ) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 1ff8a579de4cf..a0b4e6474fd0b 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -71,6 +71,7 @@ 'fused_seqpool_concat', 'fused_concat', 'rank_attention2', + 'fused_seq_tensor', ] @@ -1601,7 +1602,7 @@ def rank_attention2(input, return output -def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batchcount=0): +def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batchcount=0, transpose_weight=False): """ **Batch FC layer** This Op can calculate BatchFC. This is similar to matmul op, @@ -1666,7 +1667,10 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None, batc "W": w, "Bias": b }, - attrs={'batchcount': batchcount}, + attrs={ + 'batchcount': batchcount, + 'transpose_weight': transpose_weight + }, outputs={"Out": pre_act}) return helper.append_activation(pre_act) @@ -1760,6 +1764,7 @@ def fused_seqpool_cvm(input, embed_thres_size=0, embedx_concate_size=1, embedx_concate_filter=False, + fill_zero=True, fix_ctr_to_click=False): """ **Notes: The Op only receives List of LoDTensor as input, only support SUM pooling now. @@ -1820,6 +1825,7 @@ def fused_seqpool_cvm(input, "embed_thres_size": embed_thres_size, "embedx_concate_size": embedx_concate_size, "embedx_concate_filter": embedx_concate_filter, + "fill_zero": fill_zero, "fix_ctr_to_click": fix_ctr_to_click }) @@ -2827,3 +2833,66 @@ def fused_concat(input, start_index=0, length=-1, axis=1): "length": length}) return out +def fused_seq_tensor(input, + batch_count, + max_length, + slot_num, + ad_slot_num, + fea_emb_dim, + ad_slot_offset): + """ + **fused seq tensor** + Notice: It currently only supports GPU devices. + + Args: + input: [input, ad_input], input tensor list with data type float32. + batch_count: parallel num. + max_length: max sequence length. + slot_num: total slot num, the sum of ad_slot_num and the side-info slot num. + ad_slot_num: ad slot num. + fea_emb_dim: embedding dim. + ad_slot_offset: ad slot offset. 
+ + Returns: + Variable: + din_out, mask_out, side_info_out, ad_slot_session_out + """ + + helper = LayerHelper("fused_seq_tensor", **locals()) + + check_type(input, "input", list, 'fused_seq_tensor') + + dtype = helper.input_dtype() + check_dtype(dtype, 'input', ['float32', 'float64'], 'fused_seq_tensor') + + check_type(batch_count, 'batch_count', (int, Variable), 'fused_seq_tensor') + check_type(max_length, 'max_length', (int, Variable), 'fused_seq_tensor') + check_type(slot_num, 'slot_num', (int, Variable), 'fused_seq_tensor') + check_type(fea_emb_dim, 'fea_emb_dim', (int, Variable), 'fused_seq_tensor') + + din_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + mask_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + side_info_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + ad_slot_session_out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + + helper.append_op( + type="fused_seq_tensor", + inputs={"Input": input[0], + "ADInput": input[1] + }, + attrs={ + 'batch_count': batch_count, + 'max_length': max_length, + 'slot_num': slot_num, + 'fea_emb_dim': fea_emb_dim, + 'ad_slot_num': ad_slot_num, + 'ad_slot_offset': ad_slot_offset + }, + outputs={ + "DINOut": din_out, + "MaskOut": mask_out, + "SideInfoOut": side_info_out, + "ADSlotSessionOut": ad_slot_session_out + }) + + return din_out, mask_out, side_info_out, ad_slot_session_out diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2acf005487bef..63eb3914e4bbd 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -401,7 +401,7 @@ def _is_enable_standalone_executor(): from ..distributed.fleet import fleet # use standalone_executor by default if not distributed if fleet._role_maker is None and framework._enable_standalone_executor_ is None: - framework._enable_standalone_executor_ = 1 + framework._enable_standalone_executor_ = 0 if framework._enable_standalone_executor_ in [1, '1', True, 'True', 'true']: flag = True diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 122c70f466722..7354d49a975e6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -290,7 +290,8 @@ def save_vars(executor, main_program=None, vars=None, predicate=None, - filename=None): + filename=None, + filter_func=None): """ :api_attr: Static Graph @@ -374,7 +375,8 @@ def name_has_fc(var): main_program=main_program, dirname=dirname, vars=list(filter(predicate, main_program.list_vars())), - filename=filename) + filename=filename, + filter_func=filter_func) else: params_var_name = "saved_params" # give warning when there is no var in model @@ -389,6 +391,8 @@ def name_has_fc(var): save_var_map = {} for each_var in vars: + if filter_func is not None and filter_func(each_var.name): + continue # NOTE: don't save the variable which type is RAW if each_var.type == core.VarDesc.VarType.RAW: continue @@ -668,7 +672,11 @@ def is_valid(var): @dygraph_not_support -def save_persistables(executor, dirname, main_program=None, filename=None): +def save_persistables(executor, + dirname, + main_program=None, + filename=None, + filter_func=None): """ :api_attr: Static Graph @@ -737,7 +745,8 @@ def save_persistables(executor, dirname, main_program=None, filename=None): main_program=main_program, vars=None, predicate=is_persistable, - filename=filename) + filename=filename, + filter_func=filter_func) def load_vars(executor, @@ -1245,7 +1254,8 @@ def save_inference_model(dirname, 
params_filename=None, export_for_deployment=True, program_only=False, - clip_extra=False): + clip_extra=False, + filter_func=None): """ :api_attr: Static Graph @@ -1454,7 +1464,7 @@ def save_inference_model(dirname, if params_filename is not None: params_filename = os.path.basename(params_filename) - save_persistables(executor, save_dirname, main_program, params_filename) + save_persistables(executor, save_dirname, main_program, params_filename, filter_func) return target_var_name_list diff --git a/python/setup.py.in b/python/setup.py.in index 6ef620f5f0784..1b423bcca695f 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -557,10 +557,14 @@ if '${WITH_XPU}' == 'ON': shutil.copy('${XPU_RT_LIB}', libs_path) shutil.copy('${XPU_API_PLUGIN}', libs_path) shutil.copy('${XPU_RT_ALIAS_LIB}', libs_path) + shutil.copy('${XPU_ML_LIB}', libs_path) + shutil.copy('${XPU_ML_ALIAS_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}', '${XPU_RT_ALIAS_LIB_NAME}', - '${XPU_API_PLUGIN_NAME}'] + '${XPU_API_PLUGIN_NAME}', + '${XPU_ML_LIB_NAME}', + '${XPU_ML_ALIAS_LIB_NAME}'] if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path)
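
Note on the rewritten _split_params in sharding/shard.py: the old prefix-sum split over accumulated parameter memory is replaced by a greedy assignment that always places the next parameter on the worker currently holding the least total size. The standalone sketch below mirrors that strategy; the function name, the (name, size) input format, and the demo values are illustrative and not Paddle APIs.

# Greedy "least-loaded worker" split, mirroring the new _split_params logic.
# `params` is a list of (name, size_in_bytes) pairs; purely illustrative.
def split_params_greedy(params, worker_num):
    param2device = {}
    device2params = {i: [] for i in range(worker_num)}
    sizes = [0] * worker_num  # bytes currently assigned to each worker
    for name, size in params:
        device_idx = sizes.index(min(sizes))  # pick the least-loaded worker
        device2params[device_idx].append(name)
        param2device[name] = device_idx
        sizes[device_idx] += size
    return param2device, device2params

demo = [("emb.w_0", 65536), ("fc_0.w_0", 4096), ("fc_0.b_0", 64), ("fc_1.w_0", 8192)]
print(split_params_greedy(demo, worker_num=2))

Compared with the accumulated-memory split, this keeps the per-worker totals close to balanced even when a few very large parameters dominate.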
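
Note on the use_calc_stream guards added in _add_broadcast_allreduce: in Python, `and` binds tighter than `or`, so the guard on the sync-calc condition needs parentheses around `fill_constant_vars or cast_ops` (as written above) to skip the sync op consistently with the other guards. A short self-contained check of the two bindings, using placeholder values:

# Demonstrates why the guard needs parentheses: `and` binds tighter than `or`.
use_calc_stream = True
fill_constant_vars = []                 # empty list
cast_ops = {"x@CAST": "cast_op"}        # non-empty dict

without_parens = not use_calc_stream and fill_constant_vars or cast_ops
with_parens = not use_calc_stream and (fill_constant_vars or cast_ops)

print(bool(without_parens))  # True: a sync op would still be inserted
print(bool(with_parens))     # False: skipped, matching the other guards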
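
Note on the filter_func hook threaded through save_inference_model -> save_persistables -> save_vars: it receives a variable name and returns True for variables that should not be saved. A hypothetical usage sketch under the fluid static-graph API of this branch; the network, checkpoint directory, and predicate are made up for illustration.

import paddle
import paddle.fluid as fluid

paddle.enable_static()

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name="x", shape=[None, 8], dtype="float32")
    y = fluid.layers.fc(input=x, size=4)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)

def skip_optimizer_stats(var_name):
    # Return True to skip saving this variable (e.g. optimizer moment buffers).
    return "moment" in var_name or "beta" in var_name

fluid.io.save_persistables(exe, "./sharding_ckpt",
                           main_program=main_prog,
                           filter_func=skip_optimizer_stats)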