Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
joey12300 committed Mar 24, 2022
2 parents 6302677 + 8df9176 commit bef8cbe
Show file tree
Hide file tree
Showing 374 changed files with 16,375 additions and 9,205 deletions.
6 changes: 3 additions & 3 deletions cmake/external/ascend.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ endif()
if (WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
Expand Down
6 changes: 6 additions & 0 deletions cmake/inference_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,12 @@ copy(inference_lib_dist
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
Expand Down
118 changes: 58 additions & 60 deletions cmake/phi.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,6 @@ function(kernel_library TARGET)
set(xpu_srcs)
set(gpudnn_srcs)
set(kps_srcs)
set(selected_rows_srcs)
# parse and save the dependent kernel targets
set(all_srcs)
set(kernel_deps)
Expand All @@ -111,6 +110,12 @@ function(kernel_library TARGET)

cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})

# used for cc_library selected_rows dir target
set(target_suffix "")
if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel")
set(target_suffix "_sr")
endif()

list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
# each kernel matches at most one implementation file per backend
Expand All @@ -121,9 +126,6 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
list(APPEND selected_rows_srcs ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
endif()
if (WITH_GPU OR WITH_ROCM)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
Expand Down Expand Up @@ -169,23 +171,41 @@ function(kernel_library TARGET)
list(APPEND all_srcs ${xpu_srcs})
list(APPEND all_srcs ${gpudnn_srcs})
list(APPEND all_srcs ${kps_srcs})

set(all_include_kernels)
set(all_kernel_name)

foreach(src ${all_srcs})
file(READ ${src} target_content)
# "kernels/xxx"(DenseTensor Kernel) can only include each other, but can't include "SUB_DIR/xxx" (such as selected_rows Kernel)
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
else()
list(APPEND all_include_kernels ${include_kernels})

# "SUB_DIR/xxx" can include "kernels/xx" and "SUB_DIR/xxx"
if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_dense_kernels ${target_content})
list(APPEND all_include_kernels ${include_kernels})
endif()
foreach(include_kernel ${include_kernels})

foreach(include_kernel ${all_include_kernels})
if ("${kernel_library_SUB_DIR}" STREQUAL "")
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
else()
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
# NOTE(dev): match kernel_library_SUB_DIR first, before falling back to the plain kernels/ path.
if (${include_kernel} MATCHES "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/")
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
# for selected_rows directory, add ${target_suffix}.
string(REGEX REPLACE ".h\"" "${target_suffix}" kernel_name ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
else()
string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND all_kernel_name ${kernel_name})
endif()
endif()
string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
list(APPEND kernel_deps ${kernel_name})
list(APPEND kernel_deps ${all_kernel_name})
endforeach()

if (NOT "${kernel_library_SUB_DIR}" STREQUAL "")
Expand All @@ -199,100 +219,81 @@ function(kernel_library TARGET)

endforeach()
list(REMOVE_DUPLICATES kernel_deps)
list(REMOVE_ITEM kernel_deps ${TARGET})
list(REMOVE_ITEM kernel_deps ${TARGET}${target_suffix})

list(LENGTH common_srcs common_srcs_len)
list(LENGTH cpu_srcs cpu_srcs_len)
list(LENGTH gpu_srcs gpu_srcs_len)
list(LENGTH xpu_srcs xpu_srcs_len)
list(LENGTH gpudnn_srcs gpudnn_srcs_len)
list(LENGTH kps_srcs kps_srcs_len)
list(LENGTH selected_rows_srcs selected_rows_srcs_len)

# kernel source file level
# level 1: base device kernel
# - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
# level 2: device-independent kernel
# - common_srcs
# level 3: Kernel implemented by reusing device-independent kernel
# - selected_rows_srcs
set(base_device_kernels)
set(device_independent_kernel)
set(high_level_kernels)

# 1. Base device kernel compile
if (${cpu_srcs_len} GREATER 0)
cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_cpu)
cc_library(${TARGET}_cpu${target_suffix} SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_cpu${target_suffix})
endif()
if (${gpu_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_gpu${target_suffix} SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
list(APPEND base_device_kernels ${TARGET}_gpu)
list(APPEND base_device_kernels ${TARGET}_gpu${target_suffix})
endif()
if (${xpu_srcs_len} GREATER 0)
cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_xpu)
cc_library(${TARGET}_xpu${target_suffix} SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_xpu${target_suffix})
endif()
if (${gpudnn_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
nv_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
elseif (WITH_ROCM)
hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
hip_library(${TARGET}_gpudnn${target_suffix} SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
endif()
list(APPEND base_device_kernels ${TARGET}_gpudnn)
list(APPEND base_device_kernels ${TARGET}_gpudnn${target_suffix})
endif()
if (${kps_srcs_len} GREATER 0)
# only when WITH_XPU_KP, the kps_srcs_len can be > 0
xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_kps)
xpu_library(${TARGET}_kps${target_suffix} SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
list(APPEND base_device_kernels ${TARGET}_kps${target_suffix})
endif()

# 2. Device-independent kernel compile
if (${common_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
nv_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_ROCM)
hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
hip_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
xpu_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
else()
cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
cc_library(${TARGET}_common${target_suffix} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
endif()
list(APPEND device_independent_kernel ${TARGET}_common)
list(APPEND device_independent_kernel ${TARGET}_common${target_suffix})
endif()

# 3. Reusing kernel compile
if (${selected_rows_srcs_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_ROCM)
hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_XPU_KP)
xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
else()
cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
endif()
list(APPEND high_level_kernels ${TARGET}_sr)
endif()

# 4. Unify target compile
# 3. Unify target compile
list(LENGTH base_device_kernels base_device_kernels_len)
list(LENGTH device_independent_kernel device_independent_kernel_len)
list(LENGTH high_level_kernels high_level_kernels_len)
if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
${high_level_kernels_len} GREATER 0)
if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0)
if (WITH_GPU)
nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
nv_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_ROCM)
hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
hip_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
elseif (WITH_XPU_KP)
xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
xpu_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
else()
cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
cc_library(${TARGET}${target_suffix} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
endif()
else()
set(target_build_flag 0)
Expand All @@ -301,10 +302,10 @@ function(kernel_library TARGET)
if (${target_build_flag} EQUAL 1)
if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
${gpudnn_srcs_len} GREATER 0)
# append target into PHI_KERNELS property
get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
set(phi_kernels ${phi_kernels} ${TARGET})
set(phi_kernels ${phi_kernels} ${TARGET}${target_suffix})
set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels})
endif()

Expand All @@ -329,9 +330,6 @@ function(kernel_library TARGET)
if (${kps_srcs_len} GREATER 0)
kernel_declare(${kps_srcs})
endif()
if (${selected_rows_srcs_len} GREATER 0)
kernel_declare(${selected_rows_srcs})
endif()
endif()
endfunction()

Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/distributed/ps.proto
Original file line number Diff line number Diff line change
Expand Up @@ -219,13 +219,13 @@ message GraphParameter {
optional string gpups_graph_sample_class = 3
[ default = "CompleteGraphSampler" ];
optional string gpups_graph_sample_args = 4 [ default = "" ];
optional bool use_cache = 5 [ default = true ];
optional float cache_ratio = 6 [ default = 0.3 ];
optional bool use_cache = 5 [ default = false ];
optional int32 cache_size_limit = 6 [ default = 100000 ];
optional int32 cache_ttl = 7 [ default = 5 ];
optional GraphFeature graph_feature = 8;
optional string table_name = 9 [ default = "" ];
optional string table_type = 10 [ default = "" ];
optional int32 gpups_mode_shard_num = 11 [ default = 127 ];
optional int32 shard_num = 11 [ default = 127 ];
optional int32 gpu_num = 12 [ default = 1 ];
}

Expand Down
33 changes: 15 additions & 18 deletions paddle/fluid/distributed/ps/service/brpc_ps_client.cc
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ int32_t BrpcPsClient::initialize() {
std::thread(std::bind(&BrpcPsClient::push_dense_task_consume, this));
// for debug
// _print_thread =
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));

return 0;
}
Expand Down Expand Up @@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
CostTimer parse_timer("pserver_client_push_sparse_parse");
int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
// << push_sparse_async_num << ", max_task_limit:" <<
// FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
// task_num:"
// << push_sparse_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
// push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
}
auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");
Expand Down Expand Up @@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
::ThreadPool async_push_sparse_shard_threads(
FLAGS_pserver_sparse_merge_thread);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
// Process the push tasks of every sparse table
for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
auto table_id = push_sparse_task_itr.first;
Expand Down Expand Up @@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
std::vector<std::future<int>>().swap(merge_status);
}
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
Expand Down Expand Up @@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
std::make_shared<CostTimer>("pserver_client_push_dense_parse");
int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
<< push_dense_async_num
<< ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_dense Waiting for async_call_num comsume,
// task_num:"
// << push_dense_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
}
Expand Down Expand Up @@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
::ThreadPool async_merge_dense_threads(10);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
for (auto &task_queue_itr : _push_dense_task_queue_map) {
auto &task_queue = task_queue_itr.second;
auto queue_size = task_queue->Size();
Expand Down Expand Up @@ -1791,9 +1789,8 @@ void BrpcPsClient::push_dense_task_consume() {
push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
closure);
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
Expand Down
Loading

0 comments on commit bef8cbe

Please sign in to comment.