support h800 #76

Merged
9 commits merged on Jan 18, 2024
56 changes: 39 additions & 17 deletions cmake/cuda.cmake
@@ -6,28 +6,34 @@ if(WITH_NV_JETSON)
add_definitions(-DWITH_NV_JETSON)
set(paddle_known_gpu_archs "53 62 72")
set(paddle_known_gpu_archs10 "53 62 72")
set(paddle_known_gpu_archs11 "53 62 72 87")
set(paddle_known_gpu_archs12 "53 62 72 87 90")
elseif(NEW_RELEASE_ALL)
message("Using New Release Strategy - All Arches Packge")
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Packge")
add_definitions(-DNEW_RELEASE_PYPI)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "")
set(paddle_known_gpu_archs11 "60 61 70 75 80")
set(paddle_known_gpu_archs11 "61 70 75 80")
set(paddle_known_gpu_archs12 "61 70 75 80 90")
elseif(NEW_RELEASE_JIT)
message("Using New Release Strategy - JIT Packge")
add_definitions(-DNEW_RELEASE_JIT)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 60 70 75")
set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
set(paddle_known_gpu_archs10 "50 60 70 75")
set(paddle_known_gpu_archs11 "50 60 70 75 80")
set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
else()
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs "70 80")
set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
set(paddle_known_gpu_archs12 "70 80")
endif()

######################################################################################
@@ -98,12 +104,12 @@ endfunction()
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names
"Kepler"
"Maxwell"
"Pascal"
"Volta"
"Turing"
"Ampere"
"Hopper"
"All"
"Manual")
set(archs_name_default "Auto")
@@ -142,9 +148,7 @@ function(select_nvcc_arch_flags out_variable)
unset(CUDA_ARCH_PTX CACHE)
endif()

if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
if(WITH_NV_JETSON)
set(cuda_arch_bin "53")
else()
@@ -165,11 +169,17 @@
elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
set(cuda_arch_bin "75")
elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
set(cuda_arch_bin "80")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
set(cuda_arch_bin "80 86")
if(WITH_NV_JETSON)
set(cuda_arch_bin "87")
else()
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
set(cuda_arch_bin "80")
else()
set(cuda_arch_bin "80 86")
endif()
endif()
elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
set(cuda_arch_bin "90")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -186,6 +196,13 @@
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()

# cuda11.4
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.6)
set(cuda_arch_bin "70 80")
else()
set(cuda_arch_bin "70 80 90")
endif()

if(NEW_RELEASE_JIT)
set(cuda_arch_ptx "${cuda_arch_ptx}${cuda_arch_bin}")
set(cuda_arch_bin "")
@@ -249,6 +266,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
set(paddle_known_gpu_archs "${paddle_known_gpu_archs12} 90")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
endif()

if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
9 changes: 7 additions & 2 deletions cmake/external/gloo.cmake
@@ -25,8 +25,13 @@ set(GLOO_LIBRARY_DIR
"${GLOO_INSTALL_DIR}/lib"
CACHE PATH "gloo library directory." FORCE)
# As we add extra features for gloo, we use the non-official repo
set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
set(GLOO_TAG v0.0.2)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
set(GLOO_TAG v0.0.2)
else()
set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
set(GLOO_TAG v0.0.3)
endif()
set(GLOO_LIBRARIES
"${GLOO_INSTALL_DIR}/lib/libgloo.a"
CACHE FILEPATH "gloo library." FORCE)
6 changes: 5 additions & 1 deletion cmake/external/warpctc.cmake
@@ -23,7 +23,11 @@ set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
# in case of low internet speed
#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0)
set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)
else()
set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184)
endif()

set(WARPCTC_INCLUDE_DIR
"${WARPCTC_INSTALL_DIR}/include"
24 changes: 24 additions & 0 deletions cmake/version.cmake
@@ -71,3 +71,27 @@ math(EXPR PADDLE_VERSION_INTEGER "${PADDLE_MAJOR_VER} * 1000000
add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
add_definitions(-DPADDLE_VERSION_INTEGER=${PADDLE_VERSION_INTEGER})
message(STATUS "Paddle version is ${PADDLE_VERSION}")

#add git version
set(COMMIT_HASH "")
set(BRANCH_NAME "")
find_package(Git QUIET)
if(GIT_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%H
OUTPUT_VARIABLE COMMIT_HASH
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
execute_process(
COMMAND ${GIT_EXECUTABLE} symbolic-ref --short -q HEAD
OUTPUT_VARIABLE BRANCH_NAME
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
endif()
message(STATUS "Git version is ${BRANCH_NAME}:${COMMIT_HASH}")
add_definitions(-DPADDLE_BRANCH_NAME="${BRANCH_NAME}")
add_definitions(-DPADDLE_COMMIT_HASH="${COMMIT_HASH}")
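
Note on the version.cmake hunk above: it records the current git branch and commit at configure time and injects them into every translation unit as PADDLE_BRANCH_NAME and PADDLE_COMMIT_HASH. A minimal sketch of how those macros could be consumed at runtime follows; the main() below is illustrative only (not part of this PR) and assumes the definitions arrive as quoted string literals, as the add_definitions lines produce.

```cpp
// Standalone sketch (not part of the PR): surfaces build provenance injected via
// -DPADDLE_BRANCH_NAME="..." and -DPADDLE_COMMIT_HASH="...". The fallbacks cover
// builds where git was not found at configure time and the macros are undefined.
#include <cstdio>

#ifndef PADDLE_BRANCH_NAME
#define PADDLE_BRANCH_NAME ""
#endif
#ifndef PADDLE_COMMIT_HASH
#define PADDLE_COMMIT_HASH ""
#endif

int main() {
  std::printf("built from %s:%s\n", PADDLE_BRANCH_NAME, PADDLE_COMMIT_HASH);
  return 0;
}
```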
10 changes: 6 additions & 4 deletions paddle/fluid/framework/boxps_worker.cc
@@ -968,10 +968,6 @@ void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) {
auto dim = root_tensor->dims();
param_sync_.share(gpu_tensor, len).Resize(dim);
skip_vars_.push_back(name);
// add copy back to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
}
}
}
// data norm copy and learning rate
@@ -985,6 +981,11 @@ void BoxPSWorker::CreateThreadScopeForNorm(const ProgramDesc& program) {
place_,
static_cast<Tensor*>(gpu_tensor));
++copy_persist_num;
// add copy back to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
skip_vars_.push_back(name);
}
}
} else {
auto* ptr = thread_scope_->Var(name);
@@ -1104,6 +1105,7 @@ void BoxPSWorker::CreateThreadScopeForSharding(const ProgramDesc& program) {
// device 0 need sync datanorm and learning rate to root scope
if (device_id_ == 0) {
need_copy_vars_.push_back(name);
skip_vars_.push_back(name);
}
}
} else {
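
The boxps_worker.cc hunks above narrow the copy-back bookkeeping: only variables that are actually copied into the per-device scope (data-norm stats and the learning rate) are registered in need_copy_vars_, and the same names are also pushed into skip_vars_ so they are not reclaimed or overwritten before the copy-back. A rough sketch of that pattern, with hypothetical names standing in for the worker's members:

```cpp
// Illustrative sketch only (hypothetical type, not the PR's code): the pattern
// the hunks converge on. Device 0 is the single writer back to the root scope,
// and only variables that were actually copied are scheduled for copy-back and
// protected from garbage collection.
#include <string>
#include <vector>

struct DeviceWorkerBookkeeping {
  int device_id = 0;
  std::vector<std::string> need_copy_vars;  // copied back to the root scope
  std::vector<std::string> skip_vars;       // never garbage-collected or reset

  void OnPersistableCopied(const std::string& name) {
    if (device_id == 0) {  // only device 0 syncs back to the root scope
      need_copy_vars.push_back(name);
      skip_vars.push_back(name);
    }
  }
};
```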
20 changes: 15 additions & 5 deletions paddle/fluid/framework/fleet/box_wrapper.h
@@ -55,7 +55,7 @@ DECLARE_int32(padbox_dataset_shuffle_thread_num);

namespace paddle {
namespace framework {
extern int make_day_id(const int &y, const int &m, const int &d);
extern int make_day_id(const int& y, const int& m, const int& d);
#ifdef PADDLE_WITH_BOX_PS
#define MAX_GPU_NUM 16

@@ -322,6 +322,11 @@ class MetricMsg {
platform::errors::NotFound("Error: var %s is not found in scope.",
varname.c_str()));
auto& gpu_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
gpu_tensor.IsInitialized(),
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
*data = gpu_tensor.data<T>();
*len = gpu_tensor.numel();
}
@@ -335,6 +340,11 @@ class MetricMsg {
platform::errors::NotFound("Error: var %s is not found in scope.",
varname.c_str()));
auto& gpu_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
gpu_tensor.IsInitialized(),
true,
platform::errors::InvalidArgument(
"Error: monitor var `%s` uninitialized Tensor.", varname.c_str()));
auto* gpu_data = gpu_tensor.data<T>();
auto len = gpu_tensor.numel();
data->resize(len);
@@ -424,7 +434,7 @@ class BoxWrapper {
}
int GetMpiSize() { return boxps::MPICluster::Ins().size(); }
int GetMpiRank() { return boxps::MPICluster::Ins().rank(); }
int GetNCCLRankId(const int &device_id) {
int GetNCCLRankId(const int& device_id) {
return (GetMpiRank() * gpu_num_ + device_id);
}
int GetGpuNum() { return gpu_num_; }
@@ -832,7 +842,7 @@ class BoxWrapper {
for (auto& name : var_names) {
auto it = std::find(skip_gc_vars_.begin(), skip_gc_vars_.end(), name);
if (it != skip_gc_vars_.end()) {
return;
continue;
}
skip_gc_vars_.push_back(name);
}
@@ -1026,8 +1036,8 @@ class BoxHelper {

void SetDate(int year, int month, int day) {
day_id_ = make_day_id(year, month, day);
VLOG(0) << "BoxHelpler set year=" << year << ", month="
<< month << ", day=" << day << ", day id=" << day_id_;
VLOG(0) << "BoxHelpler set year=" << year << ", month=" << month
<< ", day=" << day << ", day id=" << day_id_;
}
void BeginPass() {
#ifdef PADDLE_WITH_BOX_PS
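
Two behavioral fixes sit in the box_wrapper.h diff: the monitor getters now enforce that the fetched tensor is initialized, and the skip-GC registration loop uses continue instead of return, so one already-registered name no longer aborts registration of the remaining names. A small standalone sketch of the corrected loop (the function name here is illustrative):

```cpp
// Sketch of the de-duplication fix: with `return`, hitting one already-registered
// name aborted the whole loop and silently dropped every following name; with
// `continue`, only the duplicate is skipped.
#include <algorithm>
#include <string>
#include <vector>

void AppendUnique(const std::vector<std::string>& var_names,
                  std::vector<std::string>* skip_gc_vars) {
  for (const auto& name : var_names) {
    auto it = std::find(skip_gc_vars->begin(), skip_gc_vars->end(), name);
    if (it != skip_gc_vars->end()) {
      continue;  // previously `return`, which lost the rest of var_names
    }
    skip_gc_vars->push_back(name);
  }
}
```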
25 changes: 16 additions & 9 deletions paddle/fluid/framework/operator.cc
@@ -60,6 +60,9 @@ DECLARE_bool(check_nan_inf);
DECLARE_bool(enable_unused_var_check);
DECLARE_bool(run_kp_kernel);
DECLARE_bool(enable_host_event_recorder_hook);
PADDLE_DEFINE_EXPORTED_bool(enable_check_input_var,
false,
"enable check input var");

namespace paddle {
namespace framework {
@@ -1773,7 +1776,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
os << "\n";
printf("%s", os.str().c_str());
}
PADDLE_ENFORCE(false, "ERROR: check INF and NAN: %s",
PADDLE_ENFORCE(false,
"ERROR: check INF and NAN: %s",
DebugStringEx(&exec_scope).c_str());
}
#else
Expand Down Expand Up @@ -1938,7 +1942,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
} else if (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) {
} else if (!paddle::platform::is_xpu_support_op(type_,
expected_kernel_key)) {
VLOG(3) << "fluid XPU not support kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
@@ -2419,13 +2424,15 @@ void OperatorWithKernel::ParseInputDataType(
}
}
if (t != nullptr) {
// PADDLE_ENFORCE_EQ(
// t->IsInitialized(),
// true,
// platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
// "contains uninitialized Tensor.",
// Type(),
// name));
if (FLAGS_enable_check_input_var) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(),
name));
}
*data_type = paddle::framework::TransToProtoVarType(t->dtype());
}
}
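
The operator.cc change re-enables the previously commented-out uninitialized-input check, but behind a new exported flag, enable_check_input_var, which defaults to false. If the usual Paddle flag machinery applies, it can be toggled at runtime (for example via the FLAGS_enable_check_input_var environment variable), though that is an assumption rather than something this diff shows. A self-contained sketch of the opt-in pattern, with a plain bool standing in for the exported flag:

```cpp
// Minimal sketch of the opt-in check added above. A plain bool stands in for the
// exported flag so the snippet is self-contained; enabling it turns an
// uninitialized input tensor from a silently tolerated state into a hard error.
#include <stdexcept>
#include <string>

bool FLAGS_enable_check_input_var = false;  // stand-in for the exported flag

struct Tensor {
  bool initialized = false;
  bool IsInitialized() const { return initialized; }
};

void CheckInput(const Tensor& t, const std::string& op, const std::string& name) {
  if (FLAGS_enable_check_input_var && !t.IsInitialized()) {
    throw std::invalid_argument("The " + op + " Op's Input Variable `" + name +
                                "` contains uninitialized Tensor.");
  }
}
```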
56 changes: 56 additions & 0 deletions paddle/fluid/operators/batch_fc_op.cc
@@ -44,6 +44,61 @@ class BatchFCOp : public framework::OperatorWithKernel {
auto w_dims = ctx->GetInputDim("W");

int batchcount = ctx->Attrs().Get<int>("batchcount");
int transpose_weight = ctx->Attrs().Get<bool>("transpose_weight");

if (transpose_weight) {
// Input_dim: [batch_count, ?, in_dim]
// W_dim: [in_dim, batch_count * out_dim]
// Bias_dim: [1, batch_count * out_dim]
// Out_dim: [batch_count, ?, out_dim]
PADDLE_ENFORCE_GT(
batchcount,
0,
platform::errors::PreconditionNotMet(
"with transpose weight, batchcount should > 0"));
PADDLE_ENFORCE_EQ(
w_dims.size(),
2,
platform::errors::InvalidArgument(
"W of BatchFCOp should have 2D."));

int out_dim = w_dims[1] / batchcount;
PADDLE_ENFORCE_EQ(
input_dims.size(),
3,
platform::errors::InvalidArgument(
"Input of BatchFCOp should have 3D."));
PADDLE_ENFORCE_EQ(
input_dims[2],
w_dims[0],
platform::errors::InvalidArgument(
"Input.dim[2] and w_dims[0] of BatchFCOp should be same."));
PADDLE_ENFORCE_EQ(
input_dims[0],
batchcount,
platform::errors::InvalidArgument(
"Input.dim[0] and batchcount of BatchFCOp should be same."));
PADDLE_ENFORCE_EQ(
input_dims[2],
w_dims[0],
platform::errors::InvalidArgument(
"Input.dim[2] and W.dim[1] of BatchFCOp should be same."));

auto bias_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(
bias_dims.size(),
2,
platform::errors::InvalidArgument("Bias of BatchFCOp should have 2D."));
PADDLE_ENFORCE_EQ(
bias_dims[1],
w_dims[1],
platform::errors::InvalidArgument(
"Bias.dim[1] should be same as input.dim[2]."));

ctx->SetOutputDim("Out", {input_dims[0], input_dims[1], out_dim});
ctx->ShareLoD("Input", /*->*/ "Out");
return;
}
if (batchcount > 0) {
int feature_dim = input_dims[1] / batchcount;
PADDLE_ENFORCE_EQ(feature_dim, w_dims[0],
@@ -139,6 +194,7 @@ class BatchFCOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Bias", "(Tensor) Input tensor of batch_fc_op operator.");
AddOutput("Out", "Output tensor of batch_fc_op operator.");
AddAttr<int>("batchcount", "(int64_t) the batchcount").SetDefault(0);
AddAttr<bool>("transpose_weight", "(bool) the transpose_weight").SetDefault(false);
AddComment(R"DOC(
BatchFC Operator.
Notice: It currently supports GPU device.
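
The batch_fc_op.cc addition introduces a transpose_weight attribute (default false) with its own shape inference: Input is [batchcount, seq, in_dim], W is [in_dim, batchcount * out_dim], Bias is [1, batchcount * out_dim], and Out becomes [batchcount, seq, out_dim] with out_dim = W.dim[1] / batchcount. A standalone sketch of that inference (illustrative only, not the operator code):

```cpp
// Shape inference for the transpose_weight path, written as a plain function so
// the constraints from the hunk above are easy to check in isolation.
#include <array>
#include <cassert>
#include <cstdint>

std::array<int64_t, 3> InferBatchFCOutDims(const std::array<int64_t, 3>& input,
                                           const std::array<int64_t, 2>& w,
                                           const std::array<int64_t, 2>& bias,
                                           int64_t batchcount) {
  assert(batchcount > 0);
  assert(input[0] == batchcount);  // Input.dim[0] must equal batchcount
  assert(input[2] == w[0]);        // Input.dim[2] must match W.dim[0]
  assert(bias[1] == w[1]);         // Bias.dim[1] must match W.dim[1]
  const int64_t out_dim = w[1] / batchcount;
  return {input[0], input[1], out_dim};
}

// Example: batchcount = 4, Input [4, 128, 32], W [32, 64], Bias [1, 64]
// -> Out [4, 128, 16].
```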