From 383be08e6a12e507cfdaca383a584156f4b2436d Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Tue, 26 Mar 2024 20:21:38 +0800
Subject: [PATCH 1/2] xpu support check_nan_inf

---
 .../framework/details/nan_inf_utils_detail.cc | 63 +++++++++++++++----
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 731fd7db493f2f..e18e040e0e922a 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -23,6 +23,10 @@
 #endif
 
 #include "paddle/fluid/framework/convert_utils.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/refactor/math.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -719,23 +723,54 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
 #ifdef PADDLE_WITH_XPU
   if (framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP32) {
+    LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
     return;
   }
-  float* cpu_data = new float[tensor->numel()];
+  // float* cpu_data = new float[tensor->numel()];
+  // memory::Copy(platform::CPUPlace(),
+  //              static_cast<void*>(cpu_data),
+  //              tensor->place(),
+  //              static_cast<const void*>(tensor->data<float>()),
+  //              tensor->numel() * sizeof(float));
+  // // bool flag = false;
+  // for (int64_t i = 0; i < tensor->numel(); i++) {
+  //   if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
+  //     get_cpu_nan_inf_num() ++;
+  //     break;
+  //   }
+  // }
+  // delete[] cpu_data;
+
+  using XPUType = typename XPUTypeTrait<float>::Type;
+  platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(tensor->place()));
+  const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());
+
+  Tensor y_tensor;
+  bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
+  int r = xpu::check_nan_or_inf(dev_ctx->x_context(),
+                                x,
+                                y_ptr,
+                                tensor->numel());
+  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                    platform::errors::External(
+                        "The check_nan_or_inf XPU OP return wrong value[%d %s]",
+                        r, XPUAPIErrorMsg[r]));
+  dev_ctx->Wait();
+
+  bool check_res = false;
+  bool* res_ptr = &check_res;
   memory::Copy(platform::CPUPlace(),
-               static_cast<void*>(cpu_data),
-               tensor->place(),
-               static_cast<const void*>(tensor->data<float>()),
-               tensor->numel() * sizeof(float));
-  // bool flag = false;
-  for (int64_t i = 0; i < tensor->numel(); i++) {
-    if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-      get_cpu_nan_inf_num() ++;
-      break;
-    }
+               static_cast<void*>(res_ptr),
+               y_tensor.place(),
+               static_cast<const void*>(y_tensor.data<bool>()),
+               y_tensor.numel() * sizeof(bool));
+  VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
+  if (check_res) {
+    get_cpu_nan_inf_num() ++;
   }
-  delete[] cpu_data;
+  return;
 #endif
 }
 #if defined(PADDLE_WITH_CUDA)
@@ -743,6 +778,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
     CudaTensorCheckNanInf(*tensor, dnum);
 #endif
 }
+
 bool CheckBatchNanOrInfRet(const platform::Place& place) {
   if (!platform::is_gpu_place(place)) {
     return (get_cpu_nan_inf_num() > 0);
@@ -829,9 +865,10 @@ void DumpTensorToFile(const std::string& path, const std::string& prefix,
   out.write(s.c_str(), s.length());
   out.close();
 }
+
 void DumpAllScope(const Scope& exec_scope, const platform::Place& place) {
   int device_id = 0;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)) && !defined(_WIN32)
   device_id = place.GetDeviceId();
 #endif
   VLOG(0) << "begin dump scope all tensor data, device id=" << device_id;
From 9041f85df268503932405c5a853d30cbdb3b83a0 Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Tue, 26 Mar 2024 20:21:38 +0800
Subject: [PATCH 2/2] xpu support check_nan_inf

---
 paddle/fluid/framework/boxps_worker.cc        |  2 +-
 .../framework/details/nan_inf_utils_detail.cc | 63 +++++++++++++++----
 paddle/fluid/framework/operator.cc            |  2 +-
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc
index 17fffb84077a26..cb097970593877 100644
--- a/paddle/fluid/framework/boxps_worker.cc
+++ b/paddle/fluid/framework/boxps_worker.cc
@@ -776,7 +776,7 @@ void BoxPSWorker::TrainFiles() {
       SyncParam();
     }
   }
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
   if (FLAGS_check_nan_inf) {
     // check nan result
     if (framework::details::CheckBatchNanOrInfRet(place_)) {
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 731fd7db493f2f..e18e040e0e922a 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -23,6 +23,10 @@
 #endif
 
 #include "paddle/fluid/framework/convert_utils.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/refactor/math.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -719,23 +723,54 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
 #ifdef PADDLE_WITH_XPU
   if (framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP32) {
+    LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
     return;
   }
-  float* cpu_data = new float[tensor->numel()];
+  // float* cpu_data = new float[tensor->numel()];
+  // memory::Copy(platform::CPUPlace(),
+  //              static_cast<void*>(cpu_data),
+  //              tensor->place(),
+  //              static_cast<const void*>(tensor->data<float>()),
+  //              tensor->numel() * sizeof(float));
+  // // bool flag = false;
+  // for (int64_t i = 0; i < tensor->numel(); i++) {
+  //   if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
+  //     get_cpu_nan_inf_num() ++;
+  //     break;
+  //   }
+  // }
+  // delete[] cpu_data;
+
+  using XPUType = typename XPUTypeTrait<float>::Type;
+  platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(tensor->place()));
+  const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());
+
+  Tensor y_tensor;
+  bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
+  int r = xpu::check_nan_or_inf(dev_ctx->x_context(),
+                                x,
+                                y_ptr,
+                                tensor->numel());
+  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                    platform::errors::External(
+                        "The check_nan_or_inf XPU OP return wrong value[%d %s]",
+                        r, XPUAPIErrorMsg[r]));
+  dev_ctx->Wait();
+
+  bool check_res = false;
+  bool* res_ptr = &check_res;
   memory::Copy(platform::CPUPlace(),
-               static_cast<void*>(cpu_data),
-               tensor->place(),
-               static_cast<const void*>(tensor->data<float>()),
-               tensor->numel() * sizeof(float));
-  // bool flag = false;
-  for (int64_t i = 0; i < tensor->numel(); i++) {
-    if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-      get_cpu_nan_inf_num() ++;
-      break;
-    }
+               static_cast<void*>(res_ptr),
+               y_tensor.place(),
+               static_cast<const void*>(y_tensor.data<bool>()),
+               y_tensor.numel() * sizeof(bool));
+  VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
+  if (check_res) {
+    get_cpu_nan_inf_num() ++;
   }
-  delete[] cpu_data;
+  return;
 #endif
 }
 #if defined(PADDLE_WITH_CUDA)
@@ -743,6 +778,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
     CudaTensorCheckNanInf(*tensor, dnum);
 #endif
 }
+
 bool CheckBatchNanOrInfRet(const platform::Place& place) {
   if (!platform::is_gpu_place(place)) {
     return (get_cpu_nan_inf_num() > 0);
@@ -829,9 +865,10 @@ void DumpTensorToFile(const std::string& path, const std::string& prefix,
   out.write(s.c_str(), s.length());
   out.close();
 }
+
 void DumpAllScope(const Scope& exec_scope, const platform::Place& place) {
   int device_id = 0;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)) && !defined(_WIN32)
   device_id = place.GetDeviceId();
 #endif
   VLOG(0) << "begin dump scope all tensor data, device id=" << device_id;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6db5a002de1832..bb134df5e775e2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1760,7 +1760,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 
   if (FLAGS_check_nan_inf) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
     if (framework::details::CheckOpHasNanOrInfRet(*this, exec_scope, place)) {
       framework::details::DumpAllScope(exec_scope, place);
       // dump current op data
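
For readers comparing the two code paths: the patches move the NaN/Inf test onto the XPU device via xpu::check_nan_or_inf and copy back a single bool, whereas the commented-out fallback copied the whole tensor to the host and scanned it element by element. A minimal standalone sketch of that host-side scan follows (plain C++, no Paddle dependencies; HasNanOrInf is an illustrative name, not a Paddle function):

// Illustrative sketch only: mirrors the commented-out host-side loop in
// CheckVarHasNanOrInfRet (copy to CPU, then scan every element).
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

bool HasNanOrInf(const float* data, int64_t numel) {
  for (int64_t i = 0; i < numel; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) {
      return true;  // one bad element is enough, like the break in the original loop
    }
  }
  return false;
}

int main() {
  std::vector<float> ok{1.0f, 2.0f, 3.0f};
  std::vector<float> bad{1.0f, std::nanf(""), 3.0f};
  std::cout << HasNanOrInf(ok.data(), static_cast<int64_t>(ok.size())) << "\n";   // prints 0
  std::cout << HasNanOrInf(bad.data(), static_cast<int64_t>(bad.size())) << "\n"; // prints 1
  return 0;
}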