From 383be08e6a12e507cfdaca383a584156f4b2436d Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Tue, 26 Mar 2024 20:21:38 +0800
Subject: [PATCH 1/2] xpu support check_nan_inf

---
 .../framework/details/nan_inf_utils_detail.cc | 63 +++++++++++++++----
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 731fd7db493f2f..e18e040e0e922a 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -23,6 +23,10 @@
 #endif
 
 #include "paddle/fluid/framework/convert_utils.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/refactor/math.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -719,23 +723,54 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
 #ifdef PADDLE_WITH_XPU
   if (framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP32) {
+    LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
     return;
   }
-  float* cpu_data = new float[tensor->numel()];
+  // float* cpu_data = new float[tensor->numel()];
+  // memory::Copy(platform::CPUPlace(),
+  //              static_cast<void*>(cpu_data),
+  //              tensor->place(),
+  //              static_cast<const void*>(tensor->data<float>()),
+  //              tensor->numel() * sizeof(float));
+  // // bool flag = false;
+  // for (int64_t i = 0; i < tensor->numel(); i++) {
+  //   if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
+  //     get_cpu_nan_inf_num() ++;
+  //     break;
+  //   }
+  // }
+  // delete[] cpu_data;
+
+  using XPUType = typename XPUTypeTrait<float>::Type;
+  platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(tensor->place()));
+  const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());
+
+  Tensor y_tensor;
+  bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
+  int r = xpu::check_nan_or_inf(dev_ctx->x_context(),
+                                x,
+                                y_ptr,
+                                tensor->numel());
+  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                    platform::errors::External(
+                        "The check_nan_or_inf XPU OP return wrong value[%d %s]",
+                        r, XPUAPIErrorMsg[r]));
+  dev_ctx->Wait();
+
+  bool check_res = false;
+  bool* res_ptr = &check_res;
   memory::Copy(platform::CPUPlace(),
-               static_cast<void*>(cpu_data),
-               tensor->place(),
-               static_cast<const void*>(tensor->data<float>()),
-               tensor->numel() * sizeof(float));
-  // bool flag = false;
-  for (int64_t i = 0; i < tensor->numel(); i++) {
-    if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-      get_cpu_nan_inf_num() ++;
-      break;
-    }
+               static_cast<void*>(res_ptr),
+               y_tensor.place(),
+               static_cast<const void*>(y_tensor.data<bool>()),
+               y_tensor.numel() * sizeof(bool));
+  VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
+  if (check_res) {
+    get_cpu_nan_inf_num() ++;
   }
-  delete[] cpu_data;
+  return;
 #endif
 }
 #if defined(PADDLE_WITH_CUDA)
@@ -743,6 +778,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
     CudaTensorCheckNanInf(*tensor, dnum);
 #endif
 }
+
 bool CheckBatchNanOrInfRet(const platform::Place& place) {
   if (!platform::is_gpu_place(place)) {
     return (get_cpu_nan_inf_num() > 0);
@@ -829,9 +865,10 @@ void DumpTensorToFile(const std::string& path, const std::string& prefix,
   out.write(s.c_str(), s.length());
   out.close();
 }
+
 void DumpAllScope(const Scope& exec_scope, const platform::Place& place) {
   int device_id = 0;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)) && !defined(_WIN32)
   device_id = place.GetDeviceId();
 #endif
   VLOG(0) << "begin dump scope all tensor data, device id=" << device_id;
From 9041f85df268503932405c5a853d30cbdb3b83a0 Mon Sep 17 00:00:00 2001
From: xiayanming
Date: Tue, 26 Mar 2024 20:21:38 +0800
Subject: [PATCH 2/2] xpu support check_nan_inf

---
 paddle/fluid/framework/boxps_worker.cc        |  2 +-
 .../framework/details/nan_inf_utils_detail.cc | 63 +++++++++++++++----
 paddle/fluid/framework/operator.cc            |  2 +-
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc
index 17fffb84077a26..cb097970593877 100644
--- a/paddle/fluid/framework/boxps_worker.cc
+++ b/paddle/fluid/framework/boxps_worker.cc
@@ -776,7 +776,7 @@ void BoxPSWorker::TrainFiles() {
       SyncParam();
     }
   }
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
   if (FLAGS_check_nan_inf) {
     // check nan result
     if (framework::details::CheckBatchNanOrInfRet(place_)) {
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 731fd7db493f2f..e18e040e0e922a 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -23,6 +23,10 @@
 #endif
 
 #include "paddle/fluid/framework/convert_utils.h"
+#ifdef PADDLE_WITH_XPU
+#include "xpu/refactor/math.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -719,23 +723,54 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
 #ifdef PADDLE_WITH_XPU
   if (framework::TransToProtoVarType(tensor->dtype()) !=
       proto::VarType::FP32) {
+    LOG(WARNING) << "skip check_nan_inf, tensor type:" << tensor->dtype() << " not float32!";
     return;
   }
-  float* cpu_data = new float[tensor->numel()];
+  // float* cpu_data = new float[tensor->numel()];
+  // memory::Copy(platform::CPUPlace(),
+  //              static_cast<void*>(cpu_data),
+  //              tensor->place(),
+  //              static_cast<const void*>(tensor->data<float>()),
+  //              tensor->numel() * sizeof(float));
+  // // bool flag = false;
+  // for (int64_t i = 0; i < tensor->numel(); i++) {
+  //   if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
+  //     get_cpu_nan_inf_num() ++;
+  //     break;
+  //   }
+  // }
+  // delete[] cpu_data;
+
+  using XPUType = typename XPUTypeTrait<float>::Type;
+  platform::XPUDeviceContext* dev_ctx = dynamic_cast<platform::XPUDeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(tensor->place()));
+  const XPUType* x = reinterpret_cast<const XPUType*>(tensor->data<float>());
+
+  Tensor y_tensor;
+  bool* y_ptr = y_tensor.mutable_data<bool>({1}, place);
+  int r = xpu::check_nan_or_inf(dev_ctx->x_context(),
+                                x,
+                                y_ptr,
+                                tensor->numel());
+  PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                    platform::errors::External(
+                        "The check_nan_or_inf XPU OP return wrong value[%d %s]",
+                        r, XPUAPIErrorMsg[r]));
+  dev_ctx->Wait();
+
+  bool check_res = false;
+  bool* res_ptr = &check_res;
   memory::Copy(platform::CPUPlace(),
-               static_cast<void*>(cpu_data),
-               tensor->place(),
-               static_cast<const void*>(tensor->data<float>()),
-               tensor->numel() * sizeof(float));
-  // bool flag = false;
-  for (int64_t i = 0; i < tensor->numel(); i++) {
-    if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
-      get_cpu_nan_inf_num() ++;
-      break;
-    }
+               static_cast<void*>(res_ptr),
+               y_tensor.place(),
+               static_cast<const void*>(y_tensor.data<bool>()),
+               y_tensor.numel() * sizeof(bool));
+  VLOG(3) << "CheckVarHasNanOrInfRet check_res = " << check_res;
+  if (check_res) {
+    get_cpu_nan_inf_num() ++;
   }
-  delete[] cpu_data;
+  return;
 #endif
 }
 #if defined(PADDLE_WITH_CUDA)
@@ -743,6 +778,7 @@ void CheckVarHasNanOrInfRet(const std::string& op_type,
     CudaTensorCheckNanInf(*tensor, dnum);
 #endif
 }
+
 bool CheckBatchNanOrInfRet(const platform::Place& place) {
   if (!platform::is_gpu_place(place)) {
     return (get_cpu_nan_inf_num() > 0);
@@ -829,9 +865,10 @@ void DumpTensorToFile(const std::string& path, const std::string& prefix,
   out.write(s.c_str(), s.length());
   out.close();
 }
+
 void DumpAllScope(const Scope& exec_scope, const platform::Place& place) {
   int device_id = 0;
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)) && !defined(_WIN32)
   device_id = place.GetDeviceId();
 #endif
   VLOG(0) << "begin dump scope all tensor data, device id=" << device_id;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6db5a002de1832..bb134df5e775e2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1760,7 +1760,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 
   if (FLAGS_check_nan_inf) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_XPU_KP)
     if (framework::details::CheckOpHasNanOrInfRet(*this, exec_scope, place)) {
       framework::details::DumpAllScope(exec_scope, place);
       // dump current op data
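
For readers comparing the two code paths: the patches move the NaN/Inf test onto the XPU device via xpu::check_nan_or_inf and copy back a single bool, whereas the commented-out fallback copied the whole tensor to the host and scanned it element by element. A minimal standalone sketch of that host-side scan follows (plain C++, no Paddle dependencies; HasNanOrInf is an illustrative name, not a Paddle function):

// Illustrative sketch only: mirrors the commented-out host-side loop in
// CheckVarHasNanOrInfRet (copy to CPU, then scan every element).
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

bool HasNanOrInf(const float* data, int64_t numel) {
  for (int64_t i = 0; i < numel; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) {
      return true;  // one bad element is enough, like the break in the original loop
    }
  }
  return false;
}

int main() {
  std::vector<float> ok{1.0f, 2.0f, 3.0f};
  std::vector<float> bad{1.0f, std::nanf(""), 3.0f};
  std::cout << HasNanOrInf(ok.data(), static_cast<int64_t>(ok.size())) << "\n";   // prints 0
  std::cout << HasNanOrInf(bad.data(), static_cast<int64_t>(bad.size())) << "\n"; // prints 1
  return 0;
}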