[Pten] Replace platform::Place with pten::Place #38899

Merged on Jan 17, 2022 (28 commits)

Commits (28)
e3640af
add pten::Place data structure.
jiweibo Jan 10, 2022
212ea96
update ci problem
jiweibo Jan 10, 2022
08a0263
fix ci problem
jiweibo Jan 10, 2022
30d84e7
update
jiweibo Jan 10, 2022
7c66def
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 11, 2022
21ec56d
using platform::Place=pten::Place
jiweibo Jan 11, 2022
a26f0f7
remove BOOST_GET_CONST for CPUPlace and GPUPlace
jiweibo Jan 11, 2022
2211b28
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 11, 2022
b13b038
compile pass 25%.
jiweibo Jan 12, 2022
f46f4e8
compile pass 45%
jiweibo Jan 12, 2022
62d1114
compile pass 60%
jiweibo Jan 12, 2022
afccb3c
remove boost_get for xpu npu mlu and ipu
jiweibo Jan 12, 2022
45b5f1d
compile pass on cpu and gpu.
jiweibo Jan 12, 2022
139f3ff
fix compile problem
jiweibo Jan 12, 2022
de07a8c
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 12, 2022
57cde3c
fix compile error.
jiweibo Jan 12, 2022
a4a7263
update
jiweibo Jan 12, 2022
6582a7e
fix ci problem
jiweibo Jan 13, 2022
e32da4f
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 13, 2022
991a751
update
jiweibo Jan 13, 2022
fedb225
ci approve
jiweibo Jan 13, 2022
909b81a
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 13, 2022
7930697
fix ci problem
jiweibo Jan 14, 2022
e0e593a
fix ci eager test problem
jiweibo Jan 14, 2022
bf94564
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 14, 2022
ba6ddd0
remove BOOST_GET_CONST
jiweibo Jan 14, 2022
87111ea
fix npu compile
jiweibo Jan 14, 2022
4a5e80f
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
jiweibo Jan 16, 2022
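Note on the pattern in the diffs below: before this PR, platform::Place was a boost::variant over the concrete place types (CPUPlace, CUDAPlace, XPUPlace, ...), so call sites had to unwrap it with BOOST_GET_CONST before reading fields such as .device. With `using platform::Place = pten::Place` (commit 21ec56d), Place becomes a plain struct carrying a type tag and a device id, and the unwrapping disappears. The following is only a minimal sketch of that shape; the member and enum names are illustrative, not copied from the pten sources:

```cpp
#include <cstdint>

// Illustrative stand-in for pten::Place: a tagged struct instead of a
// boost::variant over CPUPlace/CUDAPlace/XPUPlace/... alternatives.
enum class AllocationType : int8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU };

class Place {
 public:
  Place() = default;
  Place(AllocationType type, int8_t id) : device(id), type_(type) {}

  // Explicit type tag; replaces boost::variant's which() index.
  AllocationType GetType() const { return type_; }

  // The device id is a plain member, readable for any device place
  // without naming the concrete place type first.
  int8_t device{0};

 private:
  AllocationType type_{AllocationType::UNDEFINED};
};

// Before: int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
// After:  int dev_id = p.device;
```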
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -221,8 +221,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
if (framework::IsFastEagerDeletionModeEnabled()) {
-      gc.reset(new framework::UnsafeFastGPUGarbageCollector(
-          BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
+      gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
+                                                            max_memory_size));
}
}
#endif
59 changes: 28 additions & 31 deletions paddle/fluid/distributed/service/brpc_utils.cc
@@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
-        platform::CPUPlace(), temp_ptr,
-        BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+        platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
@@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
}

void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
-                          butil::IOBufBytesIterator& io_buffer_itr,
+                          butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
@@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,

// IO Buffer
if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;                                 // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
-    unsigned long data_len;
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
-    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
+    unsigned long data_len;  // NOLINT
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);     // NOLINT
+    io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);  // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), (void*)temp_ptr,
-                 tensor->numel() * framework::SizeOfType(tensor->type()),
-                 stream);
+    memory::Copy(
+        place, tensor_data, platform::CPUPlace(), (void*)temp_ptr,  // NOLINT
+        tensor->numel() * framework::SizeOfType(tensor->type()), stream);
delete[] temp_ptr;
#endif
}
}

-void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
-                             butil::IOBufBytesIterator& io_buffer_itr,
-                             const platform::DeviceContext& ctx) {
+void DeserializeSelectedRows(
+    framework::Variable* var, const VarMsg& msg,
+    butil::IOBufBytesIterator& io_buffer_itr,  // NOLINT
+    const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
auto* slr = var->GetMutable<framework::SelectedRows>();
framework::Tensor* tensor = slr->mutable_value();
Expand All @@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
// IO Buffer
if (platform::is_cpu_place(place)) {
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    unsigned long data_len;                                 // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
-    char* temp_ptr =
-        new char[tensor->numel() * framework::SizeOfType(tensor->type())];
-    unsigned long data_len;
-    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
+    char* temp_ptr = new char[tensor->numel() *
+                              framework::SizeOfType(tensor->type())];  // NOLINT
+    unsigned long data_len;                                 // NOLINT
+    io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
io_buffer_itr.copy_and_forward(temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
-                 platform::CPUPlace(), temp_ptr,
+    memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
delete[] temp_ptr;
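A side note on the // NOLINT markers added in this file: they appear to silence cpplint (the `unsigned long` declarations trip runtime/int, and the `(void*)` casts trip readability/casting); the wire format itself is untouched. That format is simply an 8-byte length prefix followed by the raw tensor bytes. A self-contained sketch of the framing, using std::string where the real code uses butil::IOBuf (an assumption made purely for illustration):

```cpp
#include <cstdint>
#include <cstring>
#include <string>

// Write one length-prefixed payload, mirroring what SerializeLodTensor
// appends to the IOBuf: 8 bytes of length, then the data itself.
void AppendLengthPrefixed(std::string* buf, const char* data, uint64_t len) {
  buf->append(reinterpret_cast<const char*>(&len), sizeof(len));  // 8 bytes
  buf->append(data, len);
}

// Read one payload back, mirroring DeserializeLodTensor's pair of
// copy_and_forward calls; returns the position just past the payload.
size_t ReadLengthPrefixed(const std::string& buf, size_t pos,
                          std::string* out) {
  uint64_t len = 0;
  std::memcpy(&len, buf.data() + pos, sizeof(len));
  out->assign(buf.data() + pos + sizeof(len), len);
  return pos + sizeof(len) + static_cast<size_t>(len);
}
```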
3 changes: 1 addition & 2 deletions paddle/fluid/distributed/service/heter_client.cc
@@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
-      platform::CPUPlace(), temp_ptr,
-      BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
+      platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
micro_id = static_cast<int>(temp_ptr_float[0]);
32 changes: 16 additions & 16 deletions paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -43,7 +43,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}

-  void operator()(const paddle::platform::CPUPlace& place) {
+  void operator()(const paddle::platform::CPUPlace& place) const {
paddle::platform::CPUDeviceContext* ctx =
dynamic_cast<paddle::platform::CPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
// TODO(jiabin): Support xpu here from gradient_accumulator.cc

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
paddle::platform::CUDADeviceContext* ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
@@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
#else
-  void operator()(const paddle::platform::CUDAPlace& place) {
+  void operator()(const paddle::platform::CUDAPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
@@ -76,22 +76,22 @@ class TensorAddFunctor : public boost::static_visitor<> {

// TODO(jiabin): Support Npu here from gradient_accumulator.cc
// there is NO blas in CUDAPinnedPlace
-  void operator()(const paddle::platform::CUDAPinnedPlace& place) {
+  void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}

#ifdef PADDLE_WITH_ASCEND_CL
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
-  void operator()(const paddle::platform::NPUPlace& place) {
+  void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
@@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif

#ifdef PADDLE_WITH_XPU
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
paddle::platform::XPUDeviceContext* ctx =
dynamic_cast<paddle::platform::XPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
}
#else
-  void operator()(const paddle::platform::XPUPlace& place) {
+  void operator()(const paddle::platform::XPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
@@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif

#ifdef PADDLE_WITH_MLU
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
-  void operator()(const paddle::platform::MLUPlace& place) {
+  void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
@@ -132,22 +132,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif

#ifdef PADDLE_WITH_IPU
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
-  void operator()(const paddle::platform::IPUPlace& place) {
+  void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#endif

-  void operator()(const paddle::platform::NPUPinnedPlace& place) {
+  void operator()(const paddle::platform::NPUPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
@@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private:
int64_t numel_;
const T* x_;
-  T* y_;
+  mutable T* y_;
};

template <typename DeviceContext, typename T>
@@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>()); \
-    boost::apply_visitor(func, place);                                       \
+    paddle::platform::VisitPlace(place, func);                               \
return; \
}

@@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
-    boost::apply_visitor(func, place);                                       \
+    paddle::platform::VisitPlace(place, func);                               \
return; \
}

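Two changes in this file belong together: boost::apply_visitor(func, place) becomes paddle::platform::VisitPlace(place, func), and every operator() overload of TensorAddFunctor gains a const qualifier, with y_ turned mutable so the CPU/GPU overloads can still write through it. That combination is what you would expect if the new dispatcher takes the visitor by const reference and switches on the place's type tag. A minimal self-contained sketch of such a dispatcher, shown as an illustration of the mechanism rather than the PR's actual implementation:

```cpp
#include <stdexcept>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
  AllocationType GetType() const { return type; }
};

// Concrete tag types handed to the visitor's typed overloads.
struct CPUPlace { int device; };
struct GPUPlace { int device; };

// Dispatch on the tagged Place and invoke the matching overload. Taking
// `visitor` by const reference is why TensorAddFunctor's operator()
// overloads are const-qualified (and y_ mutable) after this change.
template <typename Visitor>
void VisitPlace(const Place& place, const Visitor& visitor) {
  switch (place.GetType()) {
    case AllocationType::CPU:
      visitor(CPUPlace{place.device});
      return;
    case AllocationType::GPU:
      visitor(GPUPlace{place.device});
      return;
  }
  throw std::runtime_error("unsupported place kind");
}
```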
9 changes: 3 additions & 6 deletions paddle/fluid/eager/legacy/op_runner.cc
@@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins,
VLOG(6) << "Get Device id";
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::platform::SetDeviceId(
-        BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device);
+    paddle::platform::SetDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
-    paddle::platform::SetXPUDeviceId(
-        BOOST_GET_CONST(paddle::platform::XPUPlace, place).device);
+    paddle::platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
-    paddle::platform::SetNPUDeviceId(
-        BOOST_GET_CONST(paddle::platform::NPUPlace, place).device);
+    paddle::platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
4 changes: 2 additions & 2 deletions paddle/fluid/eager/legacy/prepared_operator.cc
@@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
-  if (is_xpu_place(expected_kernel_key.place_) &&
+  if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
Expand All @@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
-      is_npu_place(expected_kernel_key.place_)) {
+      paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
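Two things are visible in this hunk. First, is_xpu_place/is_npu_place now need their paddle::platform:: qualification, plausibly because argument-dependent lookup no longer reaches that namespace once the argument is a pten::Place rather than a platform type. Second, the surrounding logic is a kernel-fallback path: when no kernel is registered for the expected XPU/NPU key (or the op is blacklisted on XPU), the lookup retries with a CPU kernel key. A simplified sketch of that fallback shape, with hypothetical key and kernel types:

```cpp
#include <map>
#include <string>
#include <tuple>

enum class PlaceKind { CPU, XPU };

// Hypothetical stand-in for the framework's kernel key type.
struct KernelKey {
  PlaceKind place;
  std::string dtype;
  bool operator<(const KernelKey& o) const {
    return std::tie(place, dtype) < std::tie(o.place, o.dtype);
  }
};

using Kernel = void (*)();

// If no kernel is registered for the device key, retry with a CPU key;
// the same shape as PrepareImpl's "fallbacking to CPU one!" branch.
const Kernel* FindKernelWithCPUFallback(
    const std::map<KernelKey, Kernel>& kernels, KernelKey key) {
  auto it = kernels.find(key);
  if (it == kernels.end() && key.place != PlaceKind::CPU) {
    key.place = PlaceKind::CPU;
    it = kernels.find(key);
  }
  return it == kernels.end() ? nullptr : &it->second;
}
```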
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_device_transform.cc
@@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
<< " dst_place: " << dst_place;

PADDLE_ENFORCE_NE(
-      in.place().which(), dst_place.which(),
+      in.place().GetType(), dst_place.GetType(),
platform::errors::Unavailable("Currently, model parallelism is only "
"supported between CPU and CUDA."));

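boost::variant's which() returns the index of the alternative currently held, so equal indices meant "same kind of place"; the new Place makes that check explicit by comparing AllocationType tags via GetType(). In the same spirit, the is_*_place helpers reduce to tag tests. A tiny self-contained illustration (names again illustrative, not the real definitions):

```cpp
enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  AllocationType GetType() const { return type; }
};

// What is_cpu_place/is_gpu_place amount to once Place carries an explicit
// type tag instead of a variant index:
bool is_cpu_place(const Place& p) { return p.GetType() == AllocationType::CPU; }
bool is_gpu_place(const Place& p) { return p.GetType() == AllocationType::GPU; }

// The PADDLE_ENFORCE_NE above compares place kinds, not device ids:
bool same_place_kind(const Place& a, const Place& b) {
  return a.GetType() == b.GetType();
}
```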
7 changes: 4 additions & 3 deletions paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -15,6 +15,7 @@

#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
Expand Down Expand Up @@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc(
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
-  if (is_gpu_place(places[0])) {
+  if (platform::is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
@@ -200,7 +201,7 @@ void AllReduceOpHandle::AllReduceFunc(
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
-  } else if (is_xpu_place(places[0])) {
+  } else if (platform::is_xpu_place(places[0])) {
#if defined(PADDLE_WITH_XPU_BKCL)
PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
platform::errors::InvalidArgument(
@@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc(
void AllReduceOpHandle::SyncNCCLAllReduce() {
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
-      int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
+      int dev_id = p.device;
auto *nccl_ctxs =
nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc
@@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor(
}
int index = 0;
for (uint32_t i = 0; i < places.size(); i++) {
-    int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device;
+    int id = places_[i].device;
if (place_to_index_.find(id) == place_to_index_.end()) {
place_to_index_[id] = index;
index++;
@@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops);
continue;
} else {
-      cur_place =
-          BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first);
+      cur_place = dev_ctxes_.begin()->first;
int cur_index = place_to_index_[cur_place.device];
RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index);
}
2 changes: 1 addition & 1 deletion paddle/fluid/framework/details/bkcl_op_handle.h
@@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase {
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
-    int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+    int dev_id = place.device;
auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
auto comm = bkcl_ctx.comm_;
