Skip to content

Commit

Permalink
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
Browse files Browse the repository at this point in the history
… final_state_trace_op
  • Loading branch information
jim19930609 committed Feb 8, 2022
2 parents 3482e29 + 4291036 commit eb8dd27
Show file tree
Hide file tree
Showing 50 changed files with 1,470 additions and 95 deletions.
7 changes: 6 additions & 1 deletion paddle/fluid/eager/auto_code_generator/eager_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ static bool IgnoreGradAttribute(const std::string& op_type,
}
}

// Only allow SumOp
if (op_type != "sum") {
return true;
}

return false;
}

Expand Down Expand Up @@ -1693,7 +1698,7 @@ static std::string GenerateSingleOpBase(
VLOG(6) << "Generated Outs Map";

// [Generation] Get Attrs Map
const char* ATTRS_TEMPLATE = " auto %s = this->attr_map_;\n";
const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n";
std::string grad_attrs_str =
paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name);
for (const auto& iter : grad_attrs) {
Expand Down
84 changes: 49 additions & 35 deletions paddle/fluid/framework/custom_operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,27 @@ static T* DynLoad(void* handle, std::string name) {
return func;
}

inline bool IsGradVar(const std::string& var_name) {
inline static bool IsGradVar(const std::string& var_name) {
std::string suffix = kGradVarSuffix;
return var_name.rfind(suffix) != std::string::npos;
}

inline bool IsDuplicableVar(const std::string& var_name) {
// Returns true when `var_name` contains the tensor-vector suffix
// (kTensorVectorSuffix), i.e. it names a duplicable (vector-of-tensor)
// variable. Containment check via rfind, not a strict ends-with check.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vec_suffix(kTensorVectorSuffix);
  return var_name.rfind(vec_suffix) != std::string::npos;
}

inline std::string NoGrad(const std::string& var_name) {
// Strips the trailing kGradVarSuffixSize characters from `var_name`,
// recovering the forward-variable name from a grad-variable name.
// NOTE(review): assumes var_name.size() >= kGradVarSuffixSize and that the
// name actually ends with the grad suffix — callers must guarantee this.
inline static std::string NoGrad(const std::string& var_name) {
  std::string suffix = kGradVarSuffix;
  return var_name.substr(0, var_name.size() - kGradVarSuffixSize);
}

inline bool IsMemberOf(const std::vector<std::string>& vec,
const std::string& name) {
// Returns true when `name` is an element of `vec` (linear search).
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  for (const auto& item : vec) {
    if (item == name) {
      return true;
    }
  }
  return false;
}

std::vector<std::string> ParseAttrStr(const std::string& attr) {
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
auto split_pos = attr.find_first_of(":");
PADDLE_ENFORCE_NE(split_pos, std::string::npos,
platform::errors::InvalidArgument(
Expand Down Expand Up @@ -602,44 +602,57 @@ class CustomGradOpMaker<imperative::OpBase>

//////////// Operator and Kernel Register //////////////

void RegisterOperatorKernelWithPlace(const std::string& name,
const paddle::KernelFunc& kernel_func,
const proto::VarType::Type type,
const PlaceType& place,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
static void RegisterOperatorKernelWithPlace(
const std::string& name,
const OperatorWithKernel::OpKernelFunc& op_kernel_func,
const proto::VarType::Type type, const PlaceType& place) {
OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place));
VLOG(3) << "Custom Operator: op kernel key: " << key;
OperatorWithKernel::AllOpKernels()[name][key] =
[kernel_func, inputs, outputs,
attrs](const framework::ExecutionContext& ctx) {
VLOG(3) << "Custom Operator: run custom kernel func in lambda.";
RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs);
};
OperatorWithKernel::AllOpKernels()[name][key] = op_kernel_func;
}

void RegisterOperatorKernel(const std::string& name,
const paddle::KernelFunc& kernel_func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
static void RegisterOperatorKernel(const std::string& name,
const paddle::KernelFunc& kernel_func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs,
void* dso_handle) {
VLOG(3) << "Custom Operator: op name in kernel: " << name;
// NOTE [ Dummy Op Kernel Key ]
// TODO(chenweihang): Because execute engine need get device context based
// op_kernel_key.place_, so we should register kernel for each
// device. But this is not entirely correct, if user only give a cpu kernel,
// but call api in gpu device, it will cause error.
RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
PlaceType::kCPU, inputs, outputs, attrs);
OperatorWithKernel::OpKernelFunc op_kernel_func;
if (kernel_func) {
VLOG(3) << "Register custom operator " << name << " with kernel func";
op_kernel_func = [kernel_func, inputs, outputs,
attrs](const framework::ExecutionContext& ctx) {
VLOG(3) << "Custom Operator: run custom kernel func in lambda.";
RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs);
};
} else {
VLOG(3) << "Register custom operator " << name
<< " with raw op kernel func";
PADDLE_ENFORCE_NOT_NULL(
dso_handle,
platform::errors::InvalidArgument(
"The dso handle must be provided if kernel_func is nullptr."));
using OpKernelFuncPtr = void(const framework::ExecutionContext&);
auto symbol_name = "PD_" + name + "_raw_op_kernel_func";
auto* func = detail::DynLoad<OpKernelFuncPtr>(dso_handle, symbol_name);
op_kernel_func = func;
}
RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW,
PlaceType::kCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
PlaceType::kGPU, inputs, outputs, attrs);
RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW,
PlaceType::kGPU);
#endif
}

void RegisterOperatorWithMetaInfo(
const std::vector<OpMetaInfo>& op_meta_infos) {
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
void* dso_handle) {
/* Op register */
OpInfo info;

Expand Down Expand Up @@ -792,7 +805,8 @@ void RegisterOperatorWithMetaInfo(
}

// Kernel func
RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs, op_attrs);
RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs, op_attrs,
dso_handle);

// If grad op or double grad op exists
std::string cur_op_name = op_name;
Expand Down Expand Up @@ -900,7 +914,7 @@ void RegisterOperatorWithMetaInfo(

// Kernel func
RegisterOperatorKernel(grad_op_name, grad_kernel_fn, grad_op_inputs,
grad_op_outputs, grad_op_attrs);
grad_op_outputs, grad_op_attrs, dso_handle);

// update current info
OpInfoMap::Instance().Insert(cur_op_name, info);
Expand All @@ -912,14 +926,14 @@ void RegisterOperatorWithMetaInfo(
}

void RegisterOperatorWithMetaInfoMap(
const paddle::OpMetaInfoMap& op_meta_info_map) {
const paddle::OpMetaInfoMap& op_meta_info_map, void* dso_handle) {
auto& meta_info_map = op_meta_info_map.GetMap();
VLOG(3) << "Custom Operator: size of op meta info map - "
<< meta_info_map.size();
// pair: {op_type, OpMetaInfo}
for (auto& pair : meta_info_map) {
VLOG(3) << "Custom Operator: pair first -> op name: " << pair.first;
RegisterOperatorWithMetaInfo(pair.second);
RegisterOperatorWithMetaInfo(pair.second, dso_handle);
}
}

Expand All @@ -934,7 +948,7 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap");
auto& op_meta_info_map = get_op_meta_info_map();

RegisterOperatorWithMetaInfoMap(op_meta_info_map);
RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle);
}

} // namespace framework
Expand Down
5 changes: 3 additions & 2 deletions paddle/fluid/framework/custom_operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);

// Register custom op api: register op directly
void RegisterOperatorWithMetaInfoMap(
const paddle::OpMetaInfoMap& op_meta_info_map);
const paddle::OpMetaInfoMap& op_meta_info_map, void* dso_handle = nullptr);

// Interface for selective register custom op.
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos);
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
void* dso_handle = nullptr);

} // namespace framework
} // namespace paddle
27 changes: 27 additions & 0 deletions paddle/fluid/framework/custom_raw_op_kernel_func.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/operator.h"
#include "paddle/pten/api/ext/op_meta_info.h"

// NOTE(zengjinle): this macro is only for internal usage. Commonly, users
// should not use this macro.
// Declares the extern "C" raw-op-kernel entry point
// `PD_<op_name>_raw_op_kernel_func`, which the framework later resolves by
// symbol name via dlopen/DynLoad. Must be invoked at global namespace scope
// (enforced by STATIC_ASSERT_GLOBAL_NAMESPACE).
// Fix: the static-assert message previously named the macro
// "__PD_DEFINE_RAW_KERNEL_FUNC" (missing "RAW_OP_"); it now matches the
// actual macro name so the diagnostic points users at the right symbol.
#define __PD_DEFINE_RAW_OP_KERNEL_FUNC(op_name, ctx)                         \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                            \
      __reg_raw_op_kernel_func__##op_name,                                   \
      "__PD_DEFINE_RAW_OP_KERNEL_FUNC must be called in global namespace."); \
  extern "C" void PD_##op_name##_raw_op_kernel_func(                         \
      const ::paddle::framework::ExecutionContext& ctx)
17 changes: 14 additions & 3 deletions paddle/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,29 @@ if (WITH_GPU)
nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc cuda_graph_with_memory_pool)
nv_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)

if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
FLAGS_allocator_strategy=auto_growth")
endif()
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
endif()
endif()

if (WITH_ROCM)
hip_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
hip_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)
endif()

if(WITH_TESTING AND TEST cuda_managed_memory_test)
set_tests_properties(cuda_managed_memory_test PROPERTIES
ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
TIMEOUT 50)
endif()

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ endif()

if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
Expand All @@ -27,6 +28,7 @@ endif()

if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
Expand All @@ -42,7 +44,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
Expand Down
Loading

0 comments on commit eb8dd27

Please sign in to comment.