Skip to content

Commit

Permalink
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
Browse files Browse the repository at this point in the history
… final_state_trace_op
  • Loading branch information
jim19930609 committed Feb 8, 2022
2 parents 3482e29 + 4291036 commit eb8dd27
Show file tree
Hide file tree
Showing 50 changed files with 1,470 additions and 95 deletions.
7 changes: 6 additions & 1 deletion paddle/fluid/eager/auto_code_generator/eager_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ static bool IgnoreGradAttribute(const std::string& op_type,
}
}

// Only allow SumOp
if (op_type != "sum") {
return true;
}

return false;
}

Expand Down Expand Up @@ -1693,7 +1698,7 @@ static std::string GenerateSingleOpBase(
VLOG(6) << "Generated Outs Map";

// [Generation] Get Attrs Map
const char* ATTRS_TEMPLATE = " auto %s = this->attr_map_;\n";
const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n";
std::string grad_attrs_str =
paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name);
for (const auto& iter : grad_attrs) {
Expand Down
84 changes: 49 additions & 35 deletions paddle/fluid/framework/custom_operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,27 +61,27 @@ static T* DynLoad(void* handle, std::string name) {
return func;
}

inline bool IsGradVar(const std::string& var_name) {
inline static bool IsGradVar(const std::string& var_name) {
std::string suffix = kGradVarSuffix;
return var_name.rfind(suffix) != std::string::npos;
}

inline bool IsDuplicableVar(const std::string& var_name) {
// Returns true when `var_name` contains the tensor-vector suffix
// (kTensorVectorSuffix), i.e. it names a duplicable (vector-of-tensor)
// variable. Containment check via rfind, not a strict ends-with check.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vec_suffix(kTensorVectorSuffix);
  return var_name.rfind(vec_suffix) != std::string::npos;
}

inline std::string NoGrad(const std::string& var_name) {
// Strips the trailing kGradVarSuffixSize characters from `var_name`,
// recovering the forward-variable name from a grad-variable name.
// NOTE(review): assumes var_name.size() >= kGradVarSuffixSize and that the
// name actually ends with the grad suffix — callers must guarantee this.
inline static std::string NoGrad(const std::string& var_name) {
  std::string suffix = kGradVarSuffix;
  return var_name.substr(0, var_name.size() - kGradVarSuffixSize);
}

inline bool IsMemberOf(const std::vector<std::string>& vec,
const std::string& name) {
// Returns true when `name` is an element of `vec` (linear search).
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  for (const auto& item : vec) {
    if (item == name) {
      return true;
    }
  }
  return false;
}

std::vector<std::string> ParseAttrStr(const std::string& attr) {
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
auto split_pos = attr.find_first_of(":");
PADDLE_ENFORCE_NE(split_pos, std::string::npos,
platform::errors::InvalidArgument(
Expand Down Expand Up @@ -602,44 +602,57 @@ class CustomGradOpMaker<imperative::OpBase>

//////////// Operator and Kernel Register //////////////

void RegisterOperatorKernelWithPlace(const std::string& name,
const paddle::KernelFunc& kernel_func,
const proto::VarType::Type type,
const PlaceType& place,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
static void RegisterOperatorKernelWithPlace(
const std::string& name,
const OperatorWithKernel::OpKernelFunc& op_kernel_func,
const proto::VarType::Type type, const PlaceType& place) {
OpKernelType key(type, experimental::ConvertExtPlaceToInnerPlace(place));
VLOG(3) << "Custom Operator: op kernel key: " << key;
OperatorWithKernel::AllOpKernels()[name][key] =
[kernel_func, inputs, outputs,
attrs](const framework::ExecutionContext& ctx) {
VLOG(3) << "Custom Operator: run custom kernel func in lambda.";
RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs);
};
OperatorWithKernel::AllOpKernels()[name][key] = op_kernel_func;
}

void RegisterOperatorKernel(const std::string& name,
const paddle::KernelFunc& kernel_func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs) {
static void RegisterOperatorKernel(const std::string& name,
const paddle::KernelFunc& kernel_func,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::vector<std::string>& attrs,
void* dso_handle) {
VLOG(3) << "Custom Operator: op name in kernel: " << name;
// NOTE [ Dummy Op Kernel Key ]
// TODO(chenweihang): Because execute engine need get device context based
// op_kernel_key.place_, so we should register kernel for each
// device. But this is not entirely correct, if user only give a cpu kernel,
// but call api in gpu device, it will cause error.
RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
PlaceType::kCPU, inputs, outputs, attrs);
OperatorWithKernel::OpKernelFunc op_kernel_func;
if (kernel_func) {
VLOG(3) << "Register custom operator " << name << " with kernel func";
op_kernel_func = [kernel_func, inputs, outputs,
attrs](const framework::ExecutionContext& ctx) {
VLOG(3) << "Custom Operator: run custom kernel func in lambda.";
RunKernelFunc(ctx, kernel_func, inputs, outputs, attrs);
};
} else {
VLOG(3) << "Register custom operator " << name
<< " with raw op kernel func";
PADDLE_ENFORCE_NOT_NULL(
dso_handle,
platform::errors::InvalidArgument(
"The dso handle must be provided if kernel_func is nullptr."));
using OpKernelFuncPtr = void(const framework::ExecutionContext&);
auto symbol_name = "PD_" + name + "_raw_op_kernel_func";
auto* func = detail::DynLoad<OpKernelFuncPtr>(dso_handle, symbol_name);
op_kernel_func = func;
}
RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW,
PlaceType::kCPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
PlaceType::kGPU, inputs, outputs, attrs);
RegisterOperatorKernelWithPlace(name, op_kernel_func, proto::VarType::RAW,
PlaceType::kGPU);
#endif
}

void RegisterOperatorWithMetaInfo(
const std::vector<OpMetaInfo>& op_meta_infos) {
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
void* dso_handle) {
/* Op register */
OpInfo info;

Expand Down Expand Up @@ -792,7 +805,8 @@ void RegisterOperatorWithMetaInfo(
}

// Kernel func
RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs, op_attrs);
RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs, op_attrs,
dso_handle);

// If grad op or double grad op exists
std::string cur_op_name = op_name;
Expand Down Expand Up @@ -900,7 +914,7 @@ void RegisterOperatorWithMetaInfo(

// Kernel func
RegisterOperatorKernel(grad_op_name, grad_kernel_fn, grad_op_inputs,
grad_op_outputs, grad_op_attrs);
grad_op_outputs, grad_op_attrs, dso_handle);

// update current info
OpInfoMap::Instance().Insert(cur_op_name, info);
Expand All @@ -912,14 +926,14 @@ void RegisterOperatorWithMetaInfo(
}

void RegisterOperatorWithMetaInfoMap(
const paddle::OpMetaInfoMap& op_meta_info_map) {
const paddle::OpMetaInfoMap& op_meta_info_map, void* dso_handle) {
auto& meta_info_map = op_meta_info_map.GetMap();
VLOG(3) << "Custom Operator: size of op meta info map - "
<< meta_info_map.size();
// pair: {op_type, OpMetaInfo}
for (auto& pair : meta_info_map) {
VLOG(3) << "Custom Operator: pair first -> op name: " << pair.first;
RegisterOperatorWithMetaInfo(pair.second);
RegisterOperatorWithMetaInfo(pair.second, dso_handle);
}
}

Expand All @@ -934,7 +948,7 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap");
auto& op_meta_info_map = get_op_meta_info_map();

RegisterOperatorWithMetaInfoMap(op_meta_info_map);
RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle);
}

} // namespace framework
Expand Down
5 changes: 3 additions & 2 deletions paddle/fluid/framework/custom_operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);

// Register custom op api: register op directly
void RegisterOperatorWithMetaInfoMap(
const paddle::OpMetaInfoMap& op_meta_info_map);
const paddle::OpMetaInfoMap& op_meta_info_map, void* dso_handle = nullptr);

// Interface for selective register custom op.
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos);
void RegisterOperatorWithMetaInfo(const std::vector<OpMetaInfo>& op_meta_infos,
void* dso_handle = nullptr);

} // namespace framework
} // namespace paddle
27 changes: 27 additions & 0 deletions paddle/fluid/framework/custom_raw_op_kernel_func.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/framework/operator.h"
#include "paddle/pten/api/ext/op_meta_info.h"

// NOTE(zengjinle): this macro is only for internal usage. Commonly, users
// should not use this macro.
// Declares the extern "C" raw-op-kernel entry point
// `PD_<op_name>_raw_op_kernel_func`, which the framework later resolves by
// symbol name via dlopen/DynLoad. Must be invoked at global namespace scope
// (enforced by STATIC_ASSERT_GLOBAL_NAMESPACE).
// Fix: the static-assert message previously named the macro
// "__PD_DEFINE_RAW_KERNEL_FUNC" (missing "RAW_OP_"); it now matches the
// actual macro name so the diagnostic points users at the right symbol.
#define __PD_DEFINE_RAW_OP_KERNEL_FUNC(op_name, ctx)                         \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                            \
      __reg_raw_op_kernel_func__##op_name,                                   \
      "__PD_DEFINE_RAW_OP_KERNEL_FUNC must be called in global namespace."); \
  extern "C" void PD_##op_name##_raw_op_kernel_func(                         \
      const ::paddle::framework::ExecutionContext& ctx)
17 changes: 14 additions & 3 deletions paddle/fluid/memory/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,29 @@ if (WITH_GPU)
nv_test(stream_safe_cuda_alloc_test
SRCS stream_safe_cuda_alloc_test.cu
DEPS malloc cuda_graph_with_memory_pool)
nv_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)

if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;
FLAGS_allocator_strategy=auto_growth")
endif()
ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth")
endif()
endif()

if (WITH_ROCM)
hip_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
hip_test(cuda_managed_memory_test
SRCS cuda_managed_memory_test.cu
DEPS malloc gpu_info place)
endif()

if(WITH_TESTING AND TEST cuda_managed_memory_test)
set_tests_properties(cuda_managed_memory_test PROPERTIES
ENVIRONMENT "FLAGS_use_cuda_managed_memory=true;FLAGS_allocator_strategy=auto_growth"
TIMEOUT 50)
endif()

if(WITH_GPU AND WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ endif()

if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
Expand All @@ -27,6 +28,7 @@ endif()

if (WITH_ROCM)
hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info)
hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator)
hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
Expand All @@ -42,7 +44,7 @@ endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
Expand Down
Loading

0 comments on commit eb8dd27

Please sign in to comment.