[new-exec] fit mkldnn #41058

Merged 4 commits on Mar 31, 2022

7 changes: 5 additions & 2 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -422,8 +422,11 @@ void CheckVarHasNanOrInf(const std::string& op_type,
bool IsSkipOp(const framework::OperatorBase& op) {
if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;

int op_role = op.template Attr<int>(
framework::OpProtoAndCheckerMaker::OpRoleAttrName());
int op_role = 0;
if (op.HasAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())) {
op_role = op.template Attr<int>(
framework::OpProtoAndCheckerMaker::OpRoleAttrName());
}

// kForward=0, can't filter
if (op_role == static_cast<int>(framework::OpRole::kForward)) {
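The change above defaults op_role to kForward (0) and only reads the OpRole attribute when it is actually present, so ops that never had the attribute set (for example, ops built outside the usual program path) no longer trip the attribute lookup, which would otherwise throw. A minimal standalone sketch of that guarded-read pattern, using a hypothetical FakeOp in place of framework::OperatorBase:

```cpp
// Standalone analogue (not Paddle code): read an optional attribute with a
// default instead of letting the lookup throw when the attribute is absent.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

struct FakeOp {
  std::map<std::string, int> attrs;
  bool HasAttr(const std::string& name) const { return attrs.count(name) > 0; }
  int Attr(const std::string& name) const {
    auto it = attrs.find(name);
    if (it == attrs.end()) throw std::runtime_error("missing attr: " + name);
    return it->second;
  }
};

int GetOpRoleOrDefault(const FakeOp& op) {
  int op_role = 0;  // kForward == 0: the safe default, never filtered
  if (op.HasAttr("op_role")) {
    op_role = op.Attr("op_role");
  }
  return op_role;
}

int main() {
  FakeOp with_role{{{"op_role", 1}}};
  FakeOp without_role{};  // an op that never had the attribute set
  std::cout << GetOpRoleOrDefault(with_role) << "\n";     // 1
  std::cout << GetOpRoleOrDefault(without_role) << "\n";  // 0
  return 0;
}
```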
163 changes: 133 additions & 30 deletions paddle/fluid/framework/new_executor/data_transfer.cc
@@ -24,7 +24,7 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
const std::string& var_name,
std::string* new_var_name,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope) {
bool use_local_scope, bool is_fetch_v2) {
bool is_transferred = false;
auto* src_var_name = &var_name;

@@ -35,8 +35,11 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
if (need_layout_transform(kernel_type_for_var, expected_kernel_key)) {
auto op = TransferLayout(
*src_var_name, new_var_name, kernel_type_for_var.data_layout_,
expected_kernel_key.data_layout_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
expected_kernel_key.data_layout_, var_scope_, local_scope, is_fetch_v2);
if (op) {
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
op_func_nodes);
}
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
@@ -46,7 +49,10 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
auto op = TransferDtype(
*src_var_name, new_var_name, kernel_type_for_var.data_type_,
expected_kernel_key.data_type_, var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
if (op) {
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
op_func_nodes);
}
// update src_var_name
src_var_name = new_var_name;
is_transferred = true;
@@ -55,9 +61,13 @@ bool DataTranferHelper::apply(const OpKernelType& kernel_type_for_var,
if (need_device_transform(kernel_type_for_var, expected_kernel_key)) {
auto src_place = kernel_type_for_var.place_;
auto dst_place = expected_kernel_key.place_;

auto op = TransferDevice(*src_var_name, new_var_name, src_place, dst_place,
var_scope_, local_scope);
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name, op_func_nodes);
if (op) {
RunAndConstructOpFuncNode(op, *src_var_name, *new_var_name,
op_func_nodes);
}
is_transferred = true;
}
return is_transferred;
@@ -128,17 +138,44 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
new_op_func_nodes->emplace_back(std::move(new_op_func_node));
}

std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope) {
// Var is initialized && var contains tensor && tensor is initialized
bool IsTensorOfVarInitialized(Variable* var) {
if (var->IsInitialized()) {
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
return GetLoDTensorOrSelectedRowsValueFromVar(*var)->IsInitialized();
} else if (var->IsType<LoDTensorArray>()) {
return static_cast<const Tensor*>(&(var->Get<LoDTensorArray>()[0]))
->IsInitialized();
}
}
return false;
}

std::shared_ptr<OperatorBase> TransferLayout(
const std::string& var_name, std::string* new_var_name,
DataLayout in_layout, DataLayout out_layout, VariableScope* var_scope,
framework::Scope* local_scope, bool is_fetch_v2) {
#ifdef PADDLE_WITH_MKLDNN
// NOTE(zhiqiu): hot fix, follow the same logic in DataCopy() in fetch_op.cc
if (in_layout == framework::DataLayout::kMKLDNN &&
var_name == framework::GradVarName("Filter") && is_fetch_v2) {
out_layout = framework::DataLayout::kNCHW;
Contributor:
Why is out_layout set to kNCHW?

Contributor Author:
I don't really understand the trick; I just followed DataCopy() in fetch_op.cc.

Contributor:
OK, thanks. I understand now.

}
#endif

// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_layout_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(*new_var_name);
*new_var_name = var_name + "_layout_" +
std::to_string(static_cast<int>(in_layout)) + "_" +
std::to_string(static_cast<int>(out_layout));

if (var_scope->HasVar(*new_var_name) &&
IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
// already has same var
VLOG(4) << "Use cached variable: " << *new_var_name;
return nullptr;
}

auto* ptr = local_scope->Var(*new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << *new_var_name
@@ -171,10 +208,17 @@ std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_dtype_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(*new_var_name);
*new_var_name = var_name + "_dtype_" +
std::to_string(static_cast<int>(in_dtype)) + "_" +
std::to_string(static_cast<int>(out_dtype));
if (var_scope->HasVar(*new_var_name) &&
IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
// already has same var
VLOG(4) << "Use cached variable: " << *new_var_name;
return nullptr;
}

auto* ptr = local_scope->Var(*new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));

@@ -211,10 +255,17 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
VariableScope* var_scope,
framework::Scope* local_scope) {
// 1. Generate new_var_name and Initialize it
*new_var_name =
var_name + "_device_" + std::to_string(var_scope->VarSize() + 1);
auto* ptr = local_scope->Var(*new_var_name);
*new_var_name = var_name + "_device_" + src_place.DebugString() + "_" +
dst_place.DebugString();

if (var_scope->HasVar(*new_var_name) &&
IsTensorOfVarInitialized(var_scope->Var(*new_var_name))) {
// already has same var
VLOG(4) << "Use cached variable: " << *new_var_name;
return nullptr;
}

auto* ptr = local_scope->Var(*new_var_name);
auto var_type = var_scope->Var(var_name)->Type();
InitializeVariable(ptr, static_cast<proto::VarType::Type>(var_type));
VLOG(3) << "Create Variable " << *new_var_name
@@ -258,12 +309,28 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
// record the no need transform variable index.
std::unordered_set<int> no_data_transform_index;

const std::unordered_set<std::string>* no_buffer_ins = nullptr;
auto& no_buffer_inferer = op_base->Info().NoNeedBufferVarsInferer();
if (no_buffer_inferer) {
no_buffer_ins = &(no_buffer_inferer(op_base->Inputs(), op_base->Outputs(),
op_base->Attrs()));
if (no_buffer_ins->empty()) {
no_buffer_ins = nullptr;
}
}

DataTranferHelper data_transfer_helper(place, var_scope);
for (auto& var_name_item : *ins_map_temp) {
bool should_skip_input =
no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0;

for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
auto var_name = new_ins[var_name_item.first].at(i);
const Tensor* tensor_in;
std::string new_var_name;
bool is_transferred = false;

if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
} else if (var->IsType<LoDTensorArray>()) {
@@ -272,18 +339,54 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
} else {
continue;
}
// special case
if (!tensor_in->IsInitialized()) {
continue;
if (should_skip_input == true) {
#ifdef PADDLE_WITH_MKLDNN
// Var without buffer may be needed
// for some situations like InferShape().
// In this situation we cannot skip Var analysis, as
// MKL-DNN shape of Var may differ from kNHWC Var.
// In such a situation the corresponding resized Var
// has to be created and registered.
if ((tensor_in->layout() == DataLayout::kMKLDNN) &&
(var->IsType<LoDTensor>() == true) &&
(expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) &&
(paddle::platform::MKLDNNDeviceContext::tls()
.get_cur_paddle_data_layout() == DataLayout::kNHWC)) {
VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
"but kNHWC layout"
<< var_name_item.first << " in Operator "
<< op_base->Type();
Scope* local_scope = use_local_scope
? var_scope->GetMutableLocalScope()
: var_scope->GetMutableScope();
auto op = TransferLayout(
var_name, &new_var_name, tensor_in->layout(), DataLayout::kNHWC,
var_scope, local_scope, op_base->Type() == "fetch_v2");
if (op) {
data_transfer_helper.RunAndConstructOpFuncNode(
op, var_name, new_var_name, new_op_func_nodes);
}
is_transferred = true;
} else {
VLOG(7) << "Skip scanning input " << var_name_item.first
<< " in Operator " << op_base->Type();
}
#endif
} else {
continue;
}
} else {
auto kernel_type_for_var =
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
// apply data transform
is_transferred = data_transfer_helper.apply(
kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
new_op_func_nodes, use_local_scope, op_base->Type() == "fetch_v2");
}
auto kernel_type_for_var =
static_cast<const framework::OperatorWithKernel*>(op_base)
->GetKernelTypeForVar(var_name_item.first, *tensor_in,
expected_kernel_key);
// apply data transform
std::string new_var_name;
bool is_transferred = data_transfer_helper.apply(
kernel_type_for_var, expected_kernel_key, var_name, &new_var_name,
new_op_func_nodes, use_local_scope);

if (is_transferred) {
// update RuntimeContext.inputs and original op_func_node inputs
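A recurring pattern in this file: the transfer helpers now derive the intermediate variable's name deterministically from the source name plus the conversion (layout pair, dtype pair, or source/destination place) instead of a running counter, and return nullptr when an initialized variable with that name already exists, so repeated runs reuse the cached result instead of inserting another transfer op. A simplified standalone sketch of that name-keyed caching, with a plain map standing in for the variable scope and made-up layout codes:

```cpp
// Simplified analogue of the name-keyed transfer cache (not Paddle code).
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-in for the scope: variable name -> "tensor is initialized".
using Scope = std::unordered_map<std::string, bool>;

// Returns the cached-or-new transfer var name; sets `reused` accordingly.
std::string GetOrCreateLayoutTransferVar(Scope* scope, const std::string& var,
                                         int in_layout, int out_layout,
                                         bool* reused) {
  std::string name = var + "_layout_" + std::to_string(in_layout) + "_" +
                     std::to_string(out_layout);
  auto it = scope->find(name);
  if (it != scope->end() && it->second) {
    *reused = true;  // same conversion already materialized: skip the op
    return name;
  }
  (*scope)[name] = true;  // in the real code: create the var, run the op
  *reused = false;
  return name;
}

int main() {
  Scope scope;
  bool reused = false;
  GetOrCreateLayoutTransferVar(&scope, "conv_out", /*in=*/3, /*out=*/1, &reused);
  std::cout << std::boolalpha << reused << "\n";  // false: first transfer runs
  GetOrCreateLayoutTransferVar(&scope, "conv_out", 3, 1, &reused);
  std::cout << reused << "\n";  // true: cached variable is reused
  return 0;
}
```

The same HasVar plus IsTensorOfVarInitialized check appears in TransferLayout, TransferDtype, and TransferDevice above, and the nullptr return is what the new `if (op)` guards in DataTranferHelper::apply handle.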
13 changes: 6 additions & 7 deletions paddle/fluid/framework/new_executor/data_transfer.h
@@ -35,7 +35,8 @@ class DataTranferHelper {
bool apply(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_key,
const std::string& var_name, std::string* new_var_name,
std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope);
std::vector<OpFuncNode>* new_op_func_nodes, bool use_local_scope,
bool is_fetch_v2);

void RunAndConstructShareNode(const std::string& src_var_name,
const std::string& dst_var_name,
@@ -94,12 +95,10 @@ inline bool need_layout_transform(const OpKernelType& kernel_type_for_var,
expected_kernel_key.data_layout_);
}

std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
std::string* new_var_name,
DataLayout in_layout,
DataLayout out_layout,
VariableScope* var_scope,
framework::Scope* local_scope);
std::shared_ptr<OperatorBase> TransferLayout(
const std::string& var_name, std::string* new_var_name,
DataLayout in_layout, DataLayout out_layout, VariableScope* var_scope,
framework::Scope* local_scope, bool is_fetch_v2);

std::shared_ptr<OperatorBase> TransferDtype(const std::string& var_name,
std::string* new_var_name,
18 changes: 18 additions & 0 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -22,6 +22,9 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
"Use inplace in new executor");
@@ -55,6 +58,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
block_(block),
global_scope_(global_scope),
stream_analyzer_(place) {
VLOG(4) << "InterpreterCore(): " << this << " on " << place_;
is_build_ = false;
async_work_queue_.reset(new interpreter::AsyncWorkQueue(
kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
@@ -92,6 +96,14 @@ InterpreterCore::~InterpreterCore() {
gc_.reset(nullptr);

async_work_queue_.reset(nullptr);
VLOG(4) << "~InterpreterCore(): " << this;
VLOG(4) << " on" << place_;

#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working
platform::ClearMKLDNNCache(place_, this);
#endif
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
@@ -101,6 +113,9 @@ void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names,
const std::vector<framework::LoDTensor>& feed_tensors) {
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
bool is_build = is_build_;
global_scope_->SetLocalScope(local_scope_);
Prepare(feed_names, feed_tensors, is_build);
@@ -120,6 +135,9 @@ paddle::framework::FetchList InterpreterCore::Run(

paddle::framework::FetchList InterpreterCore::Run(
const std::vector<std::string>& feed_names) {
#ifdef PADDLE_WITH_MKLDNN
platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
if (!is_build_) {
if (create_local_scope_ &&
global_scope_->GetMutableLocalScope() !=
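The interpreter now registers its own pointer with the oneDNN key machinery on every Run (AttachPointerHashToMKLDNNKey) and clears the cache for itself in the destructor (ClearMKLDNNCache); per the comment in the diff, the latter is needed to keep mkl-dnn unit tests working. A simplified, self-contained sketch of an owner-scoped cache with hypothetical names, intended only to illustrate the lifetime idea; the real helpers live in paddle/fluid/platform/mkldnn_helper.h:

```cpp
// Simplified owner-scoped cache (not the real MKLDNN/oneDNN cache).
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
#include <utility>

class OwnerScopedCache {
 public:
  void Put(const void* owner, const std::string& key, int value) {
    cache_[{owner, key}] = value;
  }
  // Analogue of ClearMKLDNNCache(place, this): drop everything the given
  // executor instance contributed, leaving other executors' entries alone.
  void ClearFor(const void* owner) {
    for (auto it = cache_.begin(); it != cache_.end();) {
      it = (it->first.first == owner) ? cache_.erase(it) : ++it;
    }
  }
  std::size_t Size() const { return cache_.size(); }

 private:
  std::map<std::pair<const void*, std::string>, int> cache_;
};

int main() {
  OwnerScopedCache cache;
  int exec_a = 0, exec_b = 0;  // stand-ins for two InterpreterCore instances
  cache.Put(&exec_a, "conv2d_fwd", 1);
  cache.Put(&exec_b, "conv2d_fwd", 2);
  cache.ClearFor(&exec_a);            // what ~InterpreterCore() triggers for itself
  std::cout << cache.Size() << "\n";  // 1: exec_b's entry survives
  return 0;
}
```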
8 changes: 8 additions & 0 deletions paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -21,6 +21,10 @@
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/phi/core/kernel_factory.h"

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

PADDLE_DEFINE_EXPORTED_bool(
new_executor_sequential_run, false,
"Enable sequential execution for standalone executor, used for debug");
Expand Down Expand Up @@ -313,6 +317,10 @@ void build_op_func_list(const platform::Place& place,
operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
main_program, block.ID(), ops_unique);

#ifdef PADDLE_WITH_MKLDNN
platform::RegisterModelLayout(ops_unique, place);
#endif

// its elements will be moved to vec_func_list
std::vector<std::shared_ptr<OperatorBase>> ops;
for (auto& op_unique : ops_unique) {
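build_op_func_list now calls platform::RegisterModelLayout(ops_unique, place) once per program. Assuming, consistently with the get_cur_paddle_data_layout() checks earlier in this diff, that this helper scans the ops once and records the model-wide layout (e.g. kNHWC) in thread-local state so per-tensor code can consult it cheaply, a simplified sketch of that idea with hypothetical names:

```cpp
// Hypothetical sketch: derive a model-wide layout from the ops' data_format
// attributes and cache it once (Paddle keeps such state thread-local).
#include <iostream>
#include <string>
#include <vector>

enum class DataLayout { kNCHW, kNHWC };

struct OpDescLite {
  std::string type;
  std::string data_format;  // empty when the op has no such attribute
};

DataLayout InferModelLayout(const std::vector<OpDescLite>& ops) {
  for (const auto& op : ops) {
    if (op.data_format == "NHWC") return DataLayout::kNHWC;
  }
  return DataLayout::kNCHW;
}

int main() {
  std::vector<OpDescLite> ops = {{"feed", ""}, {"conv2d", "NHWC"}};
  std::cout << (InferModelLayout(ops) == DataLayout::kNHWC) << "\n";  // 1
  return 0;
}
```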
3 changes: 2 additions & 1 deletion paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -112,7 +112,8 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
auto iter = interpretercores_.find(oss.str());

if (iter == interpretercores_.end()) {
VLOG(3) << "create interpreter_core for " << oss.str();
VLOG(3) << "create interpreter_core for " << oss.str() << " on place "
<< place_;
VLOG(3) << "add fetch op: " << add_fetch_op;
std::shared_ptr<InterpreterCore> core = nullptr;
if (add_fetch_op) {
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/standalone_executor.h
@@ -63,7 +63,7 @@ class StandaloneExecutor : public ExecutorBase {
const std::vector<std::string>& feed_names,
const std::vector<std::string>& fetch_names, bool add_fetch_op);

const platform::Place& place_;
platform::Place place_;
const ProgramDesc& startup_prog_;
const ProgramDesc& main_prog_;
VariableScope global_scope_;
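standalone_executor.h switches place_ from a const platform::Place& member to a plain platform::Place. A reference member is only as long-lived as whatever it was bound to, so binding it to a caller-side temporary leaves a dangling reference, while a by-value member owns its copy. A generic C++ illustration of that hazard, not a claim about the exact call site that motivated the change:

```cpp
// Minimal sketch: a const-reference member can dangle, a value member cannot.
#include <iostream>
#include <string>

struct HoldsRef {
  explicit HoldsRef(const std::string& s) : s_(s) {}
  const std::string& s_;  // dangles if constructed from a temporary
};

struct HoldsValue {
  explicit HoldsValue(const std::string& s) : s_(s) {}
  std::string s_;  // owns its copy, independent of the caller's lifetime
};

int main() {
  // HoldsRef bad(std::string("cpu"));  // s_ would dangle after this line
  HoldsValue good(std::string("cpu"));  // safe: the temporary is copied
  std::cout << good.s_ << "\n";
  return 0;
}
```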
1 change: 1 addition & 0 deletions paddle/fluid/operators/controlflow/fetch_op.cc
@@ -33,6 +33,7 @@ static void DataCopy(const framework::LoDTensor &src_item,
framework::Tensor out;
// Convert to desired Paddle layout, apart from grads of filter
// as params are not a subject to paddle's data_format
VLOG(4) << "innerTransDataLayoutFromMKLDNN";
framework::innerTransDataLayoutFromMKLDNN(
src_item.layout(), fetch_var_name == framework::GradVarName("Filter")
? framework::DataLayout::kNCHW
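The single added line here is a VLOG, but the surrounding DataCopy logic is what the earlier TransferLayout hot fix mirrors: fetched MKL-DNN tensors are converted to the desired Paddle layout, except that filter gradients are always converted to kNCHW because parameters do not follow data_format. A small sketch of that selection rule under stated assumptions (the non-Filter branch is taken to be the thread-local paddle data layout, as the kNHWC handling earlier in this diff suggests; the "@GRAD" suffix follows framework::GradVarName):

```cpp
// Sketch of the layout rule the fetch path applies to MKL-DNN tensors
// (illustrative; mirrors the condition visible in DataCopy above).
#include <iostream>
#include <string>

enum class DataLayout { kNCHW, kNHWC, kMKLDNN };

std::string GradVarName(const std::string& name) { return name + "@GRAD"; }

DataLayout ChooseFetchLayout(const std::string& fetch_var_name,
                             DataLayout cur_paddle_data_layout) {
  // Filter gradients are parameter-shaped and ignore data_format, so they
  // are always converted to kNCHW; everything else follows the session layout.
  if (fetch_var_name == GradVarName("Filter")) return DataLayout::kNCHW;
  return cur_paddle_data_layout;
}

int main() {
  std::cout << (ChooseFetchLayout(GradVarName("Filter"), DataLayout::kNHWC) ==
                DataLayout::kNCHW)
            << "\n";  // 1
  std::cout << (ChooseFetchLayout("conv_out", DataLayout::kNHWC) ==
                DataLayout::kNHWC)
            << "\n";  // 1
  return 0;
}
```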