Run_program_op add scope cache & reuse #45813

Merged
merged 5 commits into from Sep 9, 2022
173 changes: 119 additions & 54 deletions paddle/fluid/eager/to_static/run_program_op_node.h
@@ -246,6 +246,34 @@ static void BuildScopeByBlock(
}
}

static void GcScope(paddle::framework::Scope *scope) {
std::deque<std::shared_ptr<paddle::memory::Allocation>> *garbages =
new std::deque<std::shared_ptr<paddle::memory::Allocation>>();

for (auto &var : scope->LocalVars()) {
if (var != nullptr) {
if (var->IsType<paddle::framework::LoDTensor>()) {
garbages->emplace_back(var->GetMutable<paddle::framework::LoDTensor>()
->MoveMemoryHolder());
}
if (var->IsType<phi::SelectedRows>()) {
garbages->emplace_back(var->GetMutable<phi::SelectedRows>()
->mutable_value()
->MoveMemoryHolder());
}
if (var->IsType<paddle::framework::LoDTensorArray>()) {
auto *lod_tensor_arr =
var->GetMutable<paddle::framework::LoDTensorArray>();
for (auto &t : *lod_tensor_arr) {
garbages->emplace_back(t.MoveMemoryHolder());
}
lod_tensor_arr->clear();
}
}
}
delete garbages; // free mem
}

} // namespace details

inline void RunProgramAPI(
@@ -274,23 +302,15 @@ inline void RunProgramAPI(
1,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
// Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many
// times and then apply backpropagation all at once, such as Reinforcement
// Learning. Tensor data in multi-step training should be saved into single
// scope separately. Otherwise, the gradients can be miscalculated because
// always using the Tensor data of the last step in forward.
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
VLOG(2) << "The number of sub scopes before forward: "
<< out_scope_vec->front()->kids().size();
paddle::framework::Scope &scope = global_inner_scope->NewScope();

bool use_interpretorcore =
PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore"));

if (use_interpretorcore) {
VLOG(0) << "RunProgramOp use interpretercore to execute program.";

paddle::framework::Scope *global_inner_scope = out_scope_vec->front();

auto input_names = details::GetTensorsName(x);
auto output_names = details::GetTensorsName(out);
auto dout_names = details::GetTensorsName(dout);
@@ -308,12 +328,16 @@ inline void RunProgramAPI(
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) {
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
// Step 1. share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, &scope);
details::ShareTensorsIntoScope(params, &scope);
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
// Step 2. create new interpretercore
auto interpreter_core =
paddle::framework::CreateInterpreterCoreInfoToCache(
*forward_program, place, /*is_grad=*/false, program_id, &scope);
*forward_program,
place,
/*is_grad=*/false,
program_id,
global_inner_scope);
// Step 3. get all eager gc vars
std::set<std::string> skip_eager_delete_vars =
paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
@@ -331,45 +355,70 @@ inline void RunProgramAPI(
interpreter_core->Run({});
}
// Step 5. Get Output
details::ShareTensorsFromScopeWithPartialBlock(
out, *forward_global_block, *backward_global_block, &scope);
details::ShareTensorsFromScopeWithPartialBlock(
dout, *forward_global_block, *backward_global_block, &scope);
details::ShareTensorsFromScopeWithPartialBlock(out,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(dout,
*forward_global_block,
*backward_global_block,
global_inner_scope);
} else {
VLOG(2) << "Get interpretercore cahce by program:" << program_id;
// Step 1. get cache interpretercore
auto &cached_value =
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false);
auto &interpreter_core = cached_value.core_;
// Step 2. update scope for cache interpretercore
details::ShareTensorsIntoScope(x, &scope);
details::ShareTensorsIntoScope(params, &scope);
details::BuildScopeByBlock(
*interpreter_core.get(), *forward_global_block, &scope);
interpreter_core->reset_scope(&scope);
details::ShareTensorsIntoScope(x, global_inner_scope);
details::ShareTensorsIntoScope(params, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(
*interpreter_core.get(), *forward_global_block, global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}
// Step 3. interpretercore run
if (forward_global_block->OpSize() > 0) {
interpreter_core->Run({});
}
// Step 4. Get Output
details::ShareTensorsFromScopeWithPartialBlock(
out, *forward_global_block, *backward_global_block, &scope);
details::ShareTensorsFromScopeWithPartialBlock(
dout, *forward_global_block, *backward_global_block, &scope);
details::ShareTensorsFromScopeWithPartialBlock(out,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(dout,
*forward_global_block,
*backward_global_block,
global_inner_scope);
}
VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());

if (is_test) {
VLOG(1) << "is test, after forward, drop kids";
out_scope_vec->front()->DropKids();
VLOG(4) << "is test, set this scope can reused";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
} else {
VLOG(4) << "not test, set this scope can not reused";
global_inner_scope->SetCanReuesd(false);
}
VLOG(2) << "The number of sub scopes after forward: "
<< out_scope_vec->front()->kids().size();
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
#endif
} else {
VLOG(2) << "RunProgramOp execute with parallel_executor.";

// Step 2. prepare executor and init persistable variables
// NOTE(Aurelius84): While training some models, forward can be called many
// times and then apply backpropagation all at once, such as Reinforcement
// Learning. Tensor data in multi-step training should be saved into single
// scope separately. Otherwise, the gradients can be miscalculated because
// always using the Tensor data of the last step in forward.
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
VLOG(2) << "The number of sub scopes before forward: "
<< out_scope_vec->front()->kids().size();
paddle::framework::Scope &scope = global_inner_scope->NewScope();

// share input_vars & parameters into scope
details::ShareTensorsIntoScope(x, &scope);
details::ShareTensorsIntoScope(params, &scope);
@@ -454,21 +503,14 @@ inline void RunProgramGradAPI(
1,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should only hold one scope."));
paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto sub_scope_num = global_inner_scope->kids().size();
VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
PADDLE_ENFORCE_GT(sub_scope_num,
0,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should hold at "
"least one sub scope."));

auto &scope = *(global_inner_scope->kids().front());
auto place = egr::Controller::Instance().GetExpectedPlace();

if (use_interpretorcore) {
VLOG(0) << "RunProgramGradOp use interpretercore to execute program.";

paddle::framework::Scope *global_inner_scope = out_scope_vec->front();

auto *forward_global_block = PADDLE_GET_CONST(
paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
auto *backward_global_block = PADDLE_GET_CONST(
@@ -490,10 +532,14 @@
paddle::framework::InterpreterCoreInfoCache::Instance();
if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) {
VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
details::ShareTensorsIntoScope(out_grad, &scope);
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
auto interpreter_core =
paddle::framework::CreateInterpreterCoreInfoToCache(
*backward_program, place, /*is_grad=*/true, program_id, &scope);
*backward_program,
place,
/*is_grad=*/true,
program_id,
global_inner_scope);

// get all eager gc vars
std::set<std::string> skip_eager_delete_vars;
@@ -518,10 +564,14 @@
interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true);
auto &interpreter_core = cached_value.core_;
// update scope
details::ShareTensorsIntoScope(out_grad, &scope);
details::BuildScopeByBlock(
*interpreter_core.get(), *backward_global_block, &scope);
interpreter_core->reset_scope(&scope);
details::ShareTensorsIntoScope(out_grad, global_inner_scope);
if (interpreter_core->GetVariableScope()->GetMutableScope() !=
global_inner_scope) {
details::BuildScopeByBlock(*interpreter_core.get(),
*backward_global_block,
global_inner_scope);
interpreter_core->reset_scope(global_inner_scope);
}

if (backward_global_block->OpSize() > 0) {
// Debug info: scope info when run end
@@ -531,16 +581,31 @@
}
}
// Step 4. get outputs
details::ShareTensorsFromScopeWithPartialBlock(
x_grad, *forward_global_block, *backward_global_block, &scope);
details::ShareTensorsFromScopeWithPartialBlock(
params_grad, *forward_global_block, *backward_global_block, &scope);

// Step5. drop current scope
global_inner_scope->DeleteScope(&scope);
VLOG(2) << "The number of sub scopes after backward: "
<< global_inner_scope->kids().size();
details::ShareTensorsFromScopeWithPartialBlock(x_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
details::ShareTensorsFromScopeWithPartialBlock(params_grad,
*forward_global_block,
*backward_global_block,
global_inner_scope);
VLOG(4) << "after backward gc all vars";
global_inner_scope->SetCanReuesd(true);
details::GcScope(global_inner_scope);
} else {
VLOG(2) << "RunProgramGradOp use pe to execute program.";

paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
auto sub_scope_num = global_inner_scope->kids().size();
VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num;
PADDLE_ENFORCE_GT(sub_scope_num,
0,
paddle::platform::errors::InvalidArgument(
"The OutScope of RunProgramGradOp should hold at "
"least one sub scope."));

auto &scope = *(global_inner_scope->kids().front());

auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
attrs.at("global_block"));
auto orig_end_op_index =
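The changes to run_program_op_node.h above replace the per-call child scope (global_inner_scope->NewScope()) with the cached global_inner_scope itself, and add details::GcScope to release tensor buffers once a scope is marked reusable: after forward in test mode, and after backward in training. Below is a minimal standalone sketch of that lifecycle in plain Python; FakeScope, run_program and run_program_grad are hypothetical stand-ins for illustration, not Paddle APIs.

# Minimal sketch (plain Python, hypothetical names) of the scope reuse +
# eager GC lifecycle implemented by the C++ changes above.

class FakeScope:
    """Stand-in for paddle::framework::Scope with the new can_reused flag."""

    def __init__(self):
        self.vars = {}            # variable name -> tensor-like payload
        self.can_reused = False

    def gc(self):
        # Mirrors details::GcScope: move the memory holders out of all local
        # variables so the buffers are freed, while the scope object itself
        # stays alive and can be handed back to the cache.
        garbages = [self.vars.pop(name) for name in list(self.vars)]
        del garbages  # free mem


def run_program(scope, inputs, is_test):
    scope.vars.update(inputs)     # analogous to ShareTensorsIntoScope
    outputs = {k + "@out": bytearray(len(v)) for k, v in inputs.items()}
    if is_test:
        # Forward-only: no backward will read this scope, so GC immediately
        # and mark it reusable.
        scope.can_reused = True
        scope.gc()
    else:
        # Training: backward still needs the forward variables.
        scope.can_reused = False
    return outputs


def run_program_grad(scope, out_grads):
    scope.vars.update(out_grads)
    grads = {k + "@grad": bytearray(len(v)) for k, v in out_grads.items()}
    # After backward nothing else reads the scope: GC and mark reusable.
    scope.can_reused = True
    scope.gc()
    return grads


if __name__ == "__main__":
    cached_scope = FakeScope()    # one scope reused across training steps
    for _ in range(3):
        run_program(cached_scope, {"x": bytearray(8)}, is_test=False)
        run_program_grad(cached_scope, {"x@out": bytearray(8)})
        assert cached_scope.can_reused and not cached_scope.vars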
8 changes: 8 additions & 0 deletions paddle/fluid/framework/scope.h
@@ -132,6 +132,11 @@ class Scope : public ScopeBase {
// Rename variable to a new name and return the new name
std::string Rename(const std::string& origin_name) const;

// only for dygraph_to_static
bool CanReuesd() const { return can_reused_; }

void SetCanReuesd(bool can_reused) { can_reused_ = can_reused; }

protected:
struct KeyHasher {
std::size_t operator()(const std::string& key) const {
@@ -169,6 +174,9 @@
mutable std::list<Scope*> kids_;
const Scope* parent_{nullptr};

// only for dygraph_to_static
bool can_reused_{false};

DISABLE_COPY_AND_ASSIGN(Scope);

#ifndef PADDLE_ON_INFERENCE
3 changes: 2 additions & 1 deletion paddle/fluid/pybind/pybind.cc
@@ -1014,7 +1014,8 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(
Delete all sub-scopes of the current scope.
)DOC")
.def("_kids", &Scope::kids);
.def("_kids", &Scope::kids)
.def_property("_can_reuesd", &Scope::CanReuesd, &Scope::SetCanReuesd);

m.def(
"Scope",
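The new _can_reuesd property bound above simply exposes Scope::CanReuesd / Scope::SetCanReuesd to Python. A small usage sketch, assuming a Paddle build that includes this change:

from paddle.fluid import core

scope = core.Scope()
assert not scope._can_reuesd     # default comes from can_reused_{false}
scope._can_reuesd = True         # set by RunProgram(Grad)API after GcScope
assert scope._can_reuesd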
42 changes: 35 additions & 7 deletions python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -169,6 +169,25 @@ def __init__(self,
custom_white_list=custom_white_list,
custom_black_list=custom_black_list)

# program_id -> list(scope)
self._scope_cache = {}

def _get_scope(self, program_id=None, use_scope_cache=False):
if use_scope_cache:
if program_id not in self._scope_cache:
scope = core.Scope()
self._scope_cache[program_id] = [scope]
return scope
else:
for scope in self._scope_cache[program_id]:
if scope._can_reuesd:
return scope
scope = core.Scope()
self._scope_cache[program_id].append(scope)
return scope
else:
return core.Scope()

@LazyInitialized
def __fake_vars(self):
return _create_fake_var()
@@ -555,11 +574,19 @@ def __call__(self, inputs):
('forward_global_block', self.forward_program.desc.block(0),
'backward_global_block', self.backward_program.desc.block(0)))

_legacy_C_ops.run_program(self._valid_vars(in_vars),
self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(), self._double_grads,
self._cuda_graph_vec, *attrs)
_legacy_C_ops.run_program(
self._valid_vars(in_vars), self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(program_id=self.program_id,
use_scope_cache=True),
self._double_grads, self._cuda_graph_vec, *attrs)
else:
_legacy_C_ops.run_program(self._valid_vars(in_vars),
self._valid_vars(self._params),
self._valid_vars(out_vars),
self._create_scope_vec(),
self._double_grads, self._cuda_graph_vec,
*attrs)
restored_nest_out = self._restore_out(out_vars)
return self._remove_no_value(restored_nest_out)

@@ -735,10 +762,11 @@ def create_out(var_id):

return input_vars, out_vars

def _create_scope_vec(self):
def _create_scope_vec(self, program_id=None, use_scope_cache=False):
# Hold forward variables
tmp_scope_vec = None
inner_scope = core.Scope()
inner_scope = self._get_scope(program_id=program_id,
use_scope_cache=use_scope_cache)
if not framework._in_eager_mode_:
tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
"program_out_scope",
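The _get_scope method added above keys the cache by program_id and hands back the first cached scope whose _can_reuesd flag the C++ side has set, creating and caching a new scope otherwise. A standalone sketch of the same lookup, with a hypothetical FakeScope in place of core.Scope:

class FakeScope:
    """Stand-in for core.Scope; in Paddle the flag is set from C++ after GcScope."""

    def __init__(self):
        self._can_reuesd = False


class ScopeCache:

    def __init__(self):
        # program_id -> list(scope), same shape as PartialProgramLayer._scope_cache
        self._scope_cache = {}

    def get_scope(self, program_id=None, use_scope_cache=False):
        if not use_scope_cache:
            return FakeScope()
        if program_id not in self._scope_cache:
            scope = FakeScope()
            self._scope_cache[program_id] = [scope]
            return scope
        for scope in self._scope_cache[program_id]:
            if scope._can_reuesd:         # reuse a scope that has been GC'd
                return scope
        scope = FakeScope()               # none reusable yet: cache another one
        self._scope_cache[program_id].append(scope)
        return scope


cache = ScopeCache()
s1 = cache.get_scope(program_id=1, use_scope_cache=True)   # fresh scope
s2 = cache.get_scope(program_id=1, use_scope_cache=True)   # s1 not reusable yet
assert s1 is not s2
s1._can_reuesd = True    # in Paddle this happens in C++ after backward + GcScope
s3 = cache.get_scope(program_id=1, use_scope_cache=True)
assert s3 is s1          # the cached scope is reused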