Skip to content

Commit

Permalink
Merge pull request #9377 from typhoonzero/prepare_pserver_executor
Browse files Browse the repository at this point in the history
prepare pserver executor
  • Loading branch information
typhoonzero authored Apr 4, 2018
2 parents 66e0aed + 1f6e044 commit fb8c1cf
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 15 deletions.
15 changes: 15 additions & 0 deletions paddle/fluid/framework/executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,21 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
return std::unique_ptr<ExecutorPrepareContext>(ctx);
}

std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids) {
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
for (auto& bid : block_ids) {
auto* ctx = new ExecutorPrepareContext(program, bid);
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
}
return result;
}

void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/framework/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ class Executor {
static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id);

static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids);

void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
Expand Down
42 changes: 27 additions & 15 deletions paddle/fluid/operators/listen_and_serv_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,20 +45,23 @@ static void CreateTensorFromMessageType(framework::Variable *var,
}
}

static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *scope) {
static void ParallelExecuteBlocks(
const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
&prepared,
framework::ProgramDesc *program, framework::Scope *scope) {
std::vector<std::future<void>> fs;
for (size_t idx : parallel_blkids) {
fs.push_back(framework::Async([&executor, &program, &scope, idx]() {
int run_block = idx; // thread local
try {
executor->Run(*program, scope, run_block, false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
fs.push_back(
framework::Async([&executor, &prepared, &program, &scope, idx]() {
int run_block = idx; // thread local
try {
executor->RunPreparedContext(prepared[run_block].get(), scope,
false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
}));
}
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
}
Expand Down Expand Up @@ -101,6 +104,13 @@ class ListenAndServOp : public framework::OperatorBase {
"server program should have at least 2 blocks");

framework::Executor executor(dev_place);
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid)
block_list.push_back(blkid);
auto prepared = executor.Prepare(*program, block_list);
prepared.insert(
prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));

// TODO(qiao) set proper fields for table lookup and update
rpc_service_->SetExecutor(&executor);
Expand Down Expand Up @@ -160,14 +170,15 @@ class ListenAndServOp : public framework::OperatorBase {
for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
if (program->Block(blkid).Parent() != last_parent_blkid) {
for (size_t idx : parallel_blkids) VLOG(3) << idx;
ParallelExecuteBlocks(parallel_blkids, &executor, program,
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
parallel_blkids.clear();
last_parent_blkid = program->Block(blkid).Parent();
}
parallel_blkids.push_back(blkid);
}
ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);

VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
<< "(ms)";
Expand All @@ -181,7 +192,8 @@ class ListenAndServOp : public framework::OperatorBase {
var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
}
rpc_service_->SetCond(1);
// FIXME(typhoonzero): use another condition to sync wait clients get.
// NOTE: does not consider barrier request retry in here, we may use
// global barrier id to resolve this.
rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear();
} // while(true)
Expand Down

0 comments on commit fb8c1cf

Please sign in to comment.