Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Bulked op segments to allow Variable nodes #14200

Merged
merged 15 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,13 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
- If set to `1`, during training MXNet executes the computation graph as several subgraphs in bulk mode.
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN
- Values: Int ```(default=15)```
- The maximum number of nodes in the subgraph executed in bulk during training(not inference). Setting this to a larger number may reduce the degree of parallelism for multi-GPU training.
- The maximum number of nodes in the subgraph executed in bulk during training (not inference). Setting this to a larger number may reduce the degree of parallelism for multi-GPU training.
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD
- Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
- The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the forward pass.
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
- Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
- The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.

## Control the Data Communication

Expand Down
3 changes: 2 additions & 1 deletion include/mxnet/imperative.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ class Imperative {
/*! \brief make constructor protected. */
Imperative() {
if (dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_TRAIN", 1)) {
backward_bulk_size_ = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
backward_bulk_size_ = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15));
}
}
/*! \brief find the input/output ndarrays that are needed for backward */
Expand Down
75 changes: 35 additions & 40 deletions src/executor/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1211,63 +1211,58 @@ void GraphExecutor::InitOpSegs() {


void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) {
// The maximum number of node in a segment executed in bulk
size_t num_nodes_threshold = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
// The maximum number of nodes in a segment executed in bulk (excluding variables).
size_t segment_num_nodes_threshold =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
// The maximum number of nodes in a segment executed in bulk (excluding variables) in fwd pass.
size_t segment_num_nodes_threshold_fwd =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD", segment_num_nodes_threshold);
// The maximum number of nodes in a segment executed in bulk (excluding variables) in bwd pass.
size_t segment_num_nodes_threshold_bwd =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD", segment_num_nodes_threshold);

// create forward segments for training
size_t topo_start = 0;
size_t segment_node_count = 0;
for (size_t nid = 0; nid < num_forward_nodes_; nid++) {
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
auto &node = graph_.indexed_graph()[nid].source;
auto &op_node = op_nodes_[nid];
// check if the segment relies on external input, or exceeds maxinum number of node,
// or requires async ops
if (node->is_variable() || nid - topo_start > num_nodes_threshold ||
op_node.exec->exec_type() != ExecType::kSync) {
// create a new segment for the previous nodes if the current one cannot be bulked
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
// Variables, such as learned weights, are ignored in the segment_node_count
bool ignore_node = node->is_variable();
if (!ignore_node)
segment_node_count++;
bool can_bulk = ignore_node || op_node.exec->exec_type() == ExecType::kSync;
// check if we need to create the segment based on properties of this node
if (!can_bulk || nid == num_forward_nodes_ - 1 ||
segment_node_count >= segment_num_nodes_threshold_fwd) {
// Create a new segment for the previous nodes- include also this node if it's bulkable
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, can_bulk ? nid + 1 : nid);
topo_start = nid + 1;
segment_node_count = 0;
}
}
// the last segment
if (topo_start != num_forward_nodes_) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, num_forward_nodes_);
}

// create backward segments for training
// get all gradient variables
std::unordered_set<engine::VarHandle> grad_vars;
for (auto &kv : grad_store_) {
grad_vars.insert(kv.second.var());
}
auto &idx = graph_.indexed_graph();
topo_start = num_forward_nodes_;
segment_node_count = 0;
for (size_t nid = num_forward_nodes_; nid < total_num_nodes; nid++) {
auto &node = graph_.indexed_graph()[nid].source;
auto &op_node = op_nodes_[nid];
if (op_node.skip_exec_node || op_node.exec == nullptr) {
continue;
}
if (idx[nid].source->is_variable() || nid - topo_start > num_nodes_threshold ||
op_node.exec->exec_type() != ExecType::kSync) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
// Variables, such as learned weights, are ignored in the segment_node_count and
// nodes that are not executed for various reasons.
bool ignore_node = node->is_variable() || op_node.skip_exec_node || op_node.exec == nullptr;
if (!ignore_node)
segment_node_count++;
bool can_bulk = ignore_node || op_node.exec->exec_type() == ExecType::kSync;
// check if we need to create the segment based on properties of this node
if (!can_bulk || nid == total_num_nodes - 1 ||
segment_node_count >= segment_num_nodes_threshold_bwd) {
// Create a new segment for the previous nodes- include also this node if it's bulkable
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, can_bulk ? nid + 1 : nid);
topo_start = nid + 1;
} else {
// If it produces output gradient, don't include it in the segment
bool output_gradient = false;
for (auto &out_arr : op_node.exec->out_array) {
if (grad_vars.find(out_arr.var()) != grad_vars.end()) {
output_gradient = true;
}
}
if (output_gradient) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
topo_start = nid + 1;
}
segment_node_count = 0;
}
}
// last segment for backward
if (topo_start < total_num_nodes) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, total_num_nodes);
}
}

void GraphExecutor::BulkInferenceOpSegs() {
Expand Down
6 changes: 4 additions & 2 deletions src/imperative/cached_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
.set_default(2)
.describe("Maximum number of operators that can be inlined.");
DMLC_DECLARE_FIELD(forward_bulk_size)
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that env_vars like MXNET_EXEC_BULK_EXEC_TRAIN/MXNET_EXEC_BULK_EXEC_INFER=0 are not respected by the cached_op. Would you have time to kindly fix it for cached op?
/~https://github.com/apache/incubator-mxnet/blob/54fd288c7a4bf59d37f793c26ef9a98ed40b0c40/src/imperative/cached_op.cc#L593-L596

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please review the latest commits, which address this valid concern about consistency. I also consolidated all op-bulking env var references to a central place and added timing-based tests for the perf impact of bulking. I'm happy with the PR now (assuming it passes CI). Anyone else you want to pull into the review @eric-haibin-lin?

.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15)))
.describe("Segment size of bulk execution during forward pass.");
DMLC_DECLARE_FIELD(backward_bulk_size)
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15)))
.describe("Segment size of bulk execution during backward pass.");
DMLC_DECLARE_FIELD(data_indices)
.set_default(nnvm::Tuple<uint32_t>())
Expand Down