Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Bulked op segments to allow Variable nodes #14200

Merged
merged 15 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion docs/faq/env_var.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,13 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
- If set to `1`, during training MXNet executes the computation graph as several subgraphs in bulk mode.
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN
- Values: Int ```(default=15)```
- The maximum number of nodes in the subgraph executed in bulk during training(not inference). Setting this to a larger number may reduce the degree of parallelism for multi-GPU training.
- The maximum number of nodes in the subgraph executed in bulk during training (not inference). Setting this to a larger number may reduce the degree of parallelism for multi-GPU training.
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD
- Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
- The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the forward pass.
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
- Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
- The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.

## Control the Data Communication

Expand Down
3 changes: 2 additions & 1 deletion include/mxnet/imperative.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ class Imperative {
/*! \brief make constructor protected. */
Imperative() {
if (dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_TRAIN", 1)) {
backward_bulk_size_ = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
backward_bulk_size_ = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15));
}
}
/*! \brief find the input/output ndarrays that are needed for backward */
Expand Down
75 changes: 35 additions & 40 deletions src/executor/graph_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1211,63 +1211,58 @@ void GraphExecutor::InitOpSegs() {


void GraphExecutor::BulkTrainingOpSegs(size_t total_num_nodes) {
// The maximum number of node in a segment executed in bulk
size_t num_nodes_threshold = dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
// The maximum number of nodes in a segment executed in bulk (excluding variables).
size_t segment_num_nodes_threshold =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15);
// The maximum number of nodes in a segment executed in bulk (excluding variables) in fwd pass.
size_t segment_num_nodes_threshold_fwd =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD", segment_num_nodes_threshold);
// The maximum number of nodes in a segment executed in bulk (excluding variables) in bwd pass.
size_t segment_num_nodes_threshold_bwd =
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD", segment_num_nodes_threshold);

// create forward segments for training
size_t topo_start = 0;
size_t segment_node_count = 0;
for (size_t nid = 0; nid < num_forward_nodes_; nid++) {
eric-haibin-lin marked this conversation as resolved.
Show resolved Hide resolved
auto &node = graph_.indexed_graph()[nid].source;
auto &op_node = op_nodes_[nid];
// check if the segment relies on external input, or exceeds maxinum number of node,
// or requires async ops
if (node->is_variable() || nid - topo_start > num_nodes_threshold ||
op_node.exec->exec_type() != ExecType::kSync) {
// create a new segment for the previous nodes if the current one cannot be bulked
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
// Variables, such as learned weights, are ignored in the segment_node_count
bool ignore_node = node->is_variable();
if (!ignore_node)
segment_node_count++;
bool can_bulk = ignore_node || op_node.exec->exec_type() == ExecType::kSync;
// check if we need to create the segment based on properties of this node
if (!can_bulk || nid == num_forward_nodes_ - 1 ||
segment_node_count >= segment_num_nodes_threshold_fwd) {
// Create a new segment for the previous nodes- include also this node if it's bulkable
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, can_bulk ? nid + 1 : nid);
topo_start = nid + 1;
segment_node_count = 0;
}
}
// the last segment
if (topo_start != num_forward_nodes_) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, num_forward_nodes_);
}

// create backward segments for training
// get all gradient variables
std::unordered_set<engine::VarHandle> grad_vars;
for (auto &kv : grad_store_) {
grad_vars.insert(kv.second.var());
}
auto &idx = graph_.indexed_graph();
topo_start = num_forward_nodes_;
segment_node_count = 0;
for (size_t nid = num_forward_nodes_; nid < total_num_nodes; nid++) {
auto &node = graph_.indexed_graph()[nid].source;
auto &op_node = op_nodes_[nid];
if (op_node.skip_exec_node || op_node.exec == nullptr) {
continue;
}
if (idx[nid].source->is_variable() || nid - topo_start > num_nodes_threshold ||
op_node.exec->exec_type() != ExecType::kSync) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
// Variables, such as learned weights, are ignored in the segment_node_count and
// nodes that are not executed for various reasons.
bool ignore_node = node->is_variable() || op_node.skip_exec_node || op_node.exec == nullptr;
if (!ignore_node)
segment_node_count++;
bool can_bulk = ignore_node || op_node.exec->exec_type() == ExecType::kSync;
// check if we need to create the segment based on properties of this node
if (!can_bulk || nid == total_num_nodes - 1 ||
segment_node_count >= segment_num_nodes_threshold_bwd) {
// Create a new segment for the previous nodes- include also this node if it's bulkable
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, can_bulk ? nid + 1 : nid);
topo_start = nid + 1;
} else {
// If it produces output gradient, don't include it in the segment
bool output_gradient = false;
for (auto &out_arr : op_node.exec->out_array) {
if (grad_vars.find(out_arr.var()) != grad_vars.end()) {
output_gradient = true;
}
}
if (output_gradient) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, nid);
topo_start = nid + 1;
}
segment_node_count = 0;
}
}
// last segment for backward
if (topo_start < total_num_nodes) {
cached_seg_opr_[topo_start] = this->CreateCachedSegOpr(topo_start, total_num_nodes);
}
}

void GraphExecutor::BulkInferenceOpSegs() {
Expand Down
6 changes: 4 additions & 2 deletions src/imperative/cached_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,12 @@ struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
.set_default(2)
.describe("Maximum number of operators that can be inlined.");
DMLC_DECLARE_FIELD(forward_bulk_size)
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that env_vars like MXNET_EXEC_BULK_EXEC_TRAIN/MXNET_EXEC_BULK_EXEC_INFER=0 are not respected by the cached_op. Would you have time to kindly fix it for cached op?
/~https://github.com/apache/incubator-mxnet/blob/54fd288c7a4bf59d37f793c26ef9a98ed40b0c40/src/imperative/cached_op.cc#L593-L596

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please review the latest commits, which address this valid concern about consistency. I also consolidated all op-bulking env var references to a central place and added timing-based tests for the perf impact of bulking. I'm happy with the PR now (assuming it passes CI). Anyone else you want to pull into the review @eric-haibin-lin?

.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_FWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15)))
.describe("Segment size of bulk execution during forward pass.");
DMLC_DECLARE_FIELD(backward_bulk_size)
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15))
.set_default(dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD",
dmlc::GetEnv("MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN", 15)))
.describe("Segment size of bulk execution during backward pass.");
DMLC_DECLARE_FIELD(data_indices)
.set_default(nnvm::Tuple<uint32_t>())
Expand Down