From 85aaf570c261986eebb076c7e160254c75f89ebb Mon Sep 17 00:00:00 2001
From: Haibin Lin <linhaibin.eric@gmail.com>
Date: Sat, 25 Feb 2017 11:48:36 -0800
Subject: [PATCH] LSTM Memory Allocator Fix #5035 (#105)

* Imbalance version of shared pool during plan memory

* Fix bug in the case where no shared_pool is given

* Auto-search and update the shared memory pool

* Clean up unused code

* Clean up logging code

* Add unit test for shared storage

* Remove shared pool in PlanMemory. Fix lint warnings

* Fix lint warnings

* Use references instead of pointers
---
 src/pass/plan_memory.cc    | 146 ++++++++++++++++++++++++-------------
 tests/python/test_graph.py |   1 -
 2 files changed, 94 insertions(+), 53 deletions(-)
diff --git a/src/pass/plan_memory.cc b/src/pass/plan_memory.cc
index 1666da4bb693..e9029c2268ba 100644
--- a/src/pass/plan_memory.cc
+++ b/src/pass/plan_memory.cc
@@ -19,10 +19,12 @@ class GraphAllocator {
  public:
   // storage id equals integer.
   using StorageID = int;
+
   // bad storage id
   static const StorageID kBadStorageID = -1;
   // external storage id
   static const StorageID kExternalStorageID = -2;
+
   // request a free storage
   StorageID Request(int dev_id, int dtype, TShape shape, uint32_t node_id) {
     if (shape.ndim() == 0) return kBadStorageID;
@@ -54,7 +56,7 @@ class GraphAllocator {
           node_color_[e->released_by_node] != node_color_[node_id]) continue;
       // Use exect matching strategy
       e->max_bytes = std::max(size, e->max_bytes);
-      // find a exact match, erase from map and return
+      // erase from map and return
       free_.erase(it);
       return e->id;
     }
@@ -69,6 +71,7 @@ class GraphAllocator {
     e->released_by_node = node_id;
     free_.insert({e->max_bytes, e});
   }
+
   // totoal number of bytes allocated
   size_t TotalAllocBytes() const {
     size_t total = 0;
@@ -79,14 +82,13 @@ class GraphAllocator {
   }
 
   // constructor
-  explicit GraphAllocator(const IndexedGraph* idx) : idx_(idx) {
-    this->Init(dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16),
-               dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
+  explicit GraphAllocator(const IndexedGraph* idx, const size_t match_range) : idx_(idx) {
+    this->Init(match_range, dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1));
   }
 
  private:
   // initialize the graph allocator
-  void Init(size_t match_range, uint32_t num_match_color) {
+  void Init(const size_t match_range, const uint32_t num_match_color) {
     match_range_ = match_range;
     num_match_color_ = num_match_color;
     if (num_match_color_ > 1) {
@@ -136,43 +138,17 @@ class GraphAllocator {
   const IndexedGraph* idx_;
 };
 
-// function to plan memory
-Graph PlanMemory(Graph ret) {
-  // setup ref counter
-  const IndexedGraph& idx = ret.indexed_graph();
+/*
+ * Internal method to perform the memory allocation for a graph
+ * */
+size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, StorageVector* storage_ptr,
+                   std::vector<int>* storage_inplace_index_ptr, std::vector<uint32_t> ref_count,
+                   GraphAllocator* allocator) {
+  // Get reference
+  auto &storage = *storage_ptr;
+  auto &storage_inplace_index = *storage_inplace_index_ptr;
 
-  static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
-  // reference counter of each node
-  std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
-  // step 1: initialize reference count
-  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
-    const auto& inode = idx[nid];
-    if (inode.source->is_variable()) continue;
-    for (const auto& e : inode.inputs) {
-      ++ref_count[idx.entry_id(e)];
-    }
-    // no dataflow dependency is needed for those are ignored.
-    // revoke the dependency counter.
-    if (fignore_inputs.count(inode.source->op()) != 0) {
-      auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
-      for (uint32_t i : ignore_inputs) {
-        --ref_count[idx.entry_id(inode.inputs[i])];
-      }
-    }
-  }
-  for (const auto& e : idx.outputs()) {
-    ++ref_count[idx.entry_id(e)];
-  }
-  // step 2: allocate memory.
-  StorageVector storage;
-
-  if (ret.attrs.count("storage") != 0) {
-    storage = ret.MoveCopyAttr<StorageVector>("storage");
-  } else {
-    storage.resize(idx.num_node_entries(), -1);
-  }
-
-  std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);
+  // Get attributes from the graph
   const ShapeVector& shape_vec = ret.GetAttr<ShapeVector>("shape");
   const DTypeVector& dtype_vec = ret.GetAttr<DTypeVector>("dtype");
   const DeviceVector* device_vec = nullptr;
@@ -181,9 +157,6 @@ Graph PlanMemory(Graph ret) {
   if (ret.attrs.count("device") != 0) {
     device_vec = &(ret.GetAttr<DeviceVector>("device"));
   }
-  // the allocator.
-  GraphAllocator allocator(&idx);
-  // number of entries that are not statically allocated.
   size_t num_not_allocated = 0;
 
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
@@ -210,15 +183,24 @@ Graph PlanMemory(Graph ret) {
     }
     // normal allocation
     const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0;
-    // allocate output
+    // sort output entries by size so the largest are allocated first
+    std::multimap<size_t, uint32_t> eids;
     for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
       uint32_t eid = idx.entry_id(nid, index);
       if (storage[eid] == GraphAllocator::kBadStorageID) {
-        storage[eid] = allocator.Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
+        auto &eshape = shape_vec[eid];
+        size_t esize = 0;
+        if (eshape.ndim() != 0) esize = eshape.Size();
+        eids.insert(std::make_pair(esize, eid));
       }
     }
+    for (auto rit = eids.rbegin(); rit != eids.rend(); ++rit) {
+        uint32_t eid = rit->second;
+        storage[eid] = allocator->Request(dev_id, dtype_vec[eid], shape_vec[eid], nid);
+    }
 
     // check if certain inputs is ignored.
+    static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
     std::vector<uint32_t> ignore_inputs;
     if (fignore_inputs.count(inode.source->op()) != 0) {
       ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
@@ -235,7 +217,7 @@ Graph PlanMemory(Graph ret) {
       // if we decrease it to zero, means we are ready to relase
       --ref_count[eid];
       if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
-        allocator.Release(storage[eid], nid);
+        allocator->Release(storage[eid], nid);
       }
     }
     // check if there are outputs that can be freeded immediately
@@ -243,7 +225,7 @@ Graph PlanMemory(Graph ret) {
     for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
       uint32_t eid = idx.entry_id(nid, index);
       if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) {
-        allocator.Release(storage[eid], nid);
+        allocator->Release(storage[eid], nid);
         // use -2 to indicate that the node was never touched.
         storage_inplace_index[eid] = -2;
       }
@@ -252,10 +234,70 @@ Graph PlanMemory(Graph ret) {
       }
     }
   }
-  ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage));
-  ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
-  ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(allocator.TotalAllocBytes());
-  ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(num_not_allocated);
+  return num_not_allocated;
+}
+
+
+// function to plan memory
+Graph PlanMemory(Graph ret) {
+  // setup ref counter
+  const IndexedGraph& idx = ret.indexed_graph();
+  static auto& fignore_inputs = Op::GetAttr<FIgnoreInputs>("FIgnoreInputs");
+  // reference counter of each node
+  std::vector<uint32_t> ref_count(idx.num_node_entries(), 0);
+  // step 1: initialize reference count
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;
+    for (const auto& e : inode.inputs) {
+      ++ref_count[idx.entry_id(e)];
+    }
+    // no dataflow dependency is needed for those are ignored.
+    // revoke the dependency counter.
+    if (fignore_inputs.count(inode.source->op()) != 0) {
+      auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs);
+      for (uint32_t i : ignore_inputs) {
+        --ref_count[idx.entry_id(inode.inputs[i])];
+      }
+    }
+  }
+  for (const auto& e : idx.outputs()) {
+    ++ref_count[idx.entry_id(e)];
+  }
+  // step 2: allocate memory.
+  StorageVector storage;
+  if (ret.attrs.count("storage") != 0) {
+    storage = ret.MoveCopyAttr<StorageVector>("storage");
+  } else {
+    storage.resize(idx.num_node_entries(), -1);
+  }
+
+  // Search for the best NNVM_EXEC_MATCH_RANGE value. The search is off by default
+  size_t min_allocated_bytes = -1;
+  size_t max_match_range = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16);
+  size_t min_match_range =
+         dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false) ? 1 : max_match_range;
+  for (size_t match_range = min_match_range; match_range <= max_match_range; match_range *= 2) {
+    // Make a copy of related fields
+    StorageVector storage_vec(storage);
+    std::vector<int> storage_inplace_index(idx.num_node_entries(), -1);
+
+    // the allocator
+    GraphAllocator allocator(&idx, match_range);
+
+    // number of entries that are not statically allocated.
+    size_t storage_num_not_allocated =
+      AllocMemory(ret, idx, &storage_vec, &storage_inplace_index, ref_count, &allocator);
+    size_t storage_allocated_bytes = allocator.TotalAllocBytes();
+    // Choose the plan which leads to minimal memory usage
+    if (min_allocated_bytes > storage_allocated_bytes) {
+      ret.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
+      ret.attrs["storage_inplace_index"] = std::make_shared<any>(std::move(storage_inplace_index));
+      ret.attrs["storage_allocated_bytes"] = std::make_shared<any>(storage_allocated_bytes);
+      ret.attrs["storage_num_not_allocated"] = std::make_shared<any>(storage_num_not_allocated);
+      min_allocated_bytes = storage_allocated_bytes;
+    }
+  }
   return ret;
 }
 
diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py
index 702f37bd38e0..86595aec3219 100644
--- a/tests/python/test_graph.py
+++ b/tests/python/test_graph.py
@@ -88,7 +88,6 @@ def test_infer_shape_known_partial():
     assert g.json_attr('shape')[jnode_row_ptr[nindex["reshape1"]]] == [2, 4]
     assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [4, 2]
 
-
 def test_infer_type():
     x = sym.Variable('x', dtype=0)
     y = sym.add(x, x, name='add1')