From 85aaf570c261986eebb076c7e160254c75f89ebb Mon Sep 17 00:00:00 2001 From: Haibin Lin Date: Sat, 25 Feb 2017 11:48:36 -0800 Subject: [PATCH] LSTM Memory Allocator Fix #5035 (#105) * Imbalance version of shared pool during plan memory * Bug fix for no shared_pool case * Auto search and updated shared mem pool * Cleanup unused code * Cleanup logging code * Add unit test for shared storage * Remove shared pool in PlanMemory. Fix lint warnings * Fix lint warnings * Use reference instead of ptrs --- src/pass/plan_memory.cc | 146 ++++++++++++++++++++++++------------- tests/python/test_graph.py | 1 - 2 files changed, 94 insertions(+), 53 deletions(-) diff --git a/src/pass/plan_memory.cc b/src/pass/plan_memory.cc index 1666da4bb693..e9029c2268ba 100644 --- a/src/pass/plan_memory.cc +++ b/src/pass/plan_memory.cc @@ -19,10 +19,12 @@ class GraphAllocator { public: // storage id equals integer. using StorageID = int; + // bad storage id static const StorageID kBadStorageID = -1; // external storage id static const StorageID kExternalStorageID = -2; + // request a free storage StorageID Request(int dev_id, int dtype, TShape shape, uint32_t node_id) { if (shape.ndim() == 0) return kBadStorageID; @@ -54,7 +56,7 @@ class GraphAllocator { node_color_[e->released_by_node] != node_color_[node_id]) continue; // Use exect matching strategy e->max_bytes = std::max(size, e->max_bytes); - // find a exact match, erase from map and return + // erase from map and return free_.erase(it); return e->id; } @@ -69,6 +71,7 @@ class GraphAllocator { e->released_by_node = node_id; free_.insert({e->max_bytes, e}); } + // totoal number of bytes allocated size_t TotalAllocBytes() const { size_t total = 0; @@ -79,14 +82,13 @@ class GraphAllocator { } // constructor - explicit GraphAllocator(const IndexedGraph* idx) : idx_(idx) { - this->Init(dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16), - dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1)); + explicit GraphAllocator(const IndexedGraph* idx, const size_t match_range) 
: idx_(idx) { + this->Init(match_range, dmlc::GetEnv("NNVM_EXEC_NUM_TEMP", 1)); } private: // initialize the graph allocator - void Init(size_t match_range, uint32_t num_match_color) { + void Init(const size_t match_range, const uint32_t num_match_color) { match_range_ = match_range; num_match_color_ = num_match_color; if (num_match_color_ > 1) { @@ -136,43 +138,17 @@ class GraphAllocator { const IndexedGraph* idx_; }; -// function to plan memory -Graph PlanMemory(Graph ret) { - // setup ref counter - const IndexedGraph& idx = ret.indexed_graph(); +/* + * Internal method to perform the memory allocation for a graph + * */ +size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, StorageVector* storage_ptr, + std::vector* storage_inplace_index_ptr, std::vector ref_count, + GraphAllocator* allocator) { + // Get reference + auto &storage = *storage_ptr; + auto &storage_inplace_index = *storage_inplace_index_ptr; - static auto& fignore_inputs = Op::GetAttr("FIgnoreInputs"); - // reference counter of each node - std::vector ref_count(idx.num_node_entries(), 0); - // step 1: initialize reference count - for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { - const auto& inode = idx[nid]; - if (inode.source->is_variable()) continue; - for (const auto& e : inode.inputs) { - ++ref_count[idx.entry_id(e)]; - } - // no dataflow dependency is needed for those are ignored. - // revoke the dependency counter. - if (fignore_inputs.count(inode.source->op()) != 0) { - auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs); - for (uint32_t i : ignore_inputs) { - --ref_count[idx.entry_id(inode.inputs[i])]; - } - } - } - for (const auto& e : idx.outputs()) { - ++ref_count[idx.entry_id(e)]; - } - // step 2: allocate memory. 
- StorageVector storage; - - if (ret.attrs.count("storage") != 0) { - storage = ret.MoveCopyAttr("storage"); - } else { - storage.resize(idx.num_node_entries(), -1); - } - - std::vector storage_inplace_index(idx.num_node_entries(), -1); + // Get attributes from the graph const ShapeVector& shape_vec = ret.GetAttr("shape"); const DTypeVector& dtype_vec = ret.GetAttr("dtype"); const DeviceVector* device_vec = nullptr; @@ -181,9 +157,6 @@ Graph PlanMemory(Graph ret) { if (ret.attrs.count("device") != 0) { device_vec = &(ret.GetAttr("device")); } - // the allocator. - GraphAllocator allocator(&idx); - // number of entries that are not statically allocated. size_t num_not_allocated = 0; for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { @@ -210,15 +183,24 @@ Graph PlanMemory(Graph ret) { } // normal allocation const int dev_id = (device_vec != nullptr) ? device_vec->at(nid) : 0; - // allocate output + // sort output nodes based on size before allocating output + std::multimap eids; for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { uint32_t eid = idx.entry_id(nid, index); if (storage[eid] == GraphAllocator::kBadStorageID) { - storage[eid] = allocator.Request(dev_id, dtype_vec[eid], shape_vec[eid], nid); + auto &eshape = shape_vec[eid]; + size_t esize = 0; + if (eshape.ndim() != 0) esize = eshape.Size(); + eids.insert(std::make_pair(esize, eid)); } } + for (auto rit = eids.rbegin(); rit != eids.rend(); ++rit) { + uint32_t eid = rit->second; + storage[eid] = allocator->Request(dev_id, dtype_vec[eid], shape_vec[eid], nid); + } // check if certain inputs is ignored. 
+ static auto& fignore_inputs = Op::GetAttr("FIgnoreInputs"); std::vector ignore_inputs; if (fignore_inputs.count(inode.source->op()) != 0) { ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs); @@ -235,7 +217,7 @@ Graph PlanMemory(Graph ret) { // if we decrease it to zero, means we are ready to relase --ref_count[eid]; if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) { - allocator.Release(storage[eid], nid); + allocator->Release(storage[eid], nid); } } // check if there are outputs that can be freeded immediately @@ -243,7 +225,7 @@ Graph PlanMemory(Graph ret) { for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { uint32_t eid = idx.entry_id(nid, index); if (ref_count[eid] == 0 && storage[eid] != GraphAllocator::kBadStorageID) { - allocator.Release(storage[eid], nid); + allocator->Release(storage[eid], nid); // use -2 to indicate that the node was never touched. storage_inplace_index[eid] = -2; } @@ -252,10 +234,70 @@ Graph PlanMemory(Graph ret) { } } } - ret.attrs["storage_id"] = std::make_shared(std::move(storage)); - ret.attrs["storage_inplace_index"] = std::make_shared(std::move(storage_inplace_index)); - ret.attrs["storage_allocated_bytes"] = std::make_shared(allocator.TotalAllocBytes()); - ret.attrs["storage_num_not_allocated"] = std::make_shared(num_not_allocated); + return num_not_allocated; +} + + +// function to plan memory +Graph PlanMemory(Graph ret) { + // setup ref counter + const IndexedGraph& idx = ret.indexed_graph(); + static auto& fignore_inputs = Op::GetAttr("FIgnoreInputs"); + // reference counter of each node + std::vector ref_count(idx.num_node_entries(), 0); + // step 1: initialize reference count + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + for (const auto& e : inode.inputs) { + ++ref_count[idx.entry_id(e)]; + } + // no dataflow dependency is needed for those are ignored. 
+ // revoke the dependency counter. + if (fignore_inputs.count(inode.source->op()) != 0) { + auto ignore_inputs = fignore_inputs[inode.source->op()](inode.source->attrs); + for (uint32_t i : ignore_inputs) { + --ref_count[idx.entry_id(inode.inputs[i])]; + } + } + } + for (const auto& e : idx.outputs()) { + ++ref_count[idx.entry_id(e)]; + } + // step 2: allocate memory. + StorageVector storage; + if (ret.attrs.count("storage") != 0) { + storage = ret.MoveCopyAttr("storage"); + } else { + storage.resize(idx.num_node_entries(), -1); + } + + // Search for the best NNVM_EXEC_MATCH_RANGE value. This search is turned off by default + size_t min_allocated_bytes = -1; + size_t max_match_range = dmlc::GetEnv("NNVM_EXEC_MATCH_RANGE", 16); + size_t min_match_range = + dmlc::GetEnv("NNVM_AUTO_SEARCH_MATCH_RANGE", false) ? 1 : max_match_range; + for (size_t match_range = min_match_range; match_range <= max_match_range; match_range *= 2) { + // Make a copy of related fields + StorageVector storage_vec(storage); + std::vector storage_inplace_index(idx.num_node_entries(), -1); + + // the allocator + GraphAllocator allocator(&idx, match_range); + + // number of entries that are not statically allocated. 
+ size_t storage_num_not_allocated = + AllocMemory(ret, idx, &storage_vec, &storage_inplace_index, ref_count, &allocator); + size_t storage_allocated_bytes = allocator.TotalAllocBytes(); + // Choose the plan which leads to minimal memory usage + if (min_allocated_bytes > storage_allocated_bytes) { + ret.attrs["storage_id"] = std::make_shared(std::move(storage_vec)); + ret.attrs["storage_inplace_index"] = std::make_shared(std::move(storage_inplace_index)); + ret.attrs["storage_allocated_bytes"] = std::make_shared(storage_allocated_bytes); + ret.attrs["storage_num_not_allocated"] = std::make_shared(storage_num_not_allocated); + min_allocated_bytes = storage_allocated_bytes; + } + } return ret; } diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py index 702f37bd38e0..86595aec3219 100644 --- a/tests/python/test_graph.py +++ b/tests/python/test_graph.py @@ -88,7 +88,6 @@ def test_infer_shape_known_partial(): assert g.json_attr('shape')[jnode_row_ptr[nindex["reshape1"]]] == [2, 4] assert g.json_attr('shape')[jnode_row_ptr[nindex["add1"]]] == [4, 2] - def test_infer_type(): x = sym.Variable('x', dtype=0) y = sym.add(x, x, name='add1')