From 4dc5dfb41778baadd122fa46f9aa242fee89e30d Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 00:47:20 +0800 Subject: [PATCH 1/9] fix heap overflow --- .../predict-cpp/image-classification-predict.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/example/image-classification/predict-cpp/image-classification-predict.cc b/example/image-classification/predict-cpp/image-classification-predict.cc index 2a605b8b2674..3c72589399ba 100644 --- a/example/image-classification/predict-cpp/image-classification-predict.cc +++ b/example/image-classification/predict-cpp/image-classification-predict.cc @@ -76,7 +76,9 @@ class BufferFile { ifs.seekg(0, std::ios::beg); std::cout << file_path.c_str() << " ... " << length_ << " bytes\n"; - buffer_.reset(new char[length_]); + // Buffer as null terminated to be converted to string + buffer_.reset(new char[length_ + 1]); + buffer_[length_] = 0; ifs.read(buffer_.get(), length_); ifs.close(); } From 3efe41cebd6d997c957e01baf234ad9f732fb9a2 Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 00:47:55 +0800 Subject: [PATCH 2/9] fix memory leak of optimizer and executer --- cpp-package/example/alexnet.cpp | 1 + cpp-package/example/charRNN.cpp | 10 ++++++++++ cpp-package/example/googlenet.cpp | 1 + cpp-package/example/inception_bn.cpp | 1 + cpp-package/example/lenet.cpp | 1 + cpp-package/example/lenet_with_mxdataiter.cpp | 1 + cpp-package/example/mlp_cpu.cpp | 1 + cpp-package/example/mlp_csv.cpp | 1 + cpp-package/example/mlp_gpu.cpp | 1 + cpp-package/example/resnet.cpp | 1 + cpp-package/example/test_optimizer.cpp | 1 + cpp-package/example/test_score.cpp | 1 + 12 files changed, 21 insertions(+) diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index 1c0f7130d974..e2083a0dfa0a 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -319,6 +319,7 @@ int main(int argc, char const *argv[]) { } /*don't foget to release the executor*/ delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 54b8eea7af39..8951580067a8 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -500,6 +500,9 @@ void train(const std::string file, int batch_size, int max_epoch, int start_epoc std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; SaveCheckpoint(filepath, RNN, exe); } + + delete exe; + delete opt; } /*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on @@ -577,6 +580,9 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch std::string filepath = prefix + "-" + std::to_string(epoch) + ".params"; SaveCheckpoint(filepath, RNN, exe); } + + delete exe; + delete opt; } void predict(std::wstring* ptext, int sequence_length, const std::string param_file, @@ -639,6 +645,8 @@ void predict(std::wstring* ptext, int sequence_length, const std::string param_f next = charIndices[n]; ptext->push_back(next); } + + delete exe; } void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file, @@ -693,6 +701,8 @@ void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std next = charIndices[n]; ptext->push_back(next); } + + delete exe; } int main(int argc, char** argv) { diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index c8078afd0af4..26ba51027db6 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -190,6 +190,7 @@ int main(int argc, char const *argv[]) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 456e0d913475..2073ebe47fbc 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -232,6 +232,7 @@ int main(int argc, char const *argv[]) { LG << "Validation Accuracy: " << val_acc.Get(); } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 83c659c7082b..42594548130a 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -173,6 +173,7 @@ class Lenet { << ", accuracy: " << ValAccuracy(batch_size * 10, lenet); } delete exe; + delete opt; } private: diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 39550a3e9e65..33110fee3a88 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -177,6 +177,7 @@ int main(int argc, char const *argv[]) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 93eaf0538269..5d46d40e421f 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -139,6 +139,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp index 43a14c84e6d8..f12b7c17133d 100644 --- a/cpp-package/example/mlp_csv.cpp +++ b/cpp-package/example/mlp_csv.cpp @@ -267,6 +267,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index 0befde8ae4bc..f6060209a51e 100644 --- a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -155,6 +155,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 7c9dd4daa55a..7200bd42d2de 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -242,6 +242,7 @@ int main(int argc, char const *argv[]) { LG << "Validation Accuracy: " << val_acc.Get(); } delete exec; + delete opt; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/test_optimizer.cpp b/cpp-package/example/test_optimizer.cpp index 42547ea6c7c9..70190eff5dc6 100644 --- a/cpp-package/example/test_optimizer.cpp +++ b/cpp-package/example/test_optimizer.cpp @@ -30,6 +30,7 @@ int main(int argc, char** argv) { opt = OptimizerRegistry::Find("adam"); int ret = (opt == 0) ? 1 : 0; + delete opt; MXNotifyShutdown(); return ret; } diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 7e5096abb816..687683f487f8 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -156,6 +156,7 @@ int main(int argc, char** argv) { } delete exec; + delete opt; MXNotifyShutdown(); return score >= MIN_SCORE ? 0 : 1; } From 303196e41d3224c8c922466b5fdc97ce2145313e Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 00:48:52 +0800 Subject: [PATCH 3/9] uncomment memory pool free --- src/common/object_pool.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/common/object_pool.h b/src/common/object_pool.h index 576ff9aea1cc..362553313b85 100644 --- a/src/common/object_pool.h +++ b/src/common/object_pool.h @@ -132,10 +132,9 @@ struct ObjectPoolAllocatable { template ObjectPool::~ObjectPool() { - // TODO(hotpxl): mind destruction order - // for (auto i : allocated_) { - // free(i); - // } + for (auto i : allocated_) { + free(i); + } } template From 8a110f3785b2fa4402f452b2d36ae81581aac5ea Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 00:49:20 +0800 Subject: [PATCH 4/9] run cleanup in engine shutdown phase --- src/engine/threaded_engine.cc | 2 +- src/engine/threaded_engine.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index 6a6004011db1..b5897a1ca9cd 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -281,7 +281,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) { this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) { ThreadedOpr::Delete(threaded_opr); on_complete(); - }, Context::CPU(), {}, deps, FnProperty::kAsync, 0, + }, Context::CPU(), {}, deps, FnProperty::kDeleteVar, 0, "DeleteOperator"); } diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index 4a2d4196dcd3..ab06ca1b9b47 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -352,7 +352,8 @@ class ThreadedEngine : public Engine { LOG(INFO) << "ExecuteOprBlock " << opr_block << "shutdown_phase=" << shutdown_phase_; } - if (!shutdown_phase_) { + // still run cleanup in shutdown_phase + if (!shutdown_phase_ || threaded_opr->prop == FnProperty::kDeleteVar) { try { OnStart(threaded_opr); if (debug_info) { From 9d201332552e359c3a876dd6189b22f122d462d7 Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 00:50:42 +0800 Subject: [PATCH 5/9] make asan tests blocking --- ci/docker/runtime_functions.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 1b4caa073570..de1b7795ce69 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1033,8 +1033,6 @@ integrationtest_ubuntu_cpu_asan() { set -ex export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5 - # We do not want to fail the build on ASAN errors until memory leaks have been addressed. - export ASAN_OPTIONS=exitcode=0 cd /work/mxnet/build/cpp-package/example/ /work/mxnet/cpp-package/example/get_data.sh ./mlp_cpu From 8f2c5cf52d786ea4b6d9a2d052694aa64f61eb24 Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Wed, 27 Feb 2019 01:19:50 +0800 Subject: [PATCH 6/9] fix abort in mxnet shutdown, use forked submodules temporally for tests --- .gitmodules | 4 ++-- 3rdparty/dmlc-core | 2 +- 3rdparty/mshadow | 2 +- include/mxnet/ndarray.h | 20 +++++++++++++------- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.gitmodules b/.gitmodules index 836d824a6f5a..b9667ca58c9c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,9 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = /~https://github.com/dmlc/mshadow.git + url = /~https://github.com/arcadiaphy/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core - url = /~https://github.com/dmlc/dmlc-core.git + url = /~https://github.com/arcadiaphy/dmlc-core.git [submodule "3rdparty/ps-lite"] path = 3rdparty/ps-lite url = /~https://github.com/dmlc/ps-lite diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 0a0e8addf92e..5834aafec131 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee +Subproject commit 5834aafec1319d2d327279d7e8e4b29f4e1ecd55 diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 3dc80815d965..f289d9518da4 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe +Subproject commit f289d9518da4d81b230def1b06cfec36be6e7d61 diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 5de42e19a657..260a72f8bb81 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -848,14 +848,17 @@ class NDArray { // The shape of aux data. The default value for the shape depends on the type of storage. // If aux_shapes[i].Size() is zero, aux data i is empty. std::vector aux_shapes; + /*! \brief Reference to the storage to ensure proper destruct order */ + std::shared_ptr storage_ref_; /*! \brief default cosntructor */ - Chunk() : static_data(true), delay_alloc(false) {} + Chunk() : static_data(true), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) {} /*! \brief construct a new chunk */ Chunk(TShape shape, Context ctx_, bool delay_alloc_, int dtype) - : static_data(false), delay_alloc(true), ctx(ctx_) { - auto size = shape.Size(); + : static_data(false), delay_alloc(true), ctx(ctx_), + storage_ref_(Storage::_GetSharedRef()) { auto size = shape.Size(); storage_shape = shape; var = Engine::Get()->NewVariable(); shandle.size = size * mshadow::mshadow_sizeof(dtype); @@ -864,7 +867,8 @@ class NDArray { } Chunk(const TBlob &data, int dev_id) - : static_data(true), delay_alloc(false) { + : static_data(true), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) { CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { @@ -881,7 +885,8 @@ class NDArray { } Chunk(int shared_pid, int shared_id, const TShape& shape, int dtype) - : static_data(false), delay_alloc(false) { + : static_data(false), delay_alloc(false), + storage_ref_(Storage::_GetSharedRef()) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); @@ -897,7 +902,7 @@ class NDArray { const std::vector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), - aux_shapes(aux_shapes_) { + aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) { shandle.ctx = ctx; var = Engine::Get()->NewVariable(); // aux_handles always reflect the correct number of aux data @@ -914,7 +919,8 @@ class NDArray { Chunk(const NDArrayStorageType storage_type_, const TBlob &data, const std::vector &aux_data, int dev_id) - : static_data(true), delay_alloc(false), storage_type(storage_type_) { + : static_data(true), delay_alloc(false), storage_type(storage_type_), + storage_ref_(Storage::_GetSharedRef()) { using namespace mshadow; CHECK_NE(storage_type, kDefaultStorage); // init var From 0fe383bb083ec2033b2e7a28550863ad42b7372f Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Thu, 28 Feb 2019 10:48:59 +0800 Subject: [PATCH 7/9] trigger CI From 442221ac1644b38a17dea4bdeb81d2703c032123 Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Fri, 1 Mar 2019 16:03:34 +0800 Subject: [PATCH 8/9] change submodule mshadow --- .gitmodules | 2 +- 3rdparty/mshadow | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index b9667ca58c9c..9e88c866ff83 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "3rdparty/mshadow"] path = 3rdparty/mshadow - url = /~https://github.com/arcadiaphy/mshadow.git + url = /~https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core url = /~https://github.com/arcadiaphy/dmlc-core.git diff --git a/3rdparty/mshadow b/3rdparty/mshadow index f289d9518da4..95ebe0f109ae 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit f289d9518da4d81b230def1b06cfec36be6e7d61 +Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55 From fa5a703cd9ac9c4c0c1b177909d0e7ec33e5c8eb Mon Sep 17 00:00:00 2001 From: Wang Jiajun Date: Sat, 2 Mar 2019 11:40:59 +0800 Subject: [PATCH 9/9] change submodule dmlc-core --- .gitmodules | 2 +- 3rdparty/dmlc-core | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index 9e88c866ff83..836d824a6f5a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,7 +3,7 @@ url = /~https://github.com/dmlc/mshadow.git [submodule "3rdparty/dmlc-core"] path = 3rdparty/dmlc-core - url = /~https://github.com/arcadiaphy/dmlc-core.git + url = /~https://github.com/dmlc/dmlc-core.git [submodule "3rdparty/ps-lite"] path = 3rdparty/ps-lite url = /~https://github.com/dmlc/ps-lite diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 5834aafec131..55f3c7bc1d87 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 5834aafec1319d2d327279d7e8e4b29f4e1ecd55 +Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355