From d50999a75cad8a1e676d64572643bef37c58063f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 28 Mar 2019 05:50:35 -0700 Subject: [PATCH 1/4] Do not touch GPU 0 during ReleaseAll --- src/storage/pooled_storage_manager.h | 12 ++++++++++-- src/storage/storage.cc | 4 ++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index 4c8ae4eb12dd..7dee32720e6f 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -54,10 +54,11 @@ class GPUPooledStorageManager final : public StorageManager { /*! * \brief Default constructor. */ - GPUPooledStorageManager() { + GPUPooledStorageManager(int dev_id) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); large_alloc_round_size_ = dmlc::GetEnv("MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE", 2 * 1024 * 1024); + dev_id_ = dev_id; if (large_alloc_round_size_ <= 0) { LOG(FATAL) << "MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE cannot be set to a value <= 0, found: " << large_alloc_round_size_; @@ -123,6 +124,8 @@ class GPUPooledStorageManager final : public StorageManager { int reserve_; // number of devices const size_t NDEV = 32; + // device id + int dev_id_; // memory pool std::unordered_map> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager); @@ -177,6 +180,7 @@ void GPUPooledStorageManager::ReleaseAll() { Storage::Handle handle; handle.dptr = j; handle.size = i.first; + handle.ctx = Context::GPU(dev_id_); DirectFreeNoLock(handle); } } @@ -202,10 +206,11 @@ class GPUPooledRoundedStorageManager final : public StorageManager { /*! * \brief Default constructor. */ - GPUPooledRoundedStorageManager() { + GPUPooledRoundedStorageManager(int dev_id) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); + dev_id_ = dev_id; if (page_size_ < 32) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ << "Got: " << page_size_ << "."; @@ -290,6 +295,8 @@ class GPUPooledRoundedStorageManager final : public StorageManager { size_t cut_off_; // percentage of reserved memory int reserve_; + // device id + int dev_id_; // memory pool std::vector> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); @@ -345,6 +352,7 @@ void GPUPooledRoundedStorageManager::ReleaseAll() { Storage::Handle handle; handle.size = size; handle.dptr = j; + handle.ctx = Context::GPU(dev_id_); DirectFreeNoLock(handle); } memory_pool_[i].clear(); diff --git a/src/storage/storage.cc b/src/storage/storage.cc index 7484e699d388..ec83252ccafe 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -104,13 +104,13 @@ void StorageImpl::Alloc(Storage::Handle* handle) { std::string strategy = type; if (strategy == "Round") { - ptr = new storage::GPUPooledRoundedStorageManager(); + ptr = new storage::GPUPooledRoundedStorageManager(handle->ctx.real_dev_id()); LOG(INFO) << "Using GPUPooledRoundedStorageManager."; } else { if (strategy != "Naive") { LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; } - ptr = new storage::GPUPooledStorageManager(); + ptr = new storage::GPUPooledStorageManager(handle->ctx.real_dev_id()); } #else LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; From db59968093d2e84083219678e3179563e3367a1a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 28 Mar 2019 08:04:03 -0700 Subject: [PATCH 2/4] Fixing lint and fixes from review --- src/storage/pooled_storage_manager.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index 7dee32720e6f..a2bdb142fa0f 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -53,12 +53,14 @@ class GPUPooledStorageManager final : public StorageManager { public: /*! * \brief Default constructor. + * + * \param initial_context context used by this Storage Manager */ - GPUPooledStorageManager(int dev_id) { + explicit GPUPooledStorageManager(Context initial_context) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); large_alloc_round_size_ = dmlc::GetEnv("MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE", 2 * 1024 * 1024); - dev_id_ = dev_id; + initial_context_ = initial_context; if (large_alloc_round_size_ <= 0) { LOG(FATAL) << "MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE cannot be set to a value <= 0, found: " << large_alloc_round_size_; @@ -124,8 +126,8 @@ class GPUPooledStorageManager final : public StorageManager { int reserve_; // number of devices const size_t NDEV = 32; - // device id - int dev_id_; + // context used by this Storage Manager + Context initial_context_; // memory pool std::unordered_map> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager); @@ -180,7 +182,7 @@ void GPUPooledStorageManager::ReleaseAll() { Storage::Handle handle; handle.dptr = j; handle.size = i.first; - handle.ctx = Context::GPU(dev_id_); + handle.ctx = initial_context_; DirectFreeNoLock(handle); } } @@ -205,12 +207,14 @@ class GPUPooledRoundedStorageManager final : public StorageManager { public: /*! * \brief Default constructor. + * + * \param initial_context context used by this Storage Manager */ - GPUPooledRoundedStorageManager(int dev_id) { + explicit GPUPooledRoundedStorageManager(Context initial_context) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); - dev_id_ = dev_id; + initial_context_ = initial_context; if (page_size_ < 32) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ << "Got: " << page_size_ << "."; @@ -295,8 +299,8 @@ class GPUPooledRoundedStorageManager final : public StorageManager { size_t cut_off_; // percentage of reserved memory int reserve_; - // device id - int dev_id_; + // context used by this Storage Manager + Context initial_context_; // memory pool std::vector> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager); @@ -352,7 +356,7 @@ void GPUPooledRoundedStorageManager::ReleaseAll() { Storage::Handle handle; handle.size = size; handle.dptr = j; - handle.ctx = Context::GPU(dev_id_); + handle.ctx = initial_context_; DirectFreeNoLock(handle); } memory_pool_[i].clear(); From 0de0dde508da8407f99f39408892c3956315c471 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 28 Mar 2019 08:11:10 -0700 Subject: [PATCH 3/4] Fix --- src/storage/storage.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/storage/storage.cc b/src/storage/storage.cc index ec83252ccafe..4f15351a594a 100644 --- a/src/storage/storage.cc +++ b/src/storage/storage.cc @@ -104,13 +104,13 @@ void StorageImpl::Alloc(Storage::Handle* handle) { std::string strategy = type; if (strategy == "Round") { - ptr = new storage::GPUPooledRoundedStorageManager(handle->ctx.real_dev_id()); + ptr = new storage::GPUPooledRoundedStorageManager(handle->ctx); LOG(INFO) << "Using GPUPooledRoundedStorageManager."; } else { if (strategy != "Naive") { LOG(FATAL) << "Unknown memory pool strategy specified: " << strategy << "."; } - ptr = new storage::GPUPooledStorageManager(handle->ctx.real_dev_id()); + ptr = new storage::GPUPooledStorageManager(handle->ctx); } #else LOG(FATAL) << "Compile with USE_CUDA=1 to enable GPU usage"; From d85e7f2ee938c5a454affca8d4d24960fa19f1bb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 29 Mar 2019 01:55:25 -0700 Subject: [PATCH 4/4] Fixes from review --- src/storage/pooled_storage_manager.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h index a2bdb142fa0f..7726bc6f9273 100644 --- a/src/storage/pooled_storage_manager.h +++ b/src/storage/pooled_storage_manager.h @@ -56,11 +56,11 @@ class GPUPooledStorageManager final : public StorageManager { * * \param initial_context context used by this Storage Manager */ - explicit GPUPooledStorageManager(Context initial_context) { + explicit GPUPooledStorageManager(Context initial_context) : + initial_context_(initial_context) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); large_alloc_round_size_ = dmlc::GetEnv("MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE", 2 * 1024 * 1024); - initial_context_ = initial_context; if (large_alloc_round_size_ <= 0) { LOG(FATAL) << "MXNET_GPU_MEM_LARGE_ALLOC_ROUND_SIZE cannot be set to a value <= 0, found: " << large_alloc_round_size_; @@ -127,7 +127,7 @@ class GPUPooledStorageManager final : public StorageManager { // number of devices const size_t NDEV = 32; // context used by this Storage Manager - Context initial_context_; + const Context initial_context_; // memory pool std::unordered_map> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledStorageManager); @@ -210,11 +210,11 @@ class GPUPooledRoundedStorageManager final : public StorageManager { * * \param initial_context context used by this Storage Manager */ - explicit GPUPooledRoundedStorageManager(Context initial_context) { + explicit GPUPooledRoundedStorageManager(Context initial_context) : + initial_context_(initial_context) { reserve_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_RESERVE", 5); page_size_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_PAGE_SIZE", 4096); cut_off_ = dmlc::GetEnv("MXNET_GPU_MEM_POOL_ROUND_LINEAR_CUTOFF", 24); - initial_context_ = initial_context; if (page_size_ < 32) { LOG(FATAL) << "MXNET_GPU_MEM_POOL_PAGE_SIZE cannot be set to a value smaller than 32. " \ << "Got: " << page_size_ << "."; @@ -300,7 +300,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager { // percentage of reserved memory int reserve_; // context used by this Storage Manager - Context initial_context_; + const Context initial_context_; // memory pool std::vector> memory_pool_; DISALLOW_COPY_AND_ASSIGN(GPUPooledRoundedStorageManager);