From 5f32f32e8c7ce50f20eae1438bf970eba6830578 Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Sat, 23 Feb 2019 18:56:30 -0800 Subject: [PATCH] Dual stream cudnn Convolution backward() with MXNET_GPU_WORKER_NSTREAMS=2. (#14006) * Dual stream conv backward(). Enable with MXNET_GPU_WORKER_NSTREAMS=2. * Fix for MSVC compiler. * Fix cpplint. * Add MXNET_GPU_WORKER_NSTREAMS env var documentation. * Improve test function and commenting. * Add description of proper aux stream use using events. * RAII rework to simplify usage within operators. * Fix cpplint. * Expand testing to cover all engines. * Fix NaiveEngine shutdown segfault on CentOS7. --- docs/faq/env_var.md | 6 + include/mxnet/base.h | 133 ++++++++++++++++++ include/mxnet/op_attr_types.h | 9 ++ src/engine/naive_engine.cc | 25 +++- src/engine/stream_manager.h | 25 +++- src/engine/threaded_engine_perdevice.cc | 12 +- src/imperative/imperative_utils.h | 6 +- src/ndarray/ndarray.cc | 4 +- src/operator/nn/cudnn/cudnn_convolution-inl.h | 72 +++++++--- tests/python/gpu/test_operator_gpu.py | 50 +++++++ 10 files changed, 307 insertions(+), 35 deletions(-) diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index c35d4e5723a5..f49cb1997691 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -191,6 +191,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca ## Other Environment Variables +* MXNET_GPU_WORKER_NSTREAMS + - Values: 1, or 2 ```(default=1)``` + - Determines the number of GPU streams available to operators for their functions. + - Setting this to 2 may yield a modest performance increase, since ops like the cuDNN convolution op can then calculate their data- and weight-gradients in parallel. + - Setting this to 2 may also increase a model's demand for GPU global memory. + * MXNET_CUDNN_AUTOTUNE_DEFAULT - Values: 0, 1, or 2 ```(default=1)``` - The default value of cudnn auto tuning for convolution layers. diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 26c1a1bd2b29..2ea6ebbdf3d7 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -193,6 +193,11 @@ struct Context { * \return The number of GPUs that are available. */ inline static int32_t GetGPUCount(); + /*! + * Get the number of streams that a GPU Worker has available to operations. + * \return The number of streams that are available. + */ + inline static int32_t GetGPUStreamsPerWorker(); /*! * \brief get the free and total available memory on a GPU * \param dev the GPU number to query @@ -221,6 +226,112 @@ struct Context { inline static Context FromString(const std::string& str); }; +#if MXNET_USE_CUDA +/*! \brief Holds an auxiliary mshadow gpu stream that can be synced with a primary stream. */ +class GPUAuxStream { + public: + /*! + * \brief constructor. + * \param primary_stream gpu stream that is synced with the created auxiliary stream. + */ + explicit GPUAuxStream(mshadow::Stream *primary_stream) : + primary_stream_(primary_stream), + aux_stream_(primary_stream), + gpu_stream_sync_event_(nullptr) { + if (Context::GetGPUStreamsPerWorker() >= 2) { + // Create auxiliary stream on the same device with the same properties as the primary stream + bool primary_has_blas_handle = + primary_stream->blas_handle_ownership_ == mshadow::Stream::OwnHandle; + bool primary_has_dnn_handle = + primary_stream->dnn_handle_ownership_ == mshadow::Stream::OwnHandle; + aux_stream_ = mshadow::NewStream(primary_has_blas_handle, + primary_has_dnn_handle, + primary_stream->dev_id); + MSHADOW_CUDA_CALL(cudaEventCreateWithFlags(&gpu_stream_sync_event_, cudaEventDisableTiming)); + } + } + /*! \brief destructor */ + ~GPUAuxStream() { + // If the aux_stream_ == primary_stream_, then we created no new streams to destroy. + if (aux_stream_ != primary_stream_) { + MSHADOW_CATCH_ERROR(mshadow::DeleteStream(aux_stream_)); + MSHADOW_CATCH_ERROR(cudaEventDestroy(gpu_stream_sync_event_)); + } + } + /*! + * \brief Makes future aux stream work wait on the completion of existing primary stream work. + */ + void PreAuxStreamUseSync() { + // If the aux_stream_ == primary_stream_, then no synchronization is necessary. + if (aux_stream_ != primary_stream_) + StreamSync(primary_stream_, aux_stream_, gpu_stream_sync_event_); + } + /*! + * \brief Makes future primary stream work wait on the completion of existing aux stream work. + */ + void PostAuxStreamUseSync() { + // If the aux_stream_ == primary_stream_, then no synchronization is necessary. + if (aux_stream_ != primary_stream_) + StreamSync(aux_stream_, primary_stream_, gpu_stream_sync_event_); + } + /*! \brief Getter for created auxiliary stream. */ + mshadow::Stream *GetStream() { return aux_stream_; } + /*! + * \brief Make future work enqueued to `s2` wait on completion of current work enqueued to `s1`. + * \param s1 stream with work that must be completed before future s2 work can begin. + * \param s2 stream whose future work is made to wait on the completion of existing s1 work. + * \param event used to pass s1 state to s2. + */ + static void StreamSync(mshadow::Stream *s1, mshadow::Stream *s2, cudaEvent_t event) { + MSHADOW_CUDA_CALL(cudaEventRecord(event, s1->stream_)); + MSHADOW_CUDA_CALL(cudaStreamWaitEvent(s2->stream_, event, 0)); + } + + private: + mshadow::Stream *primary_stream_; + mshadow::Stream *aux_stream_; + cudaEvent_t gpu_stream_sync_event_; +}; + +/*! + * \brief Provides automatic coordination of an auxilary stream with a primary one. + * This object, upon construction, prepares an aux stream for use by syncing it with enqueued + * primary-stream work. Object destruction will sync again so future primary-stream work + * will wait on enqueued aux-stream work. If MXNET_GPU_WORKER_NSTREAMS == 1, then this defaults + * simply: the primary stream will equal the aux stream and the syncs will be executed as nops. + * See ./src/operator/cudnn/cudnn_convolution-inl.h for a usage example. + */ +class SyncedGPUAuxStream { + public: + /*! + * \brief constructor. + * \param gpu_aux_stream auxilary gpu stream that is managed by this RAII object. + */ + explicit SyncedGPUAuxStream(GPUAuxStream *gpu_aux_stream) : gpu_aux_stream_(gpu_aux_stream) { + gpu_aux_stream_->PreAuxStreamUseSync(); + } + /*! \brief destructor */ + ~SyncedGPUAuxStream() { + gpu_aux_stream_->PostAuxStreamUseSync(); + } + /*! \brief copy constructor deleted to prevent unexpected synchronizations. */ + SyncedGPUAuxStream(const SyncedGPUAuxStream&) = delete; + /*! \brief copy assignment operator deleted to prevent unexpected synchronizations. */ + void operator=(const SyncedGPUAuxStream&) = delete; + /*! \brief move constructor permitted as alternative to copying. */ + SyncedGPUAuxStream(SyncedGPUAuxStream&&) = default; + /*! \brief move assignment operator permitted as alternative to copy assignment. */ + SyncedGPUAuxStream& operator=(SyncedGPUAuxStream&&) = default; + /*! \brief Getter for underlying mshadow::Stream. */ + inline mshadow::Stream* GetStream() const { + return gpu_aux_stream_->GetStream(); + } + + private: + GPUAuxStream *gpu_aux_stream_; +}; +#endif // MXNET_USE_CUDA + /*! * \brief execution time context. * The information needed in runtime for actual execution. @@ -232,6 +343,10 @@ struct RunContext { * \brief the stream of the device, can be NULL or Stream* in GPU mode */ void *stream; + /*! + * \brief the auxiliary stream of the device, can be NULL or Stream* in GPU mode + */ + void *aux_stream; /*! * \brief indicator of whether this execution is run in bulk mode */ @@ -245,6 +360,15 @@ struct RunContext { inline mshadow::Stream* get_stream() const { return static_cast*>(stream); } +#if MXNET_USE_CUDA + /*! + * \brief get an RAII object that transparently handles the syncing of the auxiliary stream. + * \return the aux stream auto-syncing object + */ + inline SyncedGPUAuxStream get_gpu_aux_stream() const { + return SyncedGPUAuxStream(static_cast(aux_stream)); + } +#endif /*! \brief get the base Context from RunContext */ inline const Context& get_ctx() const { return ctx; @@ -309,6 +433,15 @@ inline int32_t Context::GetGPUCount() { #endif } +inline int32_t Context::GetGPUStreamsPerWorker() { + // The default number of streams available if the user has not set MXNET_GPU_WORKER_NSTREAMS. + const int32_t default_num_streams = 1; + // The get_aux_stream() interface can supply one additional stream beyond the standard one. + static int32_t num_streams = + dmlc::GetEnv("MXNET_GPU_WORKER_NSTREAMS", default_num_streams) >= 2 ? 2 : 1; + return num_streams; +} + inline void Context::GetGPUMemoryInformation(int dev, uint64_t *free_mem, uint64_t *total_mem) { #if MXNET_USE_CUDA diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index aba59ce5f0ea..22bba301221d 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -83,6 +83,15 @@ struct OpContext { inline mshadow::Stream* get_stream() const { return run_ctx.get_stream(); } +#if MXNET_USE_CUDA + /*! + * \brief get auxilary gpu stream auto-syncing object from Context + * \return the aux stream auto-syncing object + */ + inline SyncedGPUAuxStream get_gpu_aux_stream() const { + return run_ctx.get_gpu_aux_stream(); + } +#endif }; /*! \brief the execution type of the operator */ diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index 05b72d2a6fde..db4491981bdd 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -62,6 +62,8 @@ class NaiveEngine final : public Engine { }; NaiveEngine() { + objpool_opr_ref_ = common::ObjectPool::_GetSharedRef(); + objpool_var_ref_ = common::ObjectPool::_GetSharedRef(); } // virtual destructor virtual ~NaiveEngine() { @@ -74,6 +76,12 @@ class NaiveEngine final : public Engine { streams_[i] = nullptr; } } + for (size_t i = 0; i < aux_streams_.size(); ++i) { + if (aux_streams_[i] != nullptr) { + delete aux_streams_[i]; + aux_streams_[i] = nullptr; + } + } #endif } @@ -169,16 +177,18 @@ class NaiveEngine final : public Engine { MSHADOW_CATCH_ERROR(mshadow::SetDevice(exec_ctx.dev_id)); if (streams_.size() <= dev_id) { streams_.resize(dev_id + 1, nullptr); + aux_streams_.resize(dev_id + 1, nullptr); } if (streams_[dev_id] == nullptr) { streams_[dev_id] = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, dev_id); + aux_streams_[dev_id] = new GPUAuxStream(streams_[dev_id]); } - exec_fun(RunContext{exec_ctx, streams_[dev_id]}, callback); + exec_fun(RunContext{exec_ctx, streams_[dev_id], aux_streams_[dev_id], false}, callback); #else LOG(FATAL) << "GPU is not enabled"; #endif } else { - exec_fun(RunContext{exec_ctx, &cpu_stream_}, callback); + exec_fun(RunContext{exec_ctx, &cpu_stream_, nullptr, false}, callback); } CHECK(this->req_completed_) << "NaiveEngine only support synchronize Push so far"; @@ -220,6 +230,17 @@ class NaiveEngine final : public Engine { mshadow::Stream cpu_stream_; // GPU streams std::vector*> streams_; +#if MXNET_USE_CUDA + // GPU auxiliary streams + std::vector aux_streams_; +#endif +/*! + * \brief Holding a shared_ptr to the object pool to prevent it from being destructed too early + * See also #309 (/~https://github.com/dmlc/mxnet/issues/309) and similar fix in threaded_engine.h. + * Without this, segfaults seen on CentOS7 in test_operator_gpu.py:test_convolution_multiple_streams + */ + std::shared_ptr > objpool_opr_ref_; + std::shared_ptr > objpool_var_ref_; }; // class NaiveEngine Engine *CreateNaiveEngine() { diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h index 8d44d9c15531..42d03e55a275 100644 --- a/src/engine/stream_manager.h +++ b/src/engine/stream_manager.h @@ -55,6 +55,8 @@ class StreamManager { #if MXNET_USE_CUDA std::array*, kStreams>, kNumGpus> gpu_streams_; + std::array, kNumGpus> + gpu_aux_streams_; std::array*, kNumGpus> gpu_io_streams_; std::array gpu_cnt_; #endif // MXNET_USE_CUDA @@ -67,7 +69,7 @@ RunContext StreamManager::GetRunContext( RunContext ret; switch (ctx.dev_mask()) { case cpu::kDevMask: - ret = RunContext{ctx, nullptr, false}; + ret = RunContext{ctx, nullptr, nullptr, false}; break; case gpu::kDevMask: { #if MXNET_USE_CUDA @@ -77,8 +79,13 @@ RunContext StreamManager::GetRunContext( auto&& counter = gpu_cnt_.at(ctx.dev_id); if (counter == -1) { mxnet::common::cuda::DeviceStore device_store(ctx.dev_id); - for (auto&& i : gpu_streams_.at(ctx.dev_id)) { - i = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, ctx.dev_id); + for (auto&& primary_stream : gpu_streams_.at(ctx.dev_id)) { + primary_stream = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, ctx.dev_id); + } + int idx = 0; + for (auto&& aux_stream : gpu_aux_streams_.at(ctx.dev_id)) { + auto primary_stream = gpu_streams_.at(ctx.dev_id).at(idx++); + aux_stream = new GPUAuxStream(primary_stream); } counter = 0; } @@ -87,6 +94,7 @@ RunContext StreamManager::GetRunContext( } ret = RunContext{ctx, gpu_streams_.at(ctx.dev_id).at(use_counter), + gpu_aux_streams_.at(ctx.dev_id).at(use_counter), false}; break; #else @@ -105,7 +113,7 @@ RunContext StreamManager::GetIORunContext( RunContext ret; switch (ctx.dev_mask()) { case cpu::kDevMask: - ret = RunContext{ctx, nullptr, false}; + ret = RunContext{ctx, nullptr, nullptr, false}; break; case gpu::kDevMask: { #if MXNET_USE_CUDA @@ -116,7 +124,7 @@ RunContext StreamManager::GetIORunContext( gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream(false, false, ctx.dev_id); } } - ret = RunContext{ctx, gpu_io_streams_.at(ctx.dev_id), false}; + ret = RunContext{ctx, gpu_io_streams_.at(ctx.dev_id), nullptr, false}; break; #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; @@ -145,9 +153,12 @@ void StreamManager::Finalize() { #if MXNET_USE_CUDA for (std::size_t i = 0; i < kNumGpus; ++i) { if (gpu_cnt_.at(i) != -1) { - for (auto&& j : gpu_streams_.at(i)) { + for (auto&& primary_stream : gpu_streams_.at(i)) { // Catch exception for CUDA driver shutdown - MSHADOW_CATCH_ERROR(mshadow::DeleteStream(j)); + MSHADOW_CATCH_ERROR(mshadow::DeleteStream(primary_stream)); + } + for (auto&& aux_stream : gpu_aux_streams_.at(i)) { + delete aux_stream; } gpu_cnt_.at(i) = -1; } diff --git a/src/engine/threaded_engine_perdevice.cc b/src/engine/threaded_engine_perdevice.cc index b6537dabb638..bcb101e9e1bb 100644 --- a/src/engine/threaded_engine_perdevice.cc +++ b/src/engine/threaded_engine_perdevice.cc @@ -99,7 +99,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine { MSHADOW_CATCH_ERROR(mshadow::SetDevice(ctx.dev_id)); #endif } - this->ExecuteOprBlock(RunContext{ctx, nullptr}, opr_block); + this->ExecuteOprBlock(RunContext{ctx, nullptr, nullptr, false}, opr_block); } else { if (ctx.dev_mask() == Context::kCPU) { // CPU execution. @@ -244,7 +244,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine { this->is_worker_ = true; #if MXNET_USE_CUDA CHECK(block != nullptr); - mshadow::Stream *stream; + mshadow::Stream *stream = nullptr; + GPUAuxStream *aux_stream = nullptr; do { ThreadPool::SetReadyOnDestroy setReady(ready_event); // allocate stream @@ -253,11 +254,12 @@ class ThreadedEnginePerDevice : public ThreadedEngine { stream = mshadow::NewStream(false, false, ctx.dev_id); } else { stream = mshadow::NewStream(true, MXNET_USE_CUDNN != 0, ctx.dev_id); + aux_stream = new GPUAuxStream(stream); } } while (false); // execute task OprBlock* opr_block; - RunContext run_ctx{ctx, stream}; + RunContext run_ctx{ctx, stream, aux_stream, false}; auto* task_queue = &(block->task_queue); // Don't eat up omp threads for GPU jobs. They're probably best used elsewhere, @@ -269,6 +271,8 @@ class ThreadedEnginePerDevice : public ThreadedEngine { } // Catch exception for CUDA driver shutdown MSHADOW_CATCH_ERROR(mshadow::DeleteStream(stream)); + if (aux_stream != nullptr) + delete aux_stream; #else ready_event->signal(); #endif @@ -283,7 +287,7 @@ class ThreadedEnginePerDevice : public ThreadedEngine { const std::shared_ptr& ready_event) { this->is_worker_ = true; auto* task_queue = &(block->task_queue); - RunContext run_ctx{ctx, nullptr}; + RunContext run_ctx{ctx, nullptr, nullptr, false}; // execute task OprBlock* opr_block; diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 7113cb2063c1..a8db4810f7c1 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -448,7 +448,7 @@ inline void PushFComputeEx(const FComputeEx& fn, }; if (exec_type == ExecType::kCrossDeviceCopy) { - run(RunContext{ctx, nullptr}); + run(RunContext{ctx, nullptr, nullptr, false}); } else { CHECK(exec_type == ExecType::kSync); Engine::Get()->PushSync(run, ctx, read_vars, write_vars, FnProperty::kNormal, @@ -498,7 +498,7 @@ inline void PushOperator(const OpStatePtr& state, // For operators with subgraphs, we need to invoke them in the main thread // instead of the threaded engine. if (exec_type == ExecType::kSubgraphExec) { - RunContext rctx{ctx, nullptr}; + RunContext rctx{ctx, nullptr, nullptr, false}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { Engine::Get()->PushSync( @@ -546,7 +546,7 @@ inline void PushOperator(const OpStatePtr& state, }; if (exec_type == ExecType::kSubgraphExec) { - RunContext rctx{ctx, nullptr}; + RunContext rctx{ctx, nullptr, nullptr, false}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { Engine::Get()->PushSync( diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 0f0fed24d4e6..648f9584618c 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -1826,7 +1826,7 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const { if (this->ctx().dev_mask() == cpu::kDevMask) { this->WaitToWrite(); - RunContext rctx{this->ctx(), nullptr}; + RunContext rctx{this->ctx(), nullptr, nullptr, false}; TBlob dst = this->data(); ndarray::Copy(src, &dst, Context::CPU(), Context::CPU(), rctx); } else { @@ -1957,7 +1957,7 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const { if (this->ctx().dev_mask() == cpu::kDevMask) { this->WaitToRead(); - RunContext rctx{this->ctx(), nullptr}; + RunContext rctx{this->ctx(), nullptr, nullptr, false}; NDArray src = *this; #if MXNET_USE_MKLDNN == 1 if (src.IsMKLDNNData()) diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 3bd6c5a3826b..f68d2e3e8ead 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -53,6 +53,7 @@ class CuDNNConvolutionOp { CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + parallelize_backward_kernels_ = Context::GetGPUStreamsPerWorker() >= 2; } void Init(const ConvolutionParam& param, @@ -110,6 +111,7 @@ class CuDNNConvolutionOp { // future cuDNN releases. SelectAlgo(rctx, in_shape, out_shape, cudnn_forward_compute_type, cudnn_backward_compute_type); + GetTempSize(rctx); } ~CuDNNConvolutionOp() { @@ -131,7 +133,6 @@ class CuDNNConvolutionOp { CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); - GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); @@ -224,6 +225,8 @@ class CuDNNConvolutionOp { CHECK_EQ(in_data.size(), expected); CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); + // RAII object to handle syncing of the underlying auxiliary stream with the primary stream + SyncedGPUAuxStream s_dgrad = ctx.get_gpu_aux_stream(); // I/O's should have 2 more dims than the kernel dim DType *grad_ptr = GetNdPtr(out_grad[conv::kOut], param_.kernel.ndim() + 2, s); @@ -232,9 +235,27 @@ class CuDNNConvolutionOp { DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s); DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s); - GetTempSize(ctx); - Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); + size_t backward_workspace_byte = + parallelize_backward_kernels_ ? back_workspace_byte_dgrad_ + back_workspace_byte_wgrad_ + : std::max(back_workspace_byte_dgrad_, + back_workspace_byte_wgrad_); + Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte); size_t workspace_size = TensorSizeBytes(workspace); + DType *workspace_dptr_wgrad = workspace.dptr_; + DType *workspace_dptr_dgrad = workspace.dptr_; + if (parallelize_backward_kernels_) { + CHECK_LE(back_workspace_byte_dgrad_ + back_workspace_byte_wgrad_, workspace_size); + // Large allocations at some point will be given their own page. Pass this alignment on to + // the larger of the two separate dgrad/wgrad workspaces. This probably doesn't matter, but + // corresponds more closely to the workspace alignments used during cudnnFind. + if (back_workspace_byte_dgrad_ > back_workspace_byte_wgrad_) + workspace_dptr_wgrad = workspace.dptr_ + back_workspace_byte_dgrad_ / sizeof(DType); + else + workspace_dptr_dgrad = workspace.dptr_ + back_workspace_byte_wgrad_ / sizeof(DType); + } else { + CHECK_LE(back_workspace_byte_dgrad_, workspace_size); + CHECK_LE(back_workspace_byte_wgrad_, workspace_size); + } #if CUDNN_MAJOR >= 7 typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; @@ -259,14 +280,14 @@ class CuDNNConvolutionOp { grad_ptr, back_conv_desc_w_, back_algo_w_.AlgoNumber(), - workspace.dptr_, - workspace_size, + workspace_dptr_wgrad, + back_workspace_byte_wgrad_, req[conv::kWeight] == kAddTo? &beta_add : &beta, filter_desc_, gwmat_ptr)); } if (req[conv::kData] != kNullOp) { - CUDNN_CALL(cudnnConvolutionBackwardData(s->dnn_handle_, + CUDNN_CALL(cudnnConvolutionBackwardData(s_dgrad.GetStream()->dnn_handle_, &alpha, filter_desc_, wmat_ptr, @@ -274,8 +295,8 @@ class CuDNNConvolutionOp { grad_ptr, back_conv_desc_, back_algo_.AlgoNumber(), - workspace.dptr_, - workspace_size, + workspace_dptr_dgrad, + back_workspace_byte_dgrad_, req[conv::kData] == kAddTo? &beta_add : &beta, in_desc_, gdata_ptr)); @@ -912,24 +933,30 @@ class CuDNNConvolutionOp { } - void GetTempSize(const OpContext& ctx) { - mshadow::Stream *s = ctx.get_stream(); - size_t back_size = 0, back_size_w = 0; + void GetTempSize(const RunContext& rctx) { + mshadow::Stream *s = rctx.get_stream(); CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, filter_desc_, out_desc_, back_conv_desc_, in_desc_, back_algo_.AlgoNumber(), - &back_size)); + &back_workspace_byte_dgrad_)); CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_, in_desc_, out_desc_, back_conv_desc_w_, filter_desc_, back_algo_w_.AlgoNumber(), - &back_size_w)); - backward_workspace_byte_ = std::max(back_size, back_size_w); + &back_workspace_byte_wgrad_)); + // cudaMalloc returns addresses that are aligned for large accesses (e.g. to 512 bytes). + // Since we only make one allocation and divide it into two parts when we parallelize + // the dgrad and wgrad kernels, we round the sizes up to this alignment size so the + // dptrs respect this alignment, even if the separate areas are stacked. + const size_t dptr_alignment = 512; + back_workspace_byte_dgrad_ = RoundToMultiple(back_workspace_byte_dgrad_, dptr_alignment); + back_workspace_byte_wgrad_ = RoundToMultiple(back_workspace_byte_wgrad_, dptr_alignment); + CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, in_desc_, filter_desc_, @@ -983,11 +1010,18 @@ class CuDNNConvolutionOp { CastTShapeToIntPtr(param_.pad, ¶m_pad_); } + // Round a value 'x' up to the next multiple of 'multiple' + size_t RoundToMultiple(size_t x, size_t multiple) { + size_t retVal = ((x + multiple - 1) / multiple) * multiple; + return retVal; + } + // Allocates a 1D Tensor of words with size in bytes >= `size_bytes`. // Always allocates at least one word. mshadow::Tensor AllocateTempWorkspace(const OpContext &ctx, size_t size_bytes) { mshadow::Stream *s = ctx.get_stream(); - size_t size_words = size_bytes / sizeof(DType) + 1; + size_t size_words = + std::max(1, RoundToMultiple(size_bytes, sizeof(DType)) / sizeof(DType)); return ctx.requested[conv::kTempSpace].get_space_typed( mshadow::Shape1(size_words), s); } @@ -1035,8 +1069,10 @@ class CuDNNConvolutionOp { // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; - // Temp workspace size in bytes needed for Backward() operation. - size_t backward_workspace_byte_; + // Temp workspace size in bytes needed for Backward() dgrad (data gradient) operation. + size_t back_workspace_byte_dgrad_; + // Temp workspace size in bytes needed for Backward() wgrad (weight gradient) operation. + size_t back_workspace_byte_wgrad_; size_t data_offset_; size_t out_offset_; size_t weight_offset_; @@ -1052,6 +1088,8 @@ class CuDNNConvolutionOp { cudnnConvolutionDescriptor_t back_conv_desc_; // Convolution descriptor for back-prop operations to the weights cudnnConvolutionDescriptor_t back_conv_desc_w_; + // Should dgrad and wgrad be launched into separate streams + bool parallelize_backward_kernels_; // Algorithm for the forward inference operation CuDNNAlgo forward_algo_; // Algorithm for the back-prop operation to the data diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 010cf504fe70..4dbf82edd3f5 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -520,6 +520,56 @@ def test_convolution_options(): sym_no_cudnn = mx.sym.Convolution(num_filter=3, kernel=(1,1,1), pad=(0,0,0), cudnn_off=True, name='conv') check_consistency_NxM([sym, sym_no_cudnn], ctx_list) + +# Helper function to run tests in a subprocess to avoid save/restore of os.environ. +# Also avoids issues of cached environment variable lookups in the backend. +def _test_in_separate_process(func, env, *args): + try: + mpctx = mp.get_context('spawn') + except: + print('SKIP: python%s.%s lacks the required process fork-exec support ... ' % + sys.version_info[0:2], file=sys.stderr, end='') + else: + seed = np.random.randint(0,1024*1024*1024) + for (key, value) in env.items(): + os.environ[key] = str(value) + # Prepend seed as first arg + p = mpctx.Process(target=func, args=(seed,)+args) + p.start() + p.join() + assert p.exitcode == 0, "Non-zero exit code %d from %s()." % (p.exitcode, func.__name__) + +def _conv_with_num_streams(seed): + with random_seed(seed): + # Try to expose timing-dependent improper workspace sharing by parallel dgrad and wgrad + num_trials = 20 + for _ in range(num_trials): + size = np.random.randint(32, 128) + # The cudnn conv operator runs dgrad and wgrad in separate streams if enabled, with possible + # kernel overlap. The non-cudnn conv op doesn't do this so is used as the 'golden copy'. + ctx = {'ctx': mx.gpu(0), 'conv_data': (2, 2, size, size), + 'type_dict': {'conv_data': np.float32}} + # Adding 'flip' here isolates the model from the input node (which can't use inplace store) + flipped = mx.sym.flip(axis=0, name='conv') + sym = mx.sym.Convolution(data=flipped, num_filter=3, kernel=(3,3), pad=(1,1), name='conv') + flipped_no_cudnn = mx.sym.flip(axis=0, name='conv') + sym_no_cudnn = mx.sym.Convolution(data=flipped_no_cudnn, num_filter=3, kernel=(3,3), pad=(1,1), + cudnn_off=True, name='conv') + try: + # tol can be pretty high- we're looking for a large diff due to garbaged workspace + check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=1e-2) + except: + print('Failing conv size = {}'.format(size)) + raise + +@with_seed() +def test_convolution_multiple_streams(): + for num_streams in [1, 2]: + for engine in ['NaiveEngine', 'ThreadedEngine', 'ThreadedEnginePerDevice']: + _test_in_separate_process(_conv_with_num_streams, + {'MXNET_GPU_WORKER_NSTREAMS' : num_streams, 'MXNET_ENGINE_TYPE' : engine}) + + # This test is designed to expose an issue with cudnn v7.1.4 algo find() when invoked with large c. # Algos returned by find() can fail to run with grad_req='add' (wgrad kernel beta parameter == 1.0f). @with_seed()