Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Fix a race condition in converting data layouts in MKLDNN. (#9862)
Browse files Browse the repository at this point in the history
* Fix a race condition in converting data layouts.

* Avoid calling data() in elemwise sum.

* Fix a compilation error.

* Address comments.

* avoid data layout conversion inside ndarray.

* Fix a compilation error.

* address comments.

* Reorder weight arrays in convolution async.

* Fix async data reordering in NDArray.

* Fix race condition in deconv.

* Update ndarray.cc

* Check more in NDArray.

* Fix a bug in MKLDNNDataReorder.

* Fix a bug in NDArray.

* Simplify weight reorder in (de-)conv.
  • Loading branch information
zheng-da authored and marcoabreu committed Mar 1, 2018
1 parent e29b2f5 commit f9c2689
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 85 deletions.
23 changes: 21 additions & 2 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -622,12 +622,29 @@ class NDArray {
/*
* Reorder the memory to the specified layout.
*/
void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc);
void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc) {
  // In-place layout conversion is only defined for dense (default-storage)
  // arrays; sparse storage types never carry an MKLDNN layout.
  CHECK_EQ(storage_type(), kDefaultStorage);
  // Delegate to the Chunk, which owns the underlying memory handle.
  ptr_->MKLDNNDataReorder(desc);
}
void Reorder2Default() {
  // Converting back to the default layout is likewise only meaningful for
  // dense storage.
  CHECK_EQ(storage_type(), kDefaultStorage);
  // The Chunk rewrites its data into the default format in place.
  ptr_->Reorder2Default();
}

/*
* These are the async versions of the methods above.
* They change the layout of this NDArray, but the change happens only after
* all pending accesses to the array are complete.
*/
void Reorder2DefaultAsync();
void MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc);

/*
* This creates a new NDArray with the reordered data.
* It doesn't affect the data of the original NDArray.
*/
NDArray Reorder2Default() const;

void InvalidateMKLDNNData() {
// Removing mkl_mem_ means the NDArray will store data in the default format.
ptr_->mkl_mem_ = nullptr;
Expand Down Expand Up @@ -880,9 +897,11 @@ class NDArray {
// Have MKL memory reference to the data in the default storage
// or create memory for MKLDNN.
void SetMKLMem(const TShape &shape, int dtype);
// In the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
// If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
// save the result in shandle.
void Reorder2Default();
// Reorder data to a specified layout.
void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc);
bool IsMKLDNN() const;
bool IsDefault() const;
#endif
Expand Down
149 changes: 99 additions & 50 deletions src/ndarray/ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,45 @@ void NDArray::Chunk::Reorder2Default() {
CheckAndAlloc(def_pd.get_size());
// TODO(zhengda) We need to avoid memory copy here.
memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size());
mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr));
mkl_mem_ = nullptr;
}

// Reorder this chunk's data in place into the MKLDNN layout described by
// `pd`, writing the converted bytes back into shandle and re-pointing
// mkl_mem_ at them. No-ops when the data is already in the requested layout.
// NOTE(review): this mutates the chunk's memory; callers are expected to have
// serialized access through the engine (see the *Async wrappers) — confirm.
void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) {
  // If the memory already uses the specified layout, don't do anything.
  if (mkl_mem_ != nullptr && mkl_mem_->get_primitive_desc() == pd)
    return;
  // Copy pd because desc() is non-const in this MKLDNN API version.
  auto _pd = pd;
  auto _desc = _pd.desc();
  auto def_format = GetDefaultFormat(_desc);
  // If the memory is default, don't do anything.
  if (def_format == _desc.data.format && IsDefault())
    return;
  // If the specified layout is default, we should use Reorder2Default.
  if (def_format == _desc.data.format) {
    Reorder2Default();
    return;
  }

  // Destination buffer in the requested layout; the source is either the
  // existing MKLDNN memory or a default-layout view over shandle.
  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(pd));
  std::shared_ptr<mkldnn::memory> old_mem;
  if (IsDefault()) {
    auto def_pd = GetPrimitiveDesc(pd, def_format);
    old_mem.reset(new mkldnn::memory(def_pd, shandle.dptr));
  } else {
    old_mem = this->mkl_mem_;
  }
  // Reorder between layouts with different ranks is not supported here.
  CHECK(old_mem->get_primitive_desc().desc().data.ndims == _desc.data.ndims);

  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
  std::vector<mkldnn::primitive> net;
  net.push_back(mkldnn::reorder(*old_mem, *new_mem));
  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();

  // The converted data must fit in the existing handle before copying back.
  CHECK(shandle.size >= pd.get_size());
  CheckAndAlloc(pd.get_size());
  // TODO(zhengda) We need to avoid memory copy here.
  memcpy(shandle.dptr, new_mem->get_data_handle(), pd.get_size());
  mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr));
}

void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
Expand Down Expand Up @@ -495,12 +533,56 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
}
}

// Return a copy of this array whose data is laid out in the default format.
// The original array is left untouched; if it is already in the default
// layout (or has no MKLDNN memory at all), the array itself is returned.
NDArray NDArray::Reorder2Default() const {
  CHECK(storage_type() == kDefaultStorage);

  if (ptr_->mkl_mem_ == nullptr)
    return *this;
  const auto cur_pd = ptr_->mkl_mem_->get_primitive_desc();
  const auto def_fmt = GetDefaultFormat(cur_pd.desc());
  if (def_fmt == cur_pd.desc().data.format)
    return *this;

  // Allocate a fresh array and reorder the MKLDNN-layout data into it.
  NDArray converted(shape(), ctx(), false, dtype());
  auto target_pd = GetPrimitiveDesc(cur_pd, def_fmt);
  CHECK(converted.ptr_->shandle.size >= target_pd.get_size());
  mkldnn::memory dst_mem(target_pd, converted.ptr_->shandle.dptr);
  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
  std::vector<mkldnn::primitive> prims;
  prims.emplace_back(mkldnn::reorder(*ptr_->mkl_mem_, dst_mem));
  mkldnn::stream(mkldnn::stream::kind::eager).submit(prims).wait();
  return converted;
}

void NDArray::Reorder2DefaultAsync() {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Engine::Get()->PushAsync(
[tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
tmp.ptr_->Reorder2Default();
on_complete();
}, ctx(), const_vars, mutable_vars,
FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder2Default"));
}

// Schedule an in-place conversion of this array to the layout described by
// `desc`. As with Reorder2DefaultAsync, the engine defers the conversion
// until all pending accesses to the array have finished.
void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc) {
  // Only this array's variable is mutated by the conversion.
  std::vector<Engine::VarHandle> reads;
  std::vector<Engine::VarHandle> writes{this->var()};
  NDArray self_copy = *this;
  Engine::Get()->PushAsync(
      [self_copy, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
        self_copy.ptr_->MKLDNNDataReorder(desc);
        on_complete();
      }, ctx(), reads, writes,
      FnProperty::kNormal, 0, PROFILER_MESSAGE("Reorder"));
}

const mkldnn::memory *NDArray::GetMKLDNNData() const {
CHECK(storage_type() == kDefaultStorage);
// If this array uses MKLDNN layout and it's a view, we have to change its
// layout to the default layout.
if (IsMKLDNNData() && IsView())
ptr_->Reorder2Default();
// If this array uses MKLDNN layout, we have to make sure it's not a view.
// Otherwise, we'll have to change the layout inside the array.
if (IsMKLDNNData())
CHECK(!IsView());
ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
// If shandle has data, the data in shandle and mkl_mem_ should match.
if (ptr_->shandle.dptr)
Expand Down Expand Up @@ -534,45 +616,6 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const {
}
}

// Reorder this array's data in place into the MKLDNN layout described by
// `pd`. The converted bytes are written back into the chunk's shandle and
// mkl_mem_ is re-pointed at them.
// NOTE(review): this mutates the array's memory directly on the calling
// thread, outside the engine's dependency tracking — concurrent readers of
// the old layout could observe the data mid-conversion; confirm callers
// serialize access (the commit introducing *Async variants suggests not).
void NDArray::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) {
  CHECK_EQ(storage_type(), kDefaultStorage);
  // If the memory already uses the specified layout, don't do anything.
  if (ptr_->mkl_mem_ != nullptr && ptr_->mkl_mem_->get_primitive_desc() == pd)
    return;
  // Copy pd because desc() is non-const in this MKLDNN API version.
  auto _pd = pd;
  auto _desc = _pd.desc();
  auto def_format = GetDefaultFormat(_desc);
  // If the memory is default, don't do anything.
  if (def_format == _desc.data.format && ptr_->IsDefault())
    return;
  // If the specified layout is default, we should use Reorder2Default.
  if (def_format == _desc.data.format) {
    ptr_->Reorder2Default();
    return;
  }

  // Destination memory in the requested layout; source is the array's
  // current MKLDNN memory (created on demand by SetMKLMem).
  std::shared_ptr<mkldnn::memory> new_mem(new mkldnn::memory(pd));
  ptr_->SetMKLMem(shape_, dtype_);
  auto old_mem = ptr_->mkl_mem_;
  // It's possible that the specified layout has a different number of dimensions.
  if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) {
    // For now, we only support reorder from the default layout.
    CHECK(ptr_->IsDefault());
    auto def_pd = GetPrimitiveDesc(pd, def_format);
    old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle()));
  }
  // This may be called in MKLDNN operators. We can't use MKLDNNStream here.
  std::vector<mkldnn::primitive> net;
  net.push_back(mkldnn::reorder(*old_mem, *new_mem));
  mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();

  // The converted data must fit in the existing handle before copying back.
  CHECK(ptr_->shandle.size >= pd.get_size());
  ptr_->CheckAndAlloc(pd.get_size());
  // TODO(zhengda) We need to avoid memory copy here.
  memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size());
  ptr_->mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr));
}

void NDArray::CopyFrom(const mkldnn::memory &mem) {
CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
if (ptr_->mkl_mem_.get() == &mem)
Expand All @@ -581,10 +624,10 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
CHECK(mem.get_primitive_desc().get_size() == shape().Size() * GetTypeSize(dtype_))
<< "The size of NDArray doesn't match the requested MKLDNN memory desc";
MKLDNNStream *stream = MKLDNNStream::Get();
// If this array uses MKLDNN layout and it's a view, we have to change its
// layout to the default layout.
if (IsMKLDNNData() && IsView())
ptr_->Reorder2Default();
// If this array uses MKLDNN layout, we have to make sure it's not a view.
// Otherwise, we'll have to change the layout inside the array.
if (IsMKLDNNData())
CHECK(!IsView());
ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_,
dtype_);
stream->RegisterMem(ptr_->mkl_mem_);
Expand Down Expand Up @@ -1017,6 +1060,7 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
// with Copy().
NDArray tmp_from = from;
if (tmp_from.IsMKLDNNData()) {
// TODO(zhengda) tmp_from should be cached.
tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
auto tmp_mem = from.GetMKLDNNData();
tmp_from.CopyFrom(*tmp_mem);
Expand All @@ -1025,7 +1069,7 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
CHECK(tmp_from.IsDefaultData());
CHECK(to.IsDefaultData());
TBlob tmp = to.data();
ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp,
from.ctx(), to.ctx(), ctx);
}
#endif
Expand Down Expand Up @@ -1849,7 +1893,12 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToRead();
RunContext rctx{this->ctx(), nullptr};
ndarray::Copy<cpu, cpu>(this->data(), &dst,
NDArray src = *this;
#if MXNET_USE_MKLDNN == 1
if (src.IsMKLDNNData())
src = this->Reorder2Default();
#endif
ndarray::Copy<cpu, cpu>(src.data(), &dst,
Context::CPU(), Context::CPU(), rctx);
} else {
#if MXNET_USE_CUDA
Expand Down
17 changes: 17 additions & 0 deletions src/operator/nn/mkldnn/mkldnn_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -270,9 +270,26 @@ void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> &outputs) {
std::vector<TBlob> in_blobs(inputs.size());
std::vector<NDArray> in_bufs;
for (size_t i = 0; i < in_blobs.size(); i++) {
// If the input data isn't stored in the default format, we shouldn't
// call data() directly, which will change the layout of the NDArray.
// Instead, we should save the converted data in another NDArray.
// TODO(zhengda) we should use temp space to save the converted data.
if (inputs[i].IsDefaultData()) {
in_blobs[i] = inputs[i].data();
} else {
if (in_bufs.empty())
in_bufs.reserve(inputs.size());
in_bufs.emplace_back(inputs[i].shape(), inputs[i].ctx(),
false, inputs[i].dtype());
const mkldnn::memory *mem = inputs[i].GetMKLDNNData();
in_bufs.back().CopyFrom(*mem);
in_blobs[i] = in_bufs.back().data();
}
}
MKLDNNStream::Get()->Submit();

std::vector<TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++) {
if (req[i] == kWriteTo)
Expand Down
25 changes: 16 additions & 9 deletions src/operator/nn/mkldnn/mkldnn_convolution.cc
Original file line number Diff line number Diff line change
Expand Up @@ -262,25 +262,32 @@ void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx
const std::vector<NDArray> &out_data) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
const ConvolutionParam& param = nnvm::get<ConvolutionParam>(attrs.parsed);
MKLDNNConvForward &fwd = GetConvFwd(attrs,
ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
NDArray weight = in_data[conv::kWeight];
MKLDNNConvForward &fwd = GetConvFwd(attrs, ctx.is_train, in_data[conv::kData], weight,
param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);

auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc());
const mkldnn::memory *weight_mem;
if (ctx.is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
// to the default format for now.
if (in_data[conv::kWeight].IsMKLDNNData())
const_cast<NDArray &>(in_data[conv::kWeight]).Reorder2Default();
weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(),
param.num_group);
if (weight.IsMKLDNNData())
// This asks the engine to change the layout of the weight array after
// it's used.
weight.Reorder2DefaultAsync();
weight_mem = GetWeights(weight, fwd.fwd_pd.weights_primitive_desc(), param.num_group);
} else {
// For inference, we want to reorder the weight array so we don't need to
// reorder data every time.
const_cast<NDArray &>(in_data[conv::kWeight]).MKLDNNDataReorder(
fwd.fwd_pd.weights_primitive_desc());
weight_mem = in_data[conv::kWeight].GetMKLDNNData();
if (weight.IsDefaultData()) {
weight_mem = GetWeights(weight, fwd.fwd_pd.weights_primitive_desc(), param.num_group);
// We also need to modify the layout on the original weight array. The
// data conversion happens after the weight array is used.
weight.MKLDNNDataReorderAsync(fwd.fwd_pd.weights_primitive_desc());
} else {
weight_mem = weight.GetMKLDNNData();
CHECK(weight_mem->get_primitive_desc() == fwd.fwd_pd.weights_primitive_desc());
}
}
auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(),
req[conv::kOut]);
Expand Down
22 changes: 14 additions & 8 deletions src/operator/nn/mkldnn/mkldnn_deconvolution.cc
Original file line number Diff line number Diff line change
Expand Up @@ -234,21 +234,27 @@ void MKLDNNDeconvForward::SetDataHandle(const DeconvolutionParam& param,
const std::vector<NDArray> &out_data) {
auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder(
fwd_pd.diff_dst_primitive_desc());
NDArray weight = in_data[deconv::kWeight];
const mkldnn::memory *weight_mem;
if (ctx.is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
// to the default format for now.
if (in_data[deconv::kWeight].IsMKLDNNData())
const_cast<NDArray &>(in_data[deconv::kWeight]).Reorder2Default();
weight_mem = GetWeights(in_data[deconv::kWeight],
fwd_pd.weights_primitive_desc(),
param.num_group);
if (weight.IsMKLDNNData())
// This asks the engine to reorder data after the weight array is used.
weight.Reorder2DefaultAsync();
weight_mem = GetWeights(weight, fwd_pd.weights_primitive_desc(), param.num_group);
} else {
// For inference, we want to reorder the weight array so we don't need to
// reorder data every time.
const_cast<NDArray &>(in_data[deconv::kWeight]).MKLDNNDataReorder(
fwd_pd.weights_primitive_desc());
weight_mem = in_data[deconv::kWeight].GetMKLDNNData();
if (weight.IsDefaultData()) {
weight_mem = GetWeights(weight, fwd_pd.weights_primitive_desc(), param.num_group);
// We also need to modify the layout on the original weight array. The
// data conversion happens after the weight array is used.
weight.MKLDNNDataReorderAsync(fwd_pd.weights_primitive_desc());
} else {
weight_mem = weight.GetMKLDNNData();
CHECK(weight_mem->get_primitive_desc() == fwd_pd.weights_primitive_desc());
}
}
auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut],
fwd_pd.diff_src_primitive_desc(), req[deconv::kOut]);
Expand Down
5 changes: 5 additions & 0 deletions src/operator/nn/mkldnn/mkldnn_fully_connected.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
const TShape& oshape = out_data[fullc::kOut].shape();
NDArray weight = in_data[fullc::kWeight];
NDArray data = in_data[fullc::kData];
// If the input data is a view of an MKLDNN array, we should create a new
// NDArray with reordered data.
if (data.IsMKLDNNData() && data.IsView())
data = in_data[fullc::kData].Reorder2Default();

auto out_md = GetMemDesc(out_data[fullc::kOut]);
if (data.shape().ndim() != 2 && !param.flatten) {
data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1),
Expand Down
7 changes: 6 additions & 1 deletion src/operator/tensor/cast_storage-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,12 @@ void CastStorageComputeImpl(const OpContext& ctx,
CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type);
// If one of them uses the MKLDNN layout.
if (input.IsMKLDNNData() || output.IsMKLDNNData()) {
auto in_mem = input.GetMKLDNNData();
NDArray tmp_input = input;
// If the input data is MKLDNN and is a view, we need to reorder the input
// data first.
if (input.IsMKLDNNData() && input.IsView())
tmp_input = input.Reorder2Default();
const mkldnn::memory *in_mem = tmp_input.GetMKLDNNData();
const_cast<NDArray &>(output).CopyFrom(*in_mem);
MKLDNNStream::Get()->Submit();
} else {
Expand Down
Loading

0 comments on commit f9c2689

Please sign in to comment.