diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d8021c3677..ed96a6c83719 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,8 +187,12 @@ endif()
 
 if(USE_MKL_IF_AVAILABLE)
   if(USE_MKLDNN)
+    # We need to use the generic architecture. Otherwise, MKLDNN compiled on one
+    # CPU architecture (e.g., AWS C5) can't run on another architecture (e.g., G3).
+    set(ARCH_OPT_FLAGS "-mtune=generic")
     add_subdirectory(3rdparty/mkldnn)
     include_directories(3rdparty/mkldnn/include)
+    add_definitions(-DMXNET_USE_MKLDNN=1)
     list(APPEND mxnet_LINKER_LIBS mkldnn)
   endif()
   find_package(MKL)
@@ -197,10 +201,6 @@ if(USE_MKL_IF_AVAILABLE)
     include_directories(${MKL_INCLUDE_DIR})
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
 
-    if(USE_MKLDNN)
-      add_definitions(-DMXNET_USE_MKLDNN=1)
-    endif()
-
     add_definitions(-DUSE_MKL=1)
     add_definitions(-DCUB_MKL=1)
     list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
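Note on the add_definitions move above: defining MXNET_USE_MKLDNN=1 next to add_subdirectory(3rdparty/mkldnn) means the macro is set whenever MKLDNN itself is built, not only when find_package(MKL) also succeeds. A minimal sketch of the kind of compile-time switch this macro drives (the function below is hypothetical, not MXNet code):

    #include <cstdio>

    // Hypothetical illustration of a code path gated on MXNET_USE_MKLDNN,
    // which the CMake change above now defines for every MKLDNN build.
    void print_backend() {
    #if MXNET_USE_MKLDNN == 1
      std::printf("built with MKLDNN support\n");
    #else
      std::printf("built without MKLDNN support\n");
    #endif
    }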
diff --git a/Jenkinsfile b/Jenkinsfile
index eb2160f370c1..84116e4d85b0 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,7 +26,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnnvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
@@ -534,6 +534,17 @@ try {
         }
       }
     },
+    'Cpp: MKLDNN+GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-cpp-mkldnn-gpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib)
+            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_cpp"
+          }
+        }
+      }
+    },
     'R: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-r-cpu') {
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 92994620a493..4ab85064e30f 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -323,6 +323,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symlink. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d175a13632a1..4b4596961953 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -485,8 +485,8 @@ const mkldnn::memory *NDArray::GetMKLDNNData(
 }
 
 const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    const mkldnn::memory::primitive_desc &new_pd) const {
+  if (new_pd.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
   }
@@ -495,24 +495,41 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
   const mkldnn::memory *mem = GetMKLDNNData();
   // If the memory descriptor matches, it's easy.
   MKLDNNStream *stream = MKLDNNStream::Get();
-  if (mem->get_primitive_desc() == desc) {
-    return GetMKLDNNExact(mem, desc);
+  if (mem->get_primitive_desc() == new_pd) {
+    return GetMKLDNNExact(mem, new_pd);
   }
 
-  mkldnn::memory::primitive_desc _desc = desc;
+  mkldnn::memory::primitive_desc _pd = new_pd;
+  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
+  mkldnn::memory::desc desc2 = _pd.desc();
   // Now we need to determine if we should reorder the memory.
   // If both use the default formats, we think we don't need to reorder.
-  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
-  mkldnn::memory::desc desc2 = _desc.desc();
   if (desc1.data.format == GetDefaultFormat(desc1) &&
       desc2.data.format == GetDefaultFormat(desc2)) {
-    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    mkldnn_mem_ptr ret(new mkldnn::memory(new_pd, mem->get_data_handle()));
     stream->RegisterMem(ret);
     return ret.get();
-  } else {
-    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(desc);
+  } else if (same_shape(desc1, desc2)) {
+    // If they have the same shape, we can reorder data directly.
+    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_pd);
     stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
     return ret;
+  } else {
+    // If they have different shapes, we need to reshape the array first.
+    // Since this method will only be used inside an operator, we can call
+    // MKLDNNDataReshape to reshape an array.
+    TShape required_shape(desc2.data.ndims);
+    for (int i = 0; i < desc2.data.ndims; i++)
+      required_shape[i] = desc2.data.dims[i];
+    NDArray reshaped = MKLDNNDataReshape(required_shape);
+    const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+    if (ret->get_primitive_desc() == new_pd) {
+      return GetMKLDNNExact(ret, new_pd);
+    } else {
+      mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_pd);
+      stream->RegisterPrim(mkldnn::reorder(*ret, *ret2));
+      return ret2;
+    }
   }
 }
@@ -559,10 +576,15 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc)
 
 const mkldnn::memory *NDArray::GetMKLDNNData() const {
   CHECK(storage_type() == kDefaultStorage);
-  // If this array uses MKLDNN layout, we have to make sure it's not a view.
-  // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
+  if (IsMKLDNNData()) {
+    // If this array uses MKLDNN layout, we have to make sure it's not a view.
+    // Otherwise, we'll have to change the layout inside the array.
     CHECK(!IsView());
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
+    // If this array uses MKLDNN format, we should return now. Otherwise,
+    // SetMKLMem may mess up mkl_mem_.
+    return ptr_->mkl_mem_->GetRaw();
+  }
   ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
   MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
   if (IsView()) {
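The new reshape branch in GetMKLDNNDataReorder handles requests whose element count matches the array but whose dimensionality differs, which happens when an operator asks for memory shaped for an MKLDNN primitive rather than for the NDArray itself. A rough usage sketch, assuming a convolution_forward::primitive_desc named fwd_pd and NDArrays in_data and weight (all illustrative names, not taken from this patch):

    // Sketch only: request inputs in the layouts the primitive expects; any
    // needed reorder (and reshape) is queued on the thread's MKLDNNStream.
    const mkldnn::memory *data_mem =
        in_data.GetMKLDNNDataReorder(fwd_pd.src_primitive_desc());
    const mkldnn::memory *weight_mem =
        weight.GetMKLDNNDataReorder(fwd_pd.weights_primitive_desc());
    mkldnn::memory *out_mem =
        TmpMemMgr::Get()->Alloc(fwd_pd.dst_primitive_desc());
    MKLDNNStream::Get()->RegisterPrim(
        mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *out_mem));
    MKLDNNStream::Get()->Submit();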
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 16e5605b668e..48a029817d1b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -273,12 +273,11 @@ class MKLDNNStream {
   std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
 
  public:
-  static MKLDNNStream *Get() {
-    static thread_local MKLDNNStream stream;
-    return &stream;
-  }
+  static MKLDNNStream *Get();
 
-  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+  void RegisterPrim(const mkldnn::primitive &prim) {
+    net.push_back(prim);
+  }
 
   void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
     mem_holder.push_back(mem);
@@ -288,10 +287,21 @@ class MKLDNNStream {
     return !net.empty();
   }
 
-  void Submit() {
-    if (!net.empty())
+  /*
+   * After submitting mkldnn operations for execution, we need to clean up
+   * the memory held by the stream. However, sometimes users might want to
+   * separate mkldnn execution and memory cleanup.
+   */
+  void Submit(bool cleanup = true) {
+    if (!net.empty()) {
       mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    net.clear();
+      net.clear();
+    }
+    if (cleanup)
+      Cleanup();
+  }
+
+  void Cleanup() {
     mem_holder.clear();
     TmpMemMgr::Get()->Reset();
   }
@@ -349,6 +359,16 @@ inline bool same_shape(const TShape &shape, const mkldnn_dims_t dims, int ndims)
   return true;
 }
 
+inline bool same_shape(const mkldnn::memory::desc &desc1,
+                       const mkldnn::memory::desc &desc2) {
+  if (desc1.data.ndims != desc2.data.ndims)
+    return false;
+  for (int i = 0; i < desc1.data.ndims; i++)
+    if (desc1.data.dims[i] != desc2.data.dims[i])
+      return false;
+  return true;
+}
+
 inline bool same_shape(const TShape &shape, int dtype,
                        const mkldnn::memory::desc &desc) {
   return same_shape(shape, desc.data.dims, desc.data.ndims)
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 952ed4f569d4..eb20a17c876b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -25,6 +25,11 @@
 
 namespace mxnet {
 
+MKLDNNStream *MKLDNNStream::Get() {
+  static thread_local MKLDNNStream stream;
+  return &stream;
+}
+
 void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
   if (size > *space)
     return nullptr;
@@ -57,8 +62,11 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
     this->curr_mem = static_cast<char *>(mem) + pd.get_size();
     return ret.get();
   } else {
-    LOG(WARNING) << "Allocate " << pd.get_size()
-                 << " bytes with malloc directly";
+    // If curr_mem has been initialized and we still reach here, it means the
+    // pre-allocated temporary memory isn't large enough.
+    if (this->curr_mem)
+      LOG(WARNING) << "Allocate " << pd.get_size()
+                   << " bytes with malloc directly";
     mkldnn_mem_ptr ret(new mkldnn::memory(pd));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
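The Submit(bool cleanup)/Cleanup() split above lets a caller execute the queued primitives while keeping the stream-held temporary memory alive, which the new unit test below depends on. A small sketch of the intended pattern, where src_mem and dst_mem stand in for mkldnn::memory objects created elsewhere:

    // Sketch: run the queued reorder now, but release the stream-held
    // temporaries only after the output has been inspected.
    MKLDNNStream *stream = MKLDNNStream::Get();
    stream->RegisterPrim(mkldnn::reorder(*src_mem, *dst_mem));
    stream->Submit(false);  // execute primitives, keep mem_holder intact
    // ... read *dst_mem while it is still valid ...
    stream->Cleanup();      // now free temporary memory and reset TmpMemMgr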
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 7dc05fda2cc6..c39373b1b798 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -410,7 +410,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
       if (bwd_node_ptr) {
         CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
         input_types.resize(bwd_node_ptr->inputs.size(), -1);
-        for (size_t i = 0; i < num_inputs; ++i) {
+        for (int i = 0; i < num_inputs; ++i) {
          const int map_key = bwd_node_ptr->inputs[i].index;
          CHECK(index2array.find(map_key) != index2array.end());
          const int dtype = index2array[map_key]->dtype();
@@ -421,7 +421,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
           output_types.emplace_back(dtype);
         }
       } else {
-        for (size_t x = 0; x < num_inputs; ++x) {
+        for (int x = 0; x < num_inputs; ++x) {
           input_types.emplace_back(default_dtype());
         }
         for (const auto &fwd_inp : backward_for_op->inputs()) {
@@ -431,10 +431,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
         }
       } else {
         CHECK(false);  // above always true?
-        for (size_t x = 0; x < num_inputs; ++x) {
+        for (int x = 0; x < num_inputs; ++x) {
           input_types.emplace_back(default_dtype());
         }
-        for (size_t x = 0; x < inferred_num_outputs; ++x) {
+        for (int x = 0; x < inferred_num_outputs; ++x) {
           output_types.emplace_back(default_dtype());
         }
       }
@@ -455,7 +455,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
       if (bwd_node_ptr) {
         input_shapes.clear();
         CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
-        for (size_t i = 0; i < num_inputs; ++i) {
+        for (int i = 0; i < num_inputs; ++i) {
           const int map_key = bwd_node_ptr->inputs[i].index;
           CHECK(index2array.find(map_key) != index2array.end());
           const nnvm::TShape &shp = index2array[map_key]->shape();
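The size_t-to-int loop changes above appear to silence signed/unsigned comparison warnings, since num_inputs is a signed integer in this class. A minimal self-contained illustration (not MXNet code) of the warning pattern:

    #include <cstddef>

    int main() {
      const int num_inputs = 3;
      // With -Wall, a std::size_t counter compared against the signed
      // num_inputs triggers -Wsign-compare:
      // for (std::size_t i = 0; i < num_inputs; ++i) {}
      for (int i = 0; i < num_inputs; ++i) {
        // the loop index now matches the type of num_inputs
      }
      return 0;
    }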
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index 9d4b9823037f..58ad894e36bf 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -28,6 +28,8 @@
 #include "gtest/gtest.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
 
+using namespace mxnet;
+
 #if __GNUC__ >= 5
 bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) {
   void *ret1, *ret2;
@@ -84,4 +86,250 @@ TEST(MKLDNN_UTIL_FUNC, MemFormat) {
   CHECK_EQ(mkldnn_nchw, 5);
   CHECK_EQ(mkldnn_oihw, 12);
 }
+
+// Init arrays with the default layout.
+static void InitArray(NDArray *arr) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  for (size_t i = 0; i < size; i++)
+    data[i] = i;
+}
+
+// Init arrays with the specified layout.
+static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  for (size_t i = 0; i < size; i++)
+    data[i] = i;
+  arr->MKLDNNDataReorderAsync(pd);
+  arr->WaitToRead();
+}
+
+static void VerifyDefMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+  mshadow::default_real_t *data
+      = static_cast<mshadow::default_real_t *>(mem.get_data_handle());
+  size_t size = pd.get_size() / sizeof(mshadow::default_real_t);
+  size_t num_same = 0;
+  for (size_t i = 0; i < size; i++)
+    num_same += data[i] == static_cast<mshadow::default_real_t>(i);
+  EXPECT_EQ(num_same, size);
+}
+
+static void VerifyMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+
+  if (pd.desc().data.format == GetDefaultFormat(pd.desc())) {
+    VerifyDefMem(mem);
+  } else {
+    mkldnn::memory::dims dims(pd.desc().data.ndims);
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = pd.desc().data.dims[i];
+    mkldnn::memory::desc desc{dims,
+                              static_cast<mkldnn::memory::data_type>(pd.desc().data.data_type),
+                              static_cast<mkldnn::memory::format>(GetDefaultFormat(pd.desc()))};
+    mkldnn::memory::primitive_desc new_pd(desc, CpuEngine::Get()->get_engine());
+    mkldnn::memory new_mem(new_pd);
+
+    std::vector<mkldnn::primitive> net;
+    net.push_back(mkldnn::reorder(mem, new_mem));
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    VerifyDefMem(new_mem);
+  }
+}
+
+static mkldnn::memory::primitive_desc GetMemPD(const TShape s, int dtype,
+                                               mkldnn::memory::format format) {
+  mkldnn::memory::dims dims(s.ndim());
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = s[i];
+  mkldnn::memory::desc desc{dims, get_mkldnn_type(dtype), format};
+  return mkldnn::memory::primitive_desc(desc, CpuEngine::Get()->get_engine());
+}
+
+// This function queries MKLDNN for special (blocked) formats without knowing
+// the specific hardware configuration, so it may miss formats that only occur
+// for certain array shapes. It covers at least one special format for each of
+// the plain formats nchw, oihw and goihw, which should be enough to test the
+// layout logic in NDArray.
+static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int dtype) {
+  if (num_dims == 4) {
+    mkldnn::memory::dims data_dims{1, 3, 224, 224};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{96, 3, 11, 11};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 96, 54, 54};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{4, 4};
+    mkldnn::memory::dims padding{0, 0};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(2);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.dst_primitive_desc().desc().data.format);
+    ret[1] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d, %d\n", ret[0], ret[1]);
+    return ret;
+  } else if (num_dims == 5) {
+    mkldnn::memory::dims data_dims{1, 32, 112, 112};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{32, 1, 1, 3, 3};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 32, 112, 112};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{1, 1};
+    mkldnn::memory::dims padding{1, 1};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(1);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d\n", ret[0]);
+    return ret;
+  } else {
+    return std::vector<mkldnn::memory::format>();
+  }
+}
+
+struct TestArrayShapes {
+  std::vector<TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+};
+
+static TestArrayShapes GetTestArrayShapes() {
+  int dtype = mshadow::DataType<mshadow::default_real_t>::kFlag;
+  std::vector<TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+  {
+    // 1D
+    TShape s(1);
+    s[0] = 279936;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+    s[0] = 34848;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+  }
+  {
+    // 2D
+    TShape s(2);
+    s[0] = 96;
+    s[1] = 2916;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+    s[0] = 96;
+    s[1] = 363;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+  }
+  {
+    // 4D
+    TShape s1(4);
+    s1[0] = 1; s1[1] = 96; s1[2] = 54; s1[3] = 54;
+    shapes.push_back(s1);
+    pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));
+
+    TShape s2(4);
+    s2[0] = 96; s2[1] = 3; s2[2] = 11; s2[3] = 11;
+    shapes.push_back(s2);
+    pds.push_back(GetMemPD(s2, dtype, mkldnn::memory::format::oihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(4, dtype);
+    pds.push_back(GetMemPD(s1, dtype, formats[0]));
+    pds.push_back(GetMemPD(s2, dtype, formats[1]));
+  }
+  {
+    // 5D
+    TShape s(5);
+    s[0] = 96; s[1] = 1; s[2] = 3; s[3] = 11; s[4] = 11;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::goihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(5, dtype);
+    pds.push_back(GetMemPD(s, dtype, formats[0]));
+  }
+
+  TestArrayShapes ret;
+  ret.shapes = shapes;
+  ret.pds = pds;
+  return ret;
+}
+
+TEST(MKLDNN_NDArray, GetDataReorder) {
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<TShape> shapes = tas.shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  // Reorder from the default layout to any other layout.
+  for (auto s : shapes) {
+    NDArray arr(s, Context());
+    InitArray(&arr);
+    for (auto pd : pds) {
+      if (s.Size() == pd.get_size() / sizeof(mshadow::default_real_t)) {
+        const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(pd);
+        printf("reorder from (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") to (");
+        for (int i = 0; i < pd.desc().data.ndims; i++)
+          printf("%d, ", pd.desc().data.dims[i]);
+        printf("), format: %d\n", pd.desc().data.format);
+        MKLDNNStream::Get()->Submit(false);
+        VerifyMem(*mem);
+        MKLDNNStream::Get()->Cleanup();
+      }
+    }
+  }
+
+  // Reorder from a special layout to another layout.
+  for (auto s : shapes) {
+    for (auto from_pd : pds) {
+      if (from_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+        NDArray arr(s, Context());
+        // The dimensions of an NDArray may not match the dimensions of the
+        // MKLDNN memory inside it.
+        printf("Init array (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") with MKLDNN memory (");
+        for (int i = 0; i < from_pd.desc().data.ndims; i++)
+          printf("%d, ", from_pd.desc().data.dims[i]);
+        printf("), format: %d\n", from_pd.desc().data.format);
+        InitMKLDNNArray(&arr, from_pd);
+        for (auto to_pd : pds) {
+          if (to_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+            const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(to_pd);
+            printf("reorder from (");
+            for (size_t i = 0; i < s.ndim(); i++)
+              printf("%ld, ", s[i]);
+            printf("), format: %d to (",
+                   arr.GetMKLDNNData()->get_primitive_desc().desc().data.format);
+            for (int i = 0; i < to_pd.desc().data.ndims; i++)
+              printf("%d, ", to_pd.desc().data.dims[i]);
+            printf("), format: %d\n", to_pd.desc().data.format);
+            MKLDNNStream::Get()->Submit(false);
+            VerifyMem(*mem);
+            MKLDNNStream::Get()->Cleanup();
+          }
+        }
+      }
+    }
+  }
+}
+
 #endif
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 378a822d1938..273ad3d69ca3 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -81,15 +81,16 @@ def test_inference():
             gpu_param = gpu_params.get(k)
             gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
 
-        # Run inference.
-        with autograd.record(train_mode=False):
-            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-            gpu_out = gpu_model(gpu_data)
-        out = cpu_out.asnumpy()
-        max_val = np.max(np.abs(out))
-        gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
-        eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
-        assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
+        for i in range(5):
+            # Run inference.
+            with autograd.record(train_mode=False):
+                cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+                gpu_out = gpu_model(gpu_data)
+            out = cpu_out.asnumpy()
+            max_val = np.max(np.abs(out))
+            gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
+            eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
+            assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
 
 def get_nn_model(name):
     if "densenet" in name:
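One contract worth spelling out: the same_shape(desc1, desc2) overload added in mkldnn_base-inl.h above compares only the number and sizes of dimensions, never the format, which is exactly what GetMKLDNNDataReorder needs to choose between the direct-reorder path and the reshape path. A hypothetical standalone check (not part of this patch) illustrating that behavior:

    // Hypothetical test: descriptors with identical dims compare equal under
    // same_shape() even when their memory formats differ.
    TEST(MKLDNN_UTIL_FUNC, SameShapeIgnoresFormat) {
      mkldnn::memory::dims dims{1, 96, 54, 54};
      mkldnn::memory::desc plain{dims, mkldnn::memory::data_type::f32,
                                 mkldnn::memory::format::nchw};
      mkldnn::memory::desc blocked{dims, mkldnn::memory::data_type::f32,
                                   mkldnn::memory::format::nChw8c};
      EXPECT_TRUE(same_shape(plain, blocked));
    }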