diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05d8021c3677..ed96a6c83719 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,8 +187,12 @@ endif()
 
 if(USE_MKL_IF_AVAILABLE)
   if(USE_MKLDNN)
+    # We need to use the generic architecture. Otherwise, MKLDNN compiled on one
+    # CPU architecture (e.g., AWS C5) can't run on another architecture (e.g., G3).
+    set(ARCH_OPT_FLAGS "-mtune=generic")
     add_subdirectory(3rdparty/mkldnn)
     include_directories(3rdparty/mkldnn/include)
+    add_definitions(-DMXNET_USE_MKLDNN=1)
     list(APPEND mxnet_LINKER_LIBS mkldnn)
   endif()
   find_package(MKL)
@@ -197,10 +201,6 @@ if(USE_MKL_IF_AVAILABLE)
     include_directories(${MKL_INCLUDE_DIR})
     include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
 
-    if(USE_MKLDNN)
-      add_definitions(-DMXNET_USE_MKLDNN=1)
-    endif()
-
     add_definitions(-DUSE_MKL=1)
     add_definitions(-DCUB_MKL=1)
     list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
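Note on the add_definitions move above: defining MXNET_USE_MKLDNN=1 next to add_subdirectory(3rdparty/mkldnn) means the macro is set whenever MKLDNN itself is built, not only when find_package(MKL) also succeeds. A minimal sketch of the kind of compile-time switch this macro drives (the function below is hypothetical, not MXNet code):

    #include <cstdio>

    // Hypothetical illustration of a code path gated on MXNET_USE_MKLDNN,
    // which the CMake change above now defines for every MKLDNN build.
    void print_backend() {
    #if MXNET_USE_MKLDNN == 1
      std::printf("built with MKLDNN support\n");
    #else
      std::printf("built without MKLDNN support\n");
    #endif
    }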
diff --git a/Jenkinsfile b/Jenkinsfile
index eb2160f370c1..84116e4d85b0 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,7 +26,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdpart
 mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnnvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
 mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/nnvm/lib/libnnvm.a'
 // command to start a docker container
 docker_run = 'tests/ci_build/ci_build.sh'
@@ -534,6 +534,17 @@ try {
         }
       }
     },
+    'Cpp: MKLDNN+GPU': {
+      node('mxnetlinux-gpu') {
+        ws('workspace/ut-cpp-mkldnn-gpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            init_git()
+            unpack_lib('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib)
+            sh "ci/build.py --nvidiadocker --platform ubuntu_gpu /work/runtime_functions.sh unittest_ubuntu_gpu_cpp"
+          }
+        }
+      }
+    },
     'R: CPU': {
       node('mxnetlinux-cpu') {
         ws('workspace/ut-r-cpu') {
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 92994620a493..4ab85064e30f 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -323,6 +323,9 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symlink. We need an actual binary file named libmkldnn.so.0.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d175a13632a1..4b4596961953 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -485,8 +485,8 @@ const mkldnn::memory *NDArray::GetMKLDNNData(
 }
 
 const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
-    const mkldnn::memory::primitive_desc &desc) const {
-  if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
+    const mkldnn::memory::primitive_desc &new_pd) const {
+  if (new_pd.get_size() != shape().Size() * GetTypeSize(dtype_)) {
     LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
     return nullptr;
   }
@@ -495,24 +495,41 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
   const mkldnn::memory *mem = GetMKLDNNData();
   // If the memory descriptor matches, it's easy.
   MKLDNNStream *stream = MKLDNNStream::Get();
-  if (mem->get_primitive_desc() == desc) {
-    return GetMKLDNNExact(mem, desc);
+  if (mem->get_primitive_desc() == new_pd) {
+    return GetMKLDNNExact(mem, new_pd);
   }
 
-  mkldnn::memory::primitive_desc _desc = desc;
+  mkldnn::memory::primitive_desc _pd = new_pd;
+  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
+  mkldnn::memory::desc desc2 = _pd.desc();
   // Now we need to determine if we should reorder the memory.
   // If both use the default formats, we think we don't need to reorder.
-  mkldnn::memory::desc desc1 = mem->get_primitive_desc().desc();
-  mkldnn::memory::desc desc2 = _desc.desc();
   if (desc1.data.format == GetDefaultFormat(desc1) &&
       desc2.data.format == GetDefaultFormat(desc2)) {
-    mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle()));
+    mkldnn_mem_ptr ret(new mkldnn::memory(new_pd, mem->get_data_handle()));
     stream->RegisterMem(ret);
     return ret.get();
-  } else {
-    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(desc);
+  } else if (same_shape(desc1, desc2)) {
+    // If they have the same shape, we can reorder data directly.
+    mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_pd);
     stream->RegisterPrim(mkldnn::reorder(*mem, *ret));
     return ret;
+  } else {
+    // If they have different shapes, we need to reshape the array first.
+    // Since this method will only be used inside an operator, we can call
+    // MKLDNNDataReshape to reshape an array.
+    TShape required_shape(desc2.data.ndims);
+    for (int i = 0; i < desc2.data.ndims; i++)
+      required_shape[i] = desc2.data.dims[i];
+    NDArray reshaped = MKLDNNDataReshape(required_shape);
+    const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+    if (ret->get_primitive_desc() == new_pd) {
+      return GetMKLDNNExact(ret, new_pd);
+    } else {
+      mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_pd);
+      stream->RegisterPrim(mkldnn::reorder(*ret, *ret2));
+      return ret2;
+    }
   }
 }
@@ -559,10 +576,15 @@ void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::primitive_desc &desc)
 
 const mkldnn::memory *NDArray::GetMKLDNNData() const {
   CHECK(storage_type() == kDefaultStorage);
-  // If this array uses MKLDNN layout, we have to make sure it's not a view.
-  // Otherwise, we'll have to change the layout inside the array.
-  if (IsMKLDNNData())
+  if (IsMKLDNNData()) {
+    // If this array uses MKLDNN layout, we have to make sure it's not a view.
+    // Otherwise, we'll have to change the layout inside the array.
     CHECK(!IsView());
+    MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
+    // If this array uses MKLDNN format, we should return now. Otherwise,
+    // SetMKLMem may mess up mkl_mem_.
+    return ptr_->mkl_mem_->GetRaw();
+  }
   ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_);
   MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
   if (IsView()) {
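The new reshape branch in GetMKLDNNDataReorder handles requests whose element count matches the array but whose dimensionality differs, which happens when an operator asks for memory shaped for an MKLDNN primitive rather than for the NDArray itself. A rough usage sketch, assuming a convolution_forward::primitive_desc named fwd_pd and NDArrays in_data and weight (all illustrative names, not taken from this patch):

    // Sketch only: request inputs in the layouts the primitive expects; any
    // needed reorder (and reshape) is queued on the thread's MKLDNNStream.
    const mkldnn::memory *data_mem =
        in_data.GetMKLDNNDataReorder(fwd_pd.src_primitive_desc());
    const mkldnn::memory *weight_mem =
        weight.GetMKLDNNDataReorder(fwd_pd.weights_primitive_desc());
    mkldnn::memory *out_mem =
        TmpMemMgr::Get()->Alloc(fwd_pd.dst_primitive_desc());
    MKLDNNStream::Get()->RegisterPrim(
        mkldnn::convolution_forward(fwd_pd, *data_mem, *weight_mem, *out_mem));
    MKLDNNStream::Get()->Submit();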
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 16e5605b668e..48a029817d1b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -273,12 +273,11 @@ class MKLDNNStream {
   std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
 
  public:
-  static MKLDNNStream *Get() {
-    static thread_local MKLDNNStream stream;
-    return &stream;
-  }
+  static MKLDNNStream *Get();
 
-  void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); }
+  void RegisterPrim(const mkldnn::primitive &prim) {
+    net.push_back(prim);
+  }
 
   void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) {
     mem_holder.push_back(mem);
@@ -288,10 +287,21 @@ class MKLDNNStream {
     return !net.empty();
   }
 
-  void Submit() {
-    if (!net.empty())
+  /*
+   * After submitting mkldnn operations for execution, we need to clean up
+   * the memory held by the stream. However, sometimes users might want to
+   * separate mkldnn execution and memory cleanup.
+   */
+  void Submit(bool cleanup = true) {
+    if (!net.empty()) {
       mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
-    net.clear();
+      net.clear();
+    }
+    if (cleanup)
+      Cleanup();
+  }
+
+  void Cleanup() {
     mem_holder.clear();
     TmpMemMgr::Get()->Reset();
   }
@@ -349,6 +359,16 @@ inline bool same_shape(const TShape &shape, const mkldnn_dims_t dims, int ndims)
   return true;
 }
 
+inline bool same_shape(const mkldnn::memory::desc &desc1,
+                       const mkldnn::memory::desc &desc2) {
+  if (desc1.data.ndims != desc2.data.ndims)
+    return false;
+  for (int i = 0; i < desc1.data.ndims; i++)
+    if (desc1.data.dims[i] != desc2.data.dims[i])
+      return false;
+  return true;
+}
+
 inline bool same_shape(const TShape &shape, int dtype,
                        const mkldnn::memory::desc &desc) {
   return same_shape(shape, desc.data.dims, desc.data.ndims)
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 952ed4f569d4..eb20a17c876b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -25,6 +25,11 @@
 
 namespace mxnet {
 
+MKLDNNStream *MKLDNNStream::Get() {
+  static thread_local MKLDNNStream stream;
+  return &stream;
+}
+
 void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
   if (size > *space)
     return nullptr;
@@ -57,8 +62,11 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
     this->curr_mem = static_cast<char *>(mem) + pd.get_size();
     return ret.get();
   } else {
-    LOG(WARNING) << "Allocate " << pd.get_size()
-                 << " bytes with malloc directly";
+    // If curr_mem has been initialized and we still reach here, it means the
+    // pre-allocated temporary memory isn't large enough.
+    if (this->curr_mem)
+      LOG(WARNING) << "Allocate " << pd.get_size()
+                   << " bytes with malloc directly";
     mkldnn_mem_ptr ret(new mkldnn::memory(pd));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
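The Submit(bool cleanup)/Cleanup() split above lets a caller execute the queued primitives while keeping the stream-held temporary memory alive, which the new unit test below depends on. A small sketch of the intended pattern, where src_mem and dst_mem stand in for mkldnn::memory objects created elsewhere:

    // Sketch: run the queued reorder now, but release the stream-held
    // temporaries only after the output has been inspected.
    MKLDNNStream *stream = MKLDNNStream::Get();
    stream->RegisterPrim(mkldnn::reorder(*src_mem, *dst_mem));
    stream->Submit(false);  // execute primitives, keep mem_holder intact
    // ... read *dst_mem while it is still valid ...
    stream->Cleanup();      // now free temporary memory and reset TmpMemMgr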
diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h
index 7dc05fda2cc6..c39373b1b798 100644
--- a/tests/cpp/include/test_core_op.h
+++ b/tests/cpp/include/test_core_op.h
@@ -410,7 +410,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
       if (bwd_node_ptr) {
         CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
         input_types.resize(bwd_node_ptr->inputs.size(), -1);
-        for (size_t i = 0; i < num_inputs; ++i) {
+        for (int i = 0; i < num_inputs; ++i) {
          const int map_key = bwd_node_ptr->inputs[i].index;
          CHECK(index2array.find(map_key) != index2array.end());
          const int dtype = index2array[map_key]->dtype();
@@ -421,7 +421,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
           output_types.emplace_back(dtype);
         }
       } else {
-        for (size_t x = 0; x < num_inputs; ++x) {
+        for (int x = 0; x < num_inputs; ++x) {
           input_types.emplace_back(default_dtype());
         }
         for (const auto &fwd_inp : backward_for_op->inputs()) {
@@ -431,10 +431,10 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
         }
       } else {
         CHECK(false);  // above always true?
-        for (size_t x = 0; x < num_inputs; ++x) {
+        for (int x = 0; x < num_inputs; ++x) {
           input_types.emplace_back(default_dtype());
         }
-        for (size_t x = 0; x < inferred_num_outputs; ++x) {
+        for (int x = 0; x < inferred_num_outputs; ++x) {
           output_types.emplace_back(default_dtype());
         }
       }
@@ -455,7 +455,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer
       if (bwd_node_ptr) {
         input_shapes.clear();
         CHECK_EQ(bwd_node_ptr->inputs.size(), num_inputs);
-        for (size_t i = 0; i < num_inputs; ++i) {
+        for (int i = 0; i < num_inputs; ++i) {
           const int map_key = bwd_node_ptr->inputs[i].index;
           CHECK(index2array.find(map_key) != index2array.end());
           const nnvm::TShape &shp = index2array[map_key]->shape();
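The size_t-to-int loop changes above appear to silence signed/unsigned comparison warnings, since num_inputs is a signed integer in this class. A minimal self-contained illustration (not MXNet code) of the warning pattern:

    #include <cstddef>

    int main() {
      const int num_inputs = 3;
      // With -Wall, a std::size_t counter compared against the signed
      // num_inputs triggers -Wsign-compare:
      // for (std::size_t i = 0; i < num_inputs; ++i) {}
      for (int i = 0; i < num_inputs; ++i) {
        // the loop index now matches the type of num_inputs
      }
      return 0;
    }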
diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc
index 9d4b9823037f..58ad894e36bf 100644
--- a/tests/cpp/operator/mkldnn.cc
+++ b/tests/cpp/operator/mkldnn.cc
@@ -28,6 +28,8 @@
 #include "gtest/gtest.h"
 #include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h"
 
+using namespace mxnet;
+
 #if __GNUC__ >= 5
 bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) {
   void *ret1, *ret2;
@@ -84,4 +86,250 @@ TEST(MKLDNN_UTIL_FUNC, MemFormat) {
   CHECK_EQ(mkldnn_nchw, 5);
   CHECK_EQ(mkldnn_oihw, 12);
 }
+
+// Init arrays with the default layout.
+static void InitArray(NDArray *arr) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  for (size_t i = 0; i < size; i++)
+    data[i] = i;
+}
+
+// Init arrays with the specified layout.
+static void InitMKLDNNArray(NDArray *arr, const mkldnn::memory::primitive_desc &pd) {
+  const TBlob &blob = arr->data();
+  mshadow::default_real_t *data = blob.dptr<mshadow::default_real_t>();
+  size_t size = blob.Size();
+  for (size_t i = 0; i < size; i++)
+    data[i] = i;
+  arr->MKLDNNDataReorderAsync(pd);
+  arr->WaitToRead();
+}
+
+static void VerifyDefMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+  mshadow::default_real_t *data
+      = static_cast<mshadow::default_real_t *>(mem.get_data_handle());
+  size_t size = pd.get_size() / sizeof(mshadow::default_real_t);
+  size_t num_same = 0;
+  for (size_t i = 0; i < size; i++)
+    num_same += data[i] == static_cast<mshadow::default_real_t>(i);
+  EXPECT_EQ(num_same, size);
+}
+
+static void VerifyMem(const mkldnn::memory &mem) {
+  mkldnn::memory::primitive_desc pd = mem.get_primitive_desc();
+
+  if (pd.desc().data.format == GetDefaultFormat(pd.desc())) {
+    VerifyDefMem(mem);
+  } else {
+    mkldnn::memory::dims dims(pd.desc().data.ndims);
+    for (size_t i = 0; i < dims.size(); i++)
+      dims[i] = pd.desc().data.dims[i];
+    mkldnn::memory::desc desc{dims,
+                              static_cast<mkldnn::memory::data_type>(pd.desc().data.data_type),
+                              static_cast<mkldnn::memory::format>(GetDefaultFormat(pd.desc()))};
+    mkldnn::memory::primitive_desc new_pd(desc, CpuEngine::Get()->get_engine());
+    mkldnn::memory new_mem(new_pd);
+
+    std::vector<mkldnn::primitive> net;
+    net.push_back(mkldnn::reorder(mem, new_mem));
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
+    VerifyDefMem(new_mem);
+  }
+}
+
+static mkldnn::memory::primitive_desc GetMemPD(const TShape s, int dtype,
+                                               mkldnn::memory::format format) {
+  mkldnn::memory::dims dims(s.ndim());
+  for (size_t i = 0; i < dims.size(); i++)
+    dims[i] = s[i];
+  mkldnn::memory::desc desc{dims, get_mkldnn_type(dtype), format};
+  return mkldnn::memory::primitive_desc(desc, CpuEngine::Get()->get_engine());
+}
+
+// This function queries MKLDNN for special (blocked) formats without knowing
+// the specific hardware configuration, so it may miss formats that only occur
+// for certain array shapes. It covers at least one special format for each of
+// the plain formats nchw, oihw and goihw, which should be enough to test the
+// layout logic in NDArray.
+static std::vector<mkldnn::memory::format> GetMKLDNNFormat(size_t num_dims, int dtype) {
+  if (num_dims == 4) {
+    mkldnn::memory::dims data_dims{1, 3, 224, 224};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{96, 3, 11, 11};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 96, 54, 54};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{4, 4};
+    mkldnn::memory::dims padding{0, 0};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(2);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.dst_primitive_desc().desc().data.format);
+    ret[1] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d, %d\n", ret[0], ret[1]);
+    return ret;
+  } else if (num_dims == 5) {
+    mkldnn::memory::dims data_dims{1, 32, 112, 112};
+    mkldnn::memory::desc data_md{data_dims, get_mkldnn_type(dtype),
+                                 mkldnn::memory::format::any};
+    mkldnn::memory::dims weight_dims{32, 1, 1, 3, 3};
+    mkldnn::memory::desc weight_md{weight_dims, get_mkldnn_type(dtype),
+                                   mkldnn::memory::format::any};
+    mkldnn::memory::dims output_dims{1, 32, 112, 112};
+    mkldnn::memory::desc out_md{output_dims, get_mkldnn_type(dtype),
+                                mkldnn::memory::format::any};
+    mkldnn::memory::dims strides{1, 1};
+    mkldnn::memory::dims padding{1, 1};
+
+    mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+                                           mkldnn::algorithm::convolution_direct,
+                                           data_md, weight_md, out_md, strides,
+                                           padding, padding, mkldnn::padding_kind::zero);
+    mkldnn::convolution_forward::primitive_desc pd(desc, CpuEngine::Get()->get_engine());
+    std::vector<mkldnn::memory::format> ret(1);
+    ret[0] = static_cast<mkldnn::memory::format>(pd.weights_primitive_desc().desc().data.format);
+    printf("format: %d\n", ret[0]);
+    return ret;
+  } else {
+    return std::vector<mkldnn::memory::format>();
+  }
+}
+
+struct TestArrayShapes {
+  std::vector<TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+};
+
+static TestArrayShapes GetTestArrayShapes() {
+  int dtype = mshadow::DataType<mshadow::default_real_t>::kFlag;
+  std::vector<TShape> shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds;
+  {
+    // 1D
+    TShape s(1);
+    s[0] = 279936;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+    s[0] = 34848;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x));
+  }
+  {
+    // 2D
+    TShape s(2);
+    s[0] = 96;
+    s[1] = 2916;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+    s[0] = 96;
+    s[1] = 363;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::nc));
+  }
+  {
+    // 4D
+    TShape s1(4);
+    s1[0] = 1; s1[1] = 96; s1[2] = 54; s1[3] = 54;
+    shapes.push_back(s1);
+    pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw));
+
+    TShape s2(4);
+    s2[0] = 96; s2[1] = 3; s2[2] = 11; s2[3] = 11;
+    shapes.push_back(s2);
+    pds.push_back(GetMemPD(s2, dtype, mkldnn::memory::format::oihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(4, dtype);
+    pds.push_back(GetMemPD(s1, dtype, formats[0]));
+    pds.push_back(GetMemPD(s2, dtype, formats[1]));
+  }
+  {
+    // 5D
+    TShape s(5);
+    s[0] = 96; s[1] = 1; s[2] = 3; s[3] = 11; s[4] = 11;
+    shapes.push_back(s);
+    pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::goihw));
+
+    std::vector<mkldnn::memory::format> formats = GetMKLDNNFormat(5, dtype);
+    pds.push_back(GetMemPD(s, dtype, formats[0]));
+  }
+
+  TestArrayShapes ret;
+  ret.shapes = shapes;
+  ret.pds = pds;
+  return ret;
+}
+
+TEST(MKLDNN_NDArray, GetDataReorder) {
+  TestArrayShapes tas = GetTestArrayShapes();
+  std::vector<TShape> shapes = tas.shapes;
+  std::vector<mkldnn::memory::primitive_desc> pds = tas.pds;
+
+  // Reorder from the default layout to any other layout.
+  for (auto s : shapes) {
+    NDArray arr(s, Context());
+    InitArray(&arr);
+    for (auto pd : pds) {
+      if (s.Size() == pd.get_size() / sizeof(mshadow::default_real_t)) {
+        const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(pd);
+        printf("reorder from (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") to (");
+        for (int i = 0; i < pd.desc().data.ndims; i++)
+          printf("%d, ", pd.desc().data.dims[i]);
+        printf("), format: %d\n", pd.desc().data.format);
+        MKLDNNStream::Get()->Submit(false);
+        VerifyMem(*mem);
+        MKLDNNStream::Get()->Cleanup();
+      }
+    }
+  }
+
+  // Reorder from a special layout to another layout.
+  for (auto s : shapes) {
+    for (auto from_pd : pds) {
+      if (from_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+        NDArray arr(s, Context());
+        // The dimensions of an NDArray may not match the dimensions of the
+        // MKLDNN memory inside it.
+        printf("Init array (");
+        for (size_t i = 0; i < s.ndim(); i++)
+          printf("%ld, ", s[i]);
+        printf(") with MKLDNN memory (");
+        for (int i = 0; i < from_pd.desc().data.ndims; i++)
+          printf("%d, ", from_pd.desc().data.dims[i]);
+        printf("), format: %d\n", from_pd.desc().data.format);
+        InitMKLDNNArray(&arr, from_pd);
+        for (auto to_pd : pds) {
+          if (to_pd.get_size() / sizeof(mshadow::default_real_t) == s.Size()) {
+            const mkldnn::memory *mem = arr.GetMKLDNNDataReorder(to_pd);
+            printf("reorder from (");
+            for (size_t i = 0; i < s.ndim(); i++)
+              printf("%ld, ", s[i]);
+            printf("), format: %d to (",
+                   arr.GetMKLDNNData()->get_primitive_desc().desc().data.format);
+            for (int i = 0; i < to_pd.desc().data.ndims; i++)
+              printf("%d, ", to_pd.desc().data.dims[i]);
+            printf("), format: %d\n", to_pd.desc().data.format);
+            MKLDNNStream::Get()->Submit(false);
+            VerifyMem(*mem);
+            MKLDNNStream::Get()->Cleanup();
+          }
+        }
+      }
+    }
+  }
+}
+
 #endif
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 378a822d1938..273ad3d69ca3 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -81,15 +81,16 @@ def test_inference():
             gpu_param = gpu_params.get(k)
             gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
 
-        # Run inference.
-        with autograd.record(train_mode=False):
-            cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
-            gpu_out = gpu_model(gpu_data)
-        out = cpu_out.asnumpy()
-        max_val = np.max(np.abs(out))
-        gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
-        eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
-        assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
+        for i in range(5):
+            # Run inference.
+            with autograd.record(train_mode=False):
+                cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
+                gpu_out = gpu_model(gpu_data)
+            out = cpu_out.asnumpy()
+            max_val = np.max(np.abs(out))
+            gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
+            eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
+            assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-3, atol=1e-3)
 
 def get_nn_model(name):
     if "densenet" in name:
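One contract worth spelling out: the same_shape(desc1, desc2) overload added in mkldnn_base-inl.h above compares only the number and sizes of dimensions, never the format, which is exactly what GetMKLDNNDataReorder needs to choose between the direct-reorder path and the reshape path. A hypothetical standalone check (not part of this patch) illustrating that behavior:

    // Hypothetical test: descriptors with identical dims compare equal under
    // same_shape() even when their memory formats differ.
    TEST(MKLDNN_UTIL_FUNC, SameShapeIgnoresFormat) {
      mkldnn::memory::dims dims{1, 96, 54, 54};
      mkldnn::memory::desc plain{dims, mkldnn::memory::data_type::f32,
                                 mkldnn::memory::format::nchw};
      mkldnn::memory::desc blocked{dims, mkldnn::memory::data_type::f32,
                                   mkldnn::memory::format::nChw8c};
      EXPECT_TRUE(same_shape(plain, blocked));
    }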