diff --git a/.gitignore b/.gitignore
index c8a813649bb8..7eb8e7d6e777 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,13 +167,11 @@ python/.eggs
 tests/Makefile
 tests/mxnet_unit_tests
 
-# generated wrappers for ccache
-cc
-cxx
-
 # Code coverage related
 .coverage
 *.gcov
 *.gcno
 coverage.xml
 
+# Local CMake build config
+cmake_options.yml
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 696803bd7723..6dc04f7c729c 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 696803bd7723ade8230af878460d96c68a550fbc
+Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 161705643194..3e3de2053477 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,7 +20,7 @@ mxnet_option(USE_F16C             "Build with x86 F16C instruction support" ON)
 mxnet_option(USE_LAPACK           "Build with lapack support" ON)
 mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
 mxnet_option(USE_MKLML_MKL        "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
-mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
+mxnet_option(USE_MKLDNN           "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64))
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON IF NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
@@ -215,7 +215,7 @@ if(ENABLE_TESTCOVERAGE)
   if(NOT GCOV_PATH)
     message(FATAL_ERROR "gcov not found! Aborting...")
   endif() # NOT GCOV_PATH
-  
+
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --coverage")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
   set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} --coverage")
@@ -227,7 +227,6 @@ if(USE_MKLDNN)
   include(cmake/DownloadMKLML.cmake)
   # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3).
   if(NOT MSVC)
-    set(MKLDNN_LIBRARY_TYPE "STATIC" CACHE INTERNAL "" FORCE)
     set(ARCH_OPT_FLAGS "-mtune=generic")
   else()
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc")
@@ -733,7 +732,12 @@ install(TARGETS ${MXNET_INSTALL_TARGETS}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 
+# NOTE: Public headers will be installed into ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}, see
+#       https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html
+#       https://cmake.org/cmake/help/v3.0/module/GNUInstallDirs.html
+
 install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(DIRECTORY 3rdparty/tvm/nnvm/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 if (INSTALL_EXAMPLES)
   install(DIRECTORY example  DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME})
 endif()
diff --git a/CODEOWNERS b/CODEOWNERS
index 5a88e89dfb02..8b48257ebf83 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -12,15 +12,18 @@
 *			@apache/mxnet-committers
 
 # Language bindings
-/R-package/                @thirdwing
-/scala-package/            @yzhliu @nswamy
-/perl-package/             @sergeykolychev
-/python/                   @szha
-/contrib/clojure-package/  @gigasquid
+/R-package/                       @thirdwing
+/scala-package/                   @yzhliu @nswamy @pllarroy
+/perl-package/                    @sergeykolychev
+/python/                          @szha @pllarroy
+/python/mxnet/kvstore.py          @eric-haibin-lin
+/python/mxnet/optimizer/          @eric-haibin-lin
+/python/mxnet/gluon/trainer.py    @eric-haibin-lin
+/contrib/clojure-package/         @gigasquid
 
 # C++ base
 /src/kvstore/     @rahul003 @anirudh2290
-/include/         @anirudh2290
+/include/         @anirudh2290 @pllarroy
 /src/c_api/       @anirudh2290
 /src/common/      @anirudh2290
 /src/engine/      @anirudh2290
@@ -31,15 +34,20 @@
 /src/nnvm/        @anirudh2290
 /src/operator/    @anirudh2290
 /src/profiler/    @anirudh2290
+/src/kvstore/     @eric-haibin-lin
 /src/storage/     @anirudh2290
 /tests/cpp/       @anirudh2290
-/cpp-package/ @nswamy
+/cpp-package/     @nswamy @pllarroy
+/src/             @pllarroy
+/plugin/          @pllarroy
 
 # CMake
-CMakeLists.txt    @szha @rahul003
-/cmake/           @szha @rahul003
+CMakeLists.txt    @szha @rahul003 @pllarroy
+/cmake/           @szha @rahul003 @pllarroy
 
 # MXNet CI
+dev_menu.py         @pllarroy
+/ci/                @pllarroy
 /tests/ci_build/    @marcoabreu
 Jenkinsfile         @marcoabreu
 .travis.yml         @marcoabreu
@@ -50,16 +58,16 @@ Makefile          @szha
 prepare_mkl.sh    @szha
 
 # Docs
-/docs/            @szha
+/docs/            @szha @pllarroy
 
 # Submodules
 .gitmodules       @szha
 
 # Examples
-/example/         @szha
+/example/         @szha @pllarroy
 
 # Tools
-/tools/           @szha
+/tools/           @szha @pllarroy
 
 # Github templates
 /.github/         @szha
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index b9f84d592a70..5b5fdce712f1 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -193,6 +193,7 @@ List of Contributors
 * [Yuxi Hu](/~https://github.com/yuxihu)
 * [Harsh Patel](/~https://github.com/harshp8l)
 * [Xiao Wang](/~https://github.com/BeyonderXX)
+* [Piyush Ghai](/~https://github.com/piyushghai)
 
 Label Bot
 ---------
diff --git a/LICENSE b/LICENSE
index 10dea2522182..0b4841a8b0dc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -217,21 +217,30 @@
 
     1. MXNet Cpp-package - For details, /cpp-package/LICENSE
     2. MXNet rcnn - For details, see, example/rcnn/LICENSE
-    3. scala-package - For details, see, scala-package/LICENSE
+    3. MXNet scala-package - For details, see, scala-package/LICENSE
     4. Warp-CTC - For details, see, 3rdparty/ctc_include/LICENSE
     5. 3rdparty/dlpack - For details, see, 3rdparty/dlpack/LICENSE
     6. 3rdparty/dmlc-core - For details, see, 3rdparty/dmlc-core/LICENSE
     7. 3rdparty/mshadow - For details, see, 3rdparty/mshadow/LICENSE
     8. 3rdparty/tvm - For details, see, 3rdparty/tvm/LICENSE
-    9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/dmlc-core/LICENSE
+    9. 3rdparty/tvm/dmlc-core - For details, see, 3rdparty/tvm/3rdparty/dmlc-core/LICENSE
     10. 3rdparty/tvm/dlpack - For details, see, 3rdparty/tvm/3rdparty/dlpack/LICENSE
-    11. 3rdparty/tvm/nnvm - For details, see, 3rdparty/tvm/nnvm/LICENSE
-    12. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
-    13. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
-    14. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
-    15. clojure-package - For details, see, contrib/clojure-package/LICENSE
-    16. R-package - For details, see, R-package/LICENSE
-    17. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE
+    11. 3rdparty/ps-lite - For details, see, 3rdparty/ps-lite/LICENSE
+    12. 3rdparty/mkldnn - For details, see, 3rdparty/mkldnn/LICENSE
+    13. googlemock scripts/generator - For details, see, 3rdparty/googletest/googlemock/scripts/generator/LICENSE
+    14. MXNet clojure-package - For details, see, contrib/clojure-package/LICENSE
+    15. MXNet R-package - For details, see, R-package/LICENSE
+    16. ONNX-TensorRT benchmark package - For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/benchmark/LICENSE
+    17. Dockerfiles - For details, see docker/Dockerfiles/License.md
+    18. MXNet Julia Package - For details, see julia/LICENSE.md
+    19. Benchdnn - For details, see 3rdparty/mkldnn/tests/benchdnn/README.md
+    20. MXNet perl-package - For details, see perl-package/README
+    21. MXNet perl-package AI-MXNet - For details, see perl-package/AI-MXNet/README
+    22. MXNet perl-package AI-MXNet Gluon Contrib - For details, see perl-package/AI-MXNet-Gluon-Contrib/README
+    23. MXNet perl-package AI-MXNet Gluon ModelZoo - For details, see perl-package/AI-MXNet-Gluon-ModelZoo/README
+    24. MXNet perl-package AI-MXNetCAPI - For details, see perl-package/AI-MXNetCAPI/README
+    25. MXNet perl-package AI-NNVMCAPI - For details, see perl-package/AI-NNVMCAPI/README
+    26. Cephes Library Functions - For details, see src/operator/special_functions-inl.h
 
 
     =======================================================================================
@@ -242,75 +251,64 @@
     2. Faster R-CNN - For details, see example/rcnn/LICENSE
     3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE
     4. OpenMP - For details, see 3rdparty/openmp/LICENSE.txt
-    5. HalideIR - For details, see nnvm/tvm/HalideIR/LICENSE
     6. HalideIR - For details, see 3rdparty/tvm/3rdparty/HalideIR/LICENSE
     7. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/LICENSE
     8. ONNX-TensorRT - For details, see 3rdparty/onnx-tensorrt/third_party/onnx/LICENSE
+    9. clipboard.js - Refer to https://zenorocha.github.io/clipboard.js
+    10. clipboard.min.js - Refer to https://zenorocha.github.io/clipboard.js
 
 
     =======================================================================================
-    NVIDIA Licenses
+    3-clause BSD licenses
     =======================================================================================
 
-    1. Moderngpu
-    For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
-
-    /******************************************************************************
-    * Redistribution and use in source and binary forms, with or without
-    * modification, are permitted provided that the following conditions are met:
-    *     * Redistributions of source code must retain the above copyright
-    *       notice, this list of conditions and the following disclaimer.
-    *     * Redistributions in binary form must reproduce the above copyright
-    *       notice, this list of conditions and the following disclaimer in the
-    *       documentation and/or other materials provided with the distribution.
-    *     * Neither the name of the NVIDIA CORPORATION nor the
-    *       names of its contributors may be used to endorse or promote products
-    *       derived from this software without specific prior written permission.
-    *
-    * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-    * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-    * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-    * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-    * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-    * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-    * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-    * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-    * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-    *
-    ******************************************************************************/
-
-    2. CUB Library
-    For details, see, 3rdparty/cub/LICENSE.TXT
+    1. Xbyak - For details, see 3rdparty/mkldnn/src/cpu/xbyak/COPYRIGHT
+    2. gtest - For details, see, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
+    3. Moderngpu - For details, see, 3rdparty/ctc_include/contrib/moderngpu/LICENSE
+    4. CUB Library - For details, see, 3rdparty/cub/LICENSE.TXT
+    5. Googlemock - For details, see, 3rdparty/googletest/googlemock/LICENSE
+    6. Googletest - For details, see, 3rdparty/googletest/googletest/LICENSE
+    7. OpenMP Testsuite - For details, see, 3rdparty/openmp/testsuite/LICENSE
 
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are met:
-       *  Redistributions of source code must retain the above copyright
-          notice, this list of conditions and the following disclaimer.
-       *  Redistributions in binary form must reproduce the above copyright
-          notice, this list of conditions and the following disclaimer in the
-          documentation and/or other materials provided with the distribution.
-       *  Neither the name of the NVIDIA CORPORATION nor the
-          names of its contributors may be used to endorse or promote products
-          derived from this software without specific prior written permission.
 
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-    DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+    =======================================================================================
+    2-clause BSD licenses
+    =======================================================================================
+
+    1. Sphinx JavaScript utilities for the full-text search - For details, see, docs/_static/searchtools_custom.js
+    2. blockingconcurrentqueue.h - For details, see, 3rdparty/dmlc-core/include/dmlc/blockingconcurrentqueue.h
+    3. concurrentqueue.h - For details, see, 3rdparty/dmlc-core/include/dmlc/concurrentqueue.h
+    4. MSCOCO Toolbox - For details, see, example/ssd/dataset/pycocotools/coco.py
+
 
     =======================================================================================
     Other Licenses
     =======================================================================================
 
-    1. Caffe
-    For details, see, example/rcnn/LICENSE
+    1. Caffe - For details, see, example/rcnn/LICENSE
+    2. pool.h - For details, see, src/operator/nn/pool.h
+    3. pool.cuh - For details, see, src/operator/nn/pool.cuh
+    4. im2col.h - For details, see, src/operator/nn/im2col.h
+    5. im2col.cuh - For details, see, src/operator/nn/im2col.cuh
+    6. deformable_im2col.h - For details, see, src/operator/contrib/nn/deformable_im2col.h
+    7. deformable_im2col.cuh - For details, see, src/operator/contrib/nn/deformable_im2col.cuh
+
+    COPYRIGHT
+
+    All contributions by the University of California:
+    Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+    All rights reserved.
+
+    All other contributions:
+    Copyright (c) 2014, 2015, the respective contributors
+    All rights reserved.
+
+    Caffe uses a shared copyright model: each contributor holds copyright over
+    their contributions to Caffe. The project versioning records all such
+    contribution and copyright details. If a contributor wants to further mark
+    their specific copyright on a particular contribution, they should indicate
+    their copyright solely in the commit message of the change when it is
+    committed.
 
     LICENSE
 
@@ -342,7 +340,7 @@
 
     =======================================================================================
 
-    2. MS COCO API
+    8. MS COCO API
     For details, see, example/rcnn/LICENSE
 
     Redistribution and use in source and binary forms, with or without
@@ -371,155 +369,14 @@
 
     =======================================================================================
 
-    3. Sphinx JavaScript utilties for the full-text search
-    For details, see, docs/_static/searchtools_custom.js
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-
-    * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright
-     notice, this list of conditions and the following disclaimer in the
-     documentation and/or other materials provided with the distribution.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    4. FindCrypto.cmake
-    For details, see, 3rdparty/dmlc-core/cmake/Modules/FindCrypto.cmake,
-    Redistribution and use is allowed according to the terms of the BSD license.
-
-    =======================================================================================
-
-    5. Googlemock
-    For details, see, 3rdparty/googletest/googlemock/LICENSE
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-
-        * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-        * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-        * Neither the name of Google Inc. nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    6. Googletest
-    For details, see, 3rdparty/googletest/googletest/LICENSE
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-
-        * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-        * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-        * Neither the name of Google Inc. nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    7. OpenMP Testsuite
-    For details, see, 3rdparty/openmp/testsuite/LICENSE
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions
-    are met:
-
-    o Redistributions of source code must retain the above copyright notice,
-      this list of conditions and the following disclaimer.
-
-    o Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-
-    o Neither the name of the University of Houston System nor the names of its
-      contributors may be used to
-      endorse or promote products derived from this software without specific
-      prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    8. Semaphore implementation in blockingconcurrentqueue.h
+    9. Semaphore implementation in blockingconcurrentqueue.h
     This file uses a semaphore implementation under the terms of its separate zlib license.
     For details, see, 3rdparty/dmlc-core/include/dmlc/blockingconcurrentqueue.h
 
     =======================================================================================
 
-    9. blockingconcurrentqueue.h
-    This file is Distributed under the terms of the simplified BSD license.
-    For details, see, 3rdparty/dmlc-core/include/dmlc/blockingconcurrentqueue.h
-
-    =======================================================================================
-
-    10. concurrentqueue.h
-    This file is Distributed under the terms of the simplified BSD license.
-    For details, see, 3rdparty/dmlc-core/include/dmlc/concurrentqueue.h
-
-    =======================================================================================
-
-    11. ONNX Export module
-    For details, see, python/mxnet/contrib/onnx/_export/LICENSE
+    10. ONNX Export module
+    For details, see, python/mxnet/contrib/onnx/mx2onnx/LICENSE
 
     # Licensed to the Apache Software Foundation (ASF) under one
     # or more contributor license agreements.  See the NOTICE file
@@ -568,41 +425,7 @@
 
     =======================================================================================
 
-    12. Google tests
-        For details, see, 3rdparty/mkldnn/tests/gtests/gtest/LICENSE
-
-    Copyright 2008, Google Inc.
-    All rights reserved.
-
-    Redistribution and use in source and binary forms, with or without
-    modification, are permitted provided that the following conditions are
-    met:
-
-        * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-        * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following disclaimer
-    in the documentation and/or other materials provided with the
-    distribution.
-        * Neither the name of Google Inc. nor the names of its
-    contributors may be used to endorse or promote products derived from
-    this software without specific prior written permission.
-
-    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-    OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-    =======================================================================================
-
-    13. ONNX python bindings
+    11. ONNX python bindings
     For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/LICENSE
 
     Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
@@ -642,3 +465,155 @@
     other computer software, distribute, and sublicense such enhancements or
     derivative works thereof, in binary and source code form.
 
+    =======================================================================================
+
+    12. Clang
+    For details, see, 3rdparty/onnx-tensorrt/third_party/onnx/third_party/pybind11/tools/clang/LICENSE.TXT
+
+    LLVM Release License
+    University of Illinois/NCSA
+    Open Source License
+
+    Copyright (c) 2007-2012 University of Illinois at Urbana-Champaign.
+    All rights reserved.
+
+    Developed by:
+
+        LLVM Team
+
+        University of Illinois at Urbana-Champaign
+
+        http://llvm.org
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of
+    this software and associated documentation files (the "Software"), to deal with
+    the Software without restriction, including without limitation the rights to
+    use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+    of the Software, and to permit persons to whom the Software is furnished to do
+    so, subject to the following conditions:
+
+        * Redistributions of source code must retain the above copyright notice,
+          this list of conditions and the following disclaimers.
+
+        * Redistributions in binary form must reproduce the above copyright notice,
+          this list of conditions and the following disclaimers in the
+          documentation and/or other materials provided with the distribution.
+
+        * Neither the names of the LLVM Team, University of Illinois at
+          Urbana-Champaign, nor the names of its contributors may be used to
+          endorse or promote products derived from this Software without specific
+          prior written permission.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+    FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+    CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+    SOFTWARE.
+
+    The LLVM software contains code written by third parties.  Such software will
+    have its own individual LICENSE.TXT file in the directory in which it appears.
+    This file will describe the copyrights, license, and restrictions which apply
+    to that code.
+
+    The disclaimer of warranty in the University of Illinois Open Source License
+    applies to all code in the LLVM Distribution, and nothing in any of the
+    other licenses gives permission to use the names of the LLVM Team or the
+    University of Illinois to endorse or promote products derived from this
+    Software.
+
+    The following pieces of software have additional or alternate copyrights,
+    licenses, and/or restrictions:
+
+    Program             Directory
+    -------             ---------
+    <none yet>
+
+    =======================================================================================
+
+    13. MKL BLAS
+    For details, see, [Intel® Simplified license](https://software.intel.com/en-us/license/intel-simplified-software-license).
+
+    Copyright (c) 2018 Intel Corporation.
+
+    Use and Redistribution.  You may use and redistribute the software (the “Software”), without modification, provided the following conditions are met:
+
+    * Redistributions must reproduce the above copyright notice and the following terms of use in the Software and in the documentation and/or other materials provided with the distribution.
+
+    * Neither the name of Intel nor the names of its suppliers may be used to endorse or promote products derived from this Software without specific prior written permission.
+
+    * No reverse engineering, decompilation, or disassembly of this Software is permitted.
+
+    Limited patent license.  Intel grants you a world-wide, royalty-free, non-exclusive license under patents it now or hereafter owns or controls to make, have made, use, import, offer to sell and sell (“Utilize”) this Software, but solely to the extent that any such patent is necessary to Utilize the Software alone. The patent license shall not apply to any combinations which include this software.  No hardware per se is licensed hereunder.
+
+    Third party and other Intel programs.  “Third Party Programs” are the files listed in the “third-party-programs.txt” text file that is included with the Software and may include Intel programs under separate license terms. Third Party Programs, even if included with the distribution of the Materials, are governed by separate license terms and those license terms solely govern your use of those programs. 
+
+    DISCLAIMER.  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT ARE DISCLAIMED. THIS SOFTWARE IS NOT INTENDED FOR USE IN SYSTEMS OR APPLICATIONS WHERE FAILURE OF THE SOFTWARE MAY CAUSE PERSONAL INJURY OR DEATH AND YOU AGREE THAT YOU ARE FULLY RESPONSIBLE FOR ANY CLAIMS, COSTS, DAMAGES, EXPENSES, AND ATTORNEYS’ FEES ARISING OUT OF ANY SUCH USE, EVEN IF ANY CLAIM ALLEGES THAT INTEL WAS NEGLIGENT REGARDING THE DESIGN OR MANUFACTURE OF THE MATERIALS.
+
+    LIMITATION OF LIABILITY. IN NO EVENT WILL INTEL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. YOU AGREE TO INDEMNIFY AND HOLD INTEL HARMLESS AGAINST ANY CLAIMS AND EXPENSES RESULTING FROM YOUR USE OR UNAUTHORIZED USE OF THE SOFTWARE.
+
+    No support.  Intel may make changes to the Software, at any time without notice, and is not obligated to support, update or provide training for the Software.
+
+    Termination. Intel may terminate your right to use the Software in the event of your breach of this Agreement and you fail to cure the breach within a reasonable period of time.
+
+    Feedback.  Should you provide Intel with comments, modifications, corrections, enhancements or other input (“Feedback”) related to the Software Intel will be free to use, disclose, reproduce, license or otherwise distribute or exploit the Feedback in its sole discretion without any obligations or restrictions of any kind, including without limitation, intellectual property rights or licensing obligations.
+
+    Compliance with laws.  You agree to comply with all relevant laws and regulations governing your use, transfer, import or export (or prohibition thereof) of the Software.
+
+    Governing law.  All disputes will be governed by the laws of the United States of America and the State of Delaware without reference to conflict of law principles and subject to the exclusive jurisdiction of the state or federal courts sitting in the State of Delaware, and each party agrees that it submits to the personal jurisdiction and venue of those courts and waives any objections. The United Nations Convention on Contracts for the International Sale of Goods (1980) is specifically excluded and will not apply to the Software.
+
+    *Other names and brands may be claimed as the property of others.
+
+    =======================================================================================
+
+    14. FindJeMalloc.cmake
+    For details, see, cmake/Modules/FindJeMalloc.cmake
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+
+    Copyright (c)      2014 Thomas Heller
+    Copyright (c) 2007-2012 Hartmut Kaiser
+    Copyright (c) 2010-2011 Matt Anderson
+    Copyright (c) 2011      Bryce Lelbach
+
+    Distributed under the Boost Software License, Version 1.0.
+    Boost Software License - Version 1.0 - August 17th, 2003
+
+    Permission is hereby granted, free of charge, to any person or organization
+    obtaining a copy of the software and accompanying documentation covered by
+    this license (the "Software") to use, reproduce, display, distribute,
+    execute, and transmit the Software, and to prepare derivative works of the
+    Software, and to permit third-parties to whom the Software is furnished to
+    do so, all subject to the following:
+
+    The copyright notices in the Software and this entire statement, including
+    the above license grant, this restriction and the following disclaimer,
+    must be included in all copies of the Software, in whole or in part, and
+    all derivative works of the Software, unless such copies or derivative
+    works are solely in the form of machine-executable object code generated by
+    a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
diff --git a/MKLDNN_README.md b/MKLDNN_README.md
index 2618d23388e7..ecb721e5fffe 100644
--- a/MKLDNN_README.md
+++ b/MKLDNN_README.md
@@ -1,9 +1,9 @@
 # Build/Install MXNet with MKL-DNN
 
-A better training and inference perforamce are expected to achieved on Intel-Architecture CPUs with MXNET built with [Intel MKL-DNN](/~https://github.com/intel/mkl-dnn) on multiple operating system, including Linux, Windows and MacOS.
-In the following sections, you will find building instructions for MXNET with Intel MKL-DNN on Linux, MacOS and Windows.
+A better training and inference performance is expected to be achieved on Intel-Architecture CPUs with MXNet built with [Intel MKL-DNN](https://github.com/intel/mkl-dnn) on multiple operating systems, including Linux, Windows and MacOS.
+In the following sections, you will find build instructions for MXNet with Intel MKL-DNN on Linux, MacOS and Windows.
 
-The detailed performance data collected on Intel Xeon CPU with MXNET built with Intel MKL-DNN can be found at [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
+The detailed performance data collected on Intel Xeon CPU with MXNet built with Intel MKL-DNN can be found [here](https://mxnet.incubator.apache.org/faq/perf.html#intel-cpu).
 
 
 <h2 id="0">Contents</h2>
@@ -78,12 +78,12 @@ cd incubator-mxnet
 ### Build MXNet with MKL-DNN
 
 ```
-LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew --prefix llvm)/bin/clang++ CXX=$(brew --prefix llvm)/bin/clang++ USE_OPENCV=1 USE_OPENMP=1 USE_MKLDNN=1 USE_BLAS=apple USE_PROFILER=1
+LIBRARY_PATH=$(brew --prefix llvm)/lib/ make -j $(sysctl -n hw.ncpu) CC=$(brew --prefix llvm)/bin/clang CXX=$(brew --prefix llvm)/bin/clang++ USE_OPENCV=1 USE_OPENMP=1 USE_MKLDNN=1 USE_BLAS=apple USE_PROFILER=1
 ```
 
 <h2 id="3">Windows</h2>
 
-On Windows, you can use [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNET with Intel MKL-DNN.
+On Windows, you can use [Microsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) and [Microsoft Visual Studio 2017](https://www.visualstudio.com/downloads/) to compile MXNet with Intel MKL-DNN.
 [Micrsoft Visual Studio 2015](https://www.visualstudio.com/vs/older-downloads/) is recommended.
 
 **Visual Studio 2015**
@@ -123,7 +123,7 @@ cmake -G "Visual Studio 14 Win64" .. -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -D
 These commands produce a library called ```libmxnet.dll``` in the ```./build/Release/``` or ```./build/Debug``` folder.
 Also ```libmkldnn.dll``` with be in the ```./build/3rdparty/mkldnn/src/Release/```
 
-6. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+6. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise you will come across `Not Found Dependencies` when loading MXNet.
 
 **Visual Studio 2017**
 
@@ -177,7 +177,7 @@ cmake -G "Visual Studio 15 2017 Win64" .. -T host=x64 -DUSE_CUDA=0 -DUSE_CUDNN=0
 msbuild mxnet.sln /p:Configuration=Release;Platform=x64 /maxcpucount
 ```
 
-9. Make sure that all the dll files used above(such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc) are added to the system PATH. For convinence, you can put all of them to ```\windows\system32```. Or you will come across `Not Found Dependencies` when loading mxnet.
+9. Make sure that all the dll files used above (such as `libmkldnn.dll`, `libmklml.dll`, `libiomp5.dll`, `libopenblas.dll`, etc.) are added to the system PATH. For convenience, you can put all of them in ```\windows\system32```. Otherwise you will come across `Not Found Dependencies` when loading MXNet.
 
 <h2 id="4">Verify MXNet with python</h2>
 
diff --git a/Makefile b/Makefile
index 31722e86c085..42010e42b08c 100644
--- a/Makefile
+++ b/Makefile
@@ -18,12 +18,11 @@
 ROOTDIR = $(CURDIR)
 TPARTYDIR = $(ROOTDIR)/3rdparty
 
-SCALA_VERSION_PROFILE := scala-2.11
-
 ifeq ($(OS),Windows_NT)
 	UNAME_S := Windows
 else
 	UNAME_S := $(shell uname -s)
+	UNAME_P := $(shell uname -p)
 endif
 
 ifndef config
@@ -60,6 +59,16 @@ endif
 # use customized config file
 include $(config)
 
+ifndef USE_MKLDNN
+ifneq ($(UNAME_S), Darwin)
+ifneq ($(UNAME_S), Windows)
+ifeq ($(UNAME_P), x86_64)
+	USE_MKLDNN=1
+endif
+endif
+endif
+endif
+
 ifeq ($(USE_MKL2017), 1)
 $(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.")
 	USE_MKLDNN=1
@@ -132,12 +141,7 @@ ifeq ($(USE_MKLDNN), 1)
 		LDFLAGS += -L$(MKLROOT)/lib
 	endif
 	CFLAGS += -I$(MKLDNNROOT)/include
-	# MKLDNN but to needs to be dynamically linked for windows as not all VS compilers support static linking
-	ifneq ($(UNAME_S), Windows)
-		LIB_DEP += $(MKLDNNROOT)/lib/libmkldnn.a
-	else
-		LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
-	endif
+	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
 # setup opencv
@@ -402,18 +406,13 @@ PLUGIN_OBJ =
 PLUGIN_CUOBJ =
 include $(MXNET_PLUGINS)
 
-ifeq ($(UNAME_S), Windows)
-	# TODO(yizhi) currently scala package does not support windows
-	SCALA_PKG_PROFILE := windows
-else
+ifneq ($(UNAME_S), Windows)
 	ifeq ($(UNAME_S), Darwin)
 		WHOLE_ARCH= -all_load
 		NO_WHOLE_ARCH= -noall_load
-		SCALA_PKG_PROFILE := osx-x86_64
 	else
 		WHOLE_ARCH= --whole-archive
 		NO_WHOLE_ARCH= --no-whole-archive
-		SCALA_PKG_PROFILE := linux-x86_64
 	endif
 endif
 
@@ -432,7 +431,6 @@ ifeq ($(USE_CUDA), 1)
 	# Make sure to add stubs as fallback in order to be able to build
 	# without full CUDA install (especially if run without nvidia-docker)
 	LDFLAGS += -L/usr/local/cuda/lib64/stubs
-	SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-gpu
 	ifeq ($(USE_NCCL), 1)
 		ifneq ($(USE_NCCL_PATH), NONE)
 			CFLAGS += -I$(USE_NCCL_PATH)/include
@@ -444,7 +442,6 @@ ifeq ($(USE_CUDA), 1)
 		CFLAGS += -DMXNET_USE_NCCL=0
 	endif
 else
-	SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-cpu
 	CFLAGS += -DMXNET_USE_NCCL=0
 endif
 
@@ -459,6 +456,10 @@ else
 	CFLAGS += -DMXNET_USE_LIBJPEG_TURBO=0
 endif
 
+ifeq ($(CI), 1)
+	MAVEN_ARGS := -B
+endif
+
 # For quick compile test, used smaller subset
 ALLX_DEP= $(ALL_DEP)
 
@@ -468,7 +469,7 @@ build/src/%.o: src/%.cc | mkldnn
 
 build/src/%_gpu.o: src/%.cu | mkldnn
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
+	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" --generate-dependencies -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" $<
 
 # A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers.
@@ -484,7 +485,7 @@ build/plugin/%.o: plugin/%.cc
 
 %_gpu.o: %.cu
 	@mkdir -p $(@D)
-	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" -M -MT $*_gpu.o $< >$*_gpu.d
+	$(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" --generate-dependencies -MT $*_gpu.o $< >$*_gpu.d
 	$(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS) -Isrc/operator" $<
 
 %.o: %.cc $(CORE_INC)
@@ -590,19 +591,19 @@ rpkg:
 	cp -rf lib/libmxnet.so R-package/inst/libs
 	mkdir -p R-package/inst/include
 	cp -rf include/* R-package/inst/include
+	rm R-package/inst/include/dmlc
+	rm R-package/inst/include/nnvm
 	cp -rf 3rdparty/dmlc-core/include/* R-package/inst/include/
 	cp -rf 3rdparty/tvm/nnvm/include/* R-package/inst/include
 	Rscript -e "if(!require(devtools)){install.packages('devtools', repo = 'https://cloud.r-project.org/')}"
+	Rscript -e "if(!require(roxygen2)||packageVersion('roxygen2') < '6.1.1'){install.packages('roxygen2', repo = 'https://cloud.r-project.org/')}"
 	Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cloud.r-project.org/')); install_deps(pkg='R-package', dependencies = TRUE)"
 	cp R-package/dummy.NAMESPACE R-package/NAMESPACE
 	echo "import(Rcpp)" >> R-package/NAMESPACE
 	R CMD INSTALL R-package
-	Rscript -e "if (!require('roxygen2')||packageVersion('roxygen2') < '5.0.1'){\
-	    devtools::install_version('roxygen2',version='5.0.1',\
-	    repos='https://cloud.r-project.org/',quiet=TRUE)}"
 	Rscript -e "require(mxnet); mxnet:::mxnet.export('R-package'); warnings()"
 	rm R-package/NAMESPACE
-	Rscript -e "require(roxygen2); roxygen2::roxygenise('R-package'); warnings()"
+	Rscript -e "devtools::document('R-package'); warnings()"
 	R CMD INSTALL R-package
 
 rpkgtest:
@@ -610,80 +611,22 @@ rpkgtest:
 	Rscript -e 'res<-covr:::package_coverage("R-package");fileConn<-file(paste("r-package_coverage_",toString(runif(1)),".json"));writeLines(covr:::to_codecov(res), fileConn);close(fileConn)'
 
 scalaclean:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn clean -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE))
-
-scalatestcompile:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn test-compile -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -Dcxx="$(CXX)" \
-		    -Dbuild.platform="$(SCALA_PKG_PROFILE)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dcurrent_libdir="$(ROOTDIR)/lib" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")
+	(cd $(ROOTDIR)/scala-package && mvn clean)
 
 scalapkg:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn package -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -Dcxx="$(CXX)" \
-		    -Dbuild.platform="$(SCALA_PKG_PROFILE)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dcurrent_libdir="$(ROOTDIR)/lib" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")
+	(cd $(ROOTDIR)/scala-package && mvn install -DskipTests)
+
+scalainstall:
+	(cd $(ROOTDIR)/scala-package && mvn install)
 
 scalaunittest:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn integration-test -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE),unittest -Dcxx="$(CXX)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a" $(SCALA_TEST_ARGS))
+	(cd $(ROOTDIR)/scala-package && mvn install)
 
 scalaintegrationtest:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn integration-test -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE),integrationtest -Dcxx="$(CXX)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a" $(SCALA_TEST_ARGS))
-
-scalainstall:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn install -P$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) -DskipTests=true -Dcxx="$(CXX)" \
-		    -Dbuild.platform="$(SCALA_PKG_PROFILE)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")
-
-scalarelease-dryrun:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn release:clean release:prepare -DdryRun=true -DautoVersionSubmodules=true \
-		-Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \
-		-Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""")
-
-scalarelease-prepare:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn release:clean release:prepare -DautoVersionSubmodules=true \
-		-Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \
-		-Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""")
-
-scalarelease-perform:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn release:perform -DautoVersionSubmodules=true \
-		-Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \
-		-Darguments=""-Dbuild\.platform=\""$(SCALA_PKG_PROFILE)\""\ -DskipTests=true\ -Dcflags=\""$(CFLAGS)\""\ -Dcxx=\""$(CXX)\""\ -Dldflags=\""$(LDFLAGS)\""\ -Dlddeps=\""$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a\"""")
-
-scaladeploy:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn deploy -Papache-release,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \-DskipTests=true -Dcxx="$(CXX)" \
-		    -Dbuild.platform="$(SCALA_PKG_PROFILE)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")
-
-scaladeploylocal:
-	(cd $(ROOTDIR)/scala-package && \
-		mvn deploy -Papache-release,deployLocal,$(SCALA_PKG_PROFILE),$(SCALA_VERSION_PROFILE) \-DskipTests=true -Dcxx="$(CXX)" \
-		  -DaltDeploymentRepository=snapshot-repo::default::file:local-snapshot \
-		  -Dgpg.skip \
-		  -Dbuild.platform="$(SCALA_PKG_PROFILE)" \
-			-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-			-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")
+	(cd $(ROOTDIR)/scala-package && mvn integration-test -DskipTests=false)
 
 jnilint:
-	3rdparty/dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src
+	3rdparty/dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src --exclude_path scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h
 
 rclean:
 	$(RM) -r R-package/src/image_recordio.h R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
@@ -691,7 +634,7 @@ rclean:
 
 ifneq ($(EXTRA_OPERATORS),)
 clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN)
-	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ 
+	$(RM) -r build lib bin deps *~ */*~ */*/*~ */*/*/*~ 
 	cd $(DMLC_CORE); $(MAKE) clean; cd -
 	cd $(PS_PATH); $(MAKE) clean; cd -
 	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/NEWS.md b/NEWS.md
index 68cb2b053aec..f06cc35d8b0f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,579 @@
 MXNet Change Log
 ================
 
+## 1.4.0
+
+- [New Features](#new-features)
+  * [Java Inference API](#java-inference-api)
+  * [Julia API](#julia-api)
+  * [Control Flow Operators (experimental)](#control-flow-operators--experimental-)
+  * [SVRG Optimization](#svrg-optimization)
+  * [Subgraph API (experimental)](#subgraph-api--experimental-)
+  * [JVM Memory Management](#jvm-memory-management)
+  * [Topology-aware AllReduce (experimental)](#topology-aware-allreduce--experimental-)
+  * [MKLDNN backend: Graph optimization and Quantization (experimental)](#mkldnn-backend--graph-optimization-and-quantization--experimental-)
+    + [Graph Optimization](#graph-optimization)
+    + [Quantization](#quantization)
+- [New Operators](#new-operators)
+- [Feature improvements](#feature-improvements)
+  * [Operator](#operator)
+  * [Optimizer](#optimizer)
+  * [Sparse](#sparse)
+  * [ONNX](#onnx)
+  * [MKLDNN](#mkldnn)
+  * [Inference](#inference)
+  * [Other](#other)
+- [Frontend API updates](#frontend-api-updates)
+  * [Gluon](#gluon)
+  * [Symbol](#symbol)
+- [Language API updates](#language-api-updates)
+  * [Java](#java)
+  * [R](#r)
+  * [Scala](#scala)
+  * [Clojure](#clojure)
+  * [Perl](#perl)
+  * [Julia](#julia)
+- [Performance benchmarks and improvements](#performance-benchmarks-and-improvements)
+- [Bug fixes](#bug-fixes)
+- [Licensing updates](#licensing-updates)
+- [Improvements](#improvements)
+  * [Tutorial](#tutorial)
+  * [Example](#example)
+  * [Documentation](#documentation)
+  * [Website](#website)
+  * [MXNet Distributions](#mxnet-distributions)
+  * [Installation](#installation)
+  * [Build and CI](#build-and-ci)
+  * [3rd party](#3rd-party)
+    + [TVM:](#tvm-)
+    + [CUDNN:](#cudnn-)
+    + [Horovod:](#horovod-)
+- [Deprications](#deprications)
+- [Other](#other-1)
+- [How to build MXNet](#how-to-build-mxnet)
+- [List of submodules used by Apache MXNet (Incubating) and when they were updated last](#list-of-submodules-used-by-apache-mxnet--incubating--and-when-they-were-updated-last)
+### New Features
+#### Java Inference API
+
+Model inference is often managed in a production ecosystem using primarily Java/Scala tools and frameworks. This release seeks to alleviate the need for software engineers to write custom MXNet wrappers to fit their production environment.
+
+Inference on a trained model has a couple of common use cases:
+
+  1. Real-time or Online Inference - tasks that require immediate feedback, such as fraud detection
+  2. Batch or Offline Inference - tasks that don't require immediate feedback; these are use cases where you have massive amounts of data and want to run inference or pre-compute inference results
+Real-time Inference is often performed and deployed on popular web frameworks such as Tomcat, Netty, Jetty, etc., all of which use Java.
+Batch Inference is often performed on big data platforms such as Spark using Scala or Java.
+
+With this project, we had the following goals:
+* Build a new set of APIs that are Java friendly, compatible with Java 7+, and easy to use for inference.
+* Lower the barrier to entry of consuming MXNet for production use cases.
+
+More details can be found at the [Java Inference API document](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Java+Inference+API).
+
+#### Julia API
+
+MXNet.jl is the Julia package of Apache MXNet. MXNet.jl brings flexible and efficient GPU computing and state-of-the-art deep learning to Julia. Some highlights of its features include:
+
+  * Efficient tensor/matrix computation across multiple devices, including multiple CPUs, GPUs and distributed server nodes.
+  * Flexible symbolic manipulation to compose and construct state-of-the-art deep learning models.
+
+#### Control Flow Operators (experimental)
+
+Today we observe more and more dynamic neural network models, especially in the fields of natural language processing and graph analysis. The dynamics in these models come from multiple sources, including:
+
+  * Models are expressed with control flow, such as conditions and loops;
+  * NDArrays in a model may have dynamic shapes, meaning the NDArrays of a model or some of the NDArrays have different shapes for different batches;
+  * Models may want to use more dynamic data structures, such as lists or dictionaries.
+It's natural to express dynamic models in frameworks with an imperative programming interface (e.g., Gluon, PyTorch, TensorFlow Eager). In this kind of interface, developers can use Python control flows, or NDArrays with any shape at any moment, or use Python lists and dictionaries to store data as they want. The problem with this approach is that it is highly dependent on the originating front-end programming language (mainly Python). A model implemented in one language can only run in the same language.
+
+A common use case is that machine learning scientists want to develop their models in Python, whereas engineers who deploy the models usually have to use a different "production" language (e.g., Java or C). Gluon tries to close the gap between the model development and production deployment. Machine learning scientists design and implement their models in Python with the imperative interface, and then Gluon converts the implementations from imperative to symbolic by invoking `hybridize()` for model exporting.
+
+The goal of this project is to enhance Gluon to turn a dynamic neural network into a static computation graph. The dynamic control flows are expressed by control flow operators with Gluon hybridization, and these are exported for deployment.
+
+More information can be found at [Optimize dynamic neural network models with control flow operators](https://cwiki.apache.org/confluence/display/MXNET/Optimize+dynamic+neural+network+models+with+control+flow+operators)
+
+#### SVRG Optimization
+
+SVRG stands for Stochastic Variance Reduced Gradient, which was first introduced in the paper [Accelerating Stochastic Gradient Descent using Predictive Variance Reduction](https://papers.nips.cc/paper/4937-accelerating-stochastic-gradient-descent-using-predictive-variance-reduction.pdf) in 2013. It is an optimization technique that complements SGD.
+
+SGD is known for large scale optimization, but it suffers from slow convergence asymptotically due to the inherent variance. SGD approximates the full gradient using a small batch of samples which introduces variance. In order to converge faster, SGD often needs to start with a smaller learning rate.
+
+SVRG remedies the slow convergence problem by keeping a version of the estimated weights that is close to the optimal parameters and maintaining the average of the full gradient over the full pass of data. The average of the full gradients of all data is calculated w.r.t. the parameters of the last mth epoch. It has provable guarantees for strongly convex smooth functions; a detailed proof can be found in section 3 of the [paper](https://papers.nips.cc/paper/4937-accelerating-stochastic-gradient-descent-using-predictive-variance-reduction.pdf). SVRG uses a different update rule than SGD: gradients w.r.t. current parameters minus gradients w.r.t. parameters from the last mth epoch, plus the average of gradients over all data.
+
+Key Characteristics of SVRG:
+
+  * Explicit variance reduction
+  * Ability to use relatively large learning rate compared to SGD, which leads to faster convergence.
+More details can be found at [SVRG Optimization in MXNet Python Module](https://cwiki.apache.org/confluence/display/MXNET/SVRG+Optimization+in+MXNet+Python+Module)
+
+#### Subgraph API (experimental)
+
+MXNet can integrate with many different kinds of backend libraries, including TVM, MKLDNN, TensorRT, Intel nGraph and more. In general, these backends support a limited number of operators, so running computation in a model usually involves an interaction between backend-supported operators and MXNet operators. These backend libraries share some common requirements:
+
+TVM, MKLDNN and nGraph use customized data formats. Interaction between these backends and MXNet requires data format conversion.
+TVM, MKLDNN, TensorRT and nGraph fuse operators.
+Integration with these backends should happen in the granularity of subgraphs instead of in the granularity of operators. To fuse operators, it's obvious that we need to divide a graph into subgraphs so that the operators in a subgraph can be fused into a single operator. To handle customized data formats, we should partition a computation graph into subgraphs as well. Each subgraph contains only TVM, MKLDNN or nGraph operators. In this way, MXNet converts data formats only when entering such a subgraph, and the operators inside a subgraph handle format conversion themselves if necessary. This makes interaction of TVM and MKLDNN with MXNet much easier. Neither the MXNet executor nor the MXNet operators need to deal with customized data formats. Even though invoking these libraries from MXNet requires similar steps, the partitioning rule and the subgraph execution of these backends can be different. As such, we define the following interface for backends to customize graph partitioning and subgraph execution inside an operator. More details can be found at PR 12157 and [Subgraph API](https://cwiki.apache.org/confluence/display/MXNET/Unified+integration+with+external+backend+libraries).
+
+#### JVM Memory Management
+
+The MXNet Scala and Java APIs use native memory to manage NDArray, Symbol, Executor and DataIterators using MXNet's internal C APIs.  The C APIs provide appropriate interfaces to create, access and free these objects. MXNet Scala has corresponding Wrappers and APIs that have pointer references to the native memory. Before this project, JVM users (e.g. Scala, Clojure, or Java) of MXNet had to manage MXNet objects manually using the dispose pattern. There are a few usability problems with this approach:
+
+* Users have to track the MXNet objects manually and remember to call `dispose`. This is not Java idiomatic and not user friendly. Quoting a user: "this feels like I am writing C++ code which I stopped ages ago".
+* Leads to memory leaks if `dispose` is not called.
+* Many objects in MXNet-Scala are managed in native memory, needing to use `dispose` on them as well.
+* Bloated code with `dispose()` methods.
+* Hard to debug memory-leaks.
+Goals of the project are:
+* Provide MXNet JVM users automated memory management that can release native memory when there are no references to JVM objects.
+* Provide automated memory management for both GPU and CPU memory without performance degradation.  More details can be found here: [JVM Memory Management](https://cwiki.apache.org/confluence/display/MXNET/JVM+Memory+Management)
+
+#### Topology-aware AllReduce (experimental)
+For distributed training, the `Reduce` communication patterns used by NCCL and MXNet are not optimal for small batch sizes. The `Topology-aware AllReduce` approach is based on the idea of using trees to perform the `Reduce` and `Broadcast` operations. We can use the idea of minimum spanning trees to do a binary tree `Reduce` communication pattern to improve distributed training following this paper by Wang, Li, Edo and Smola [1]. Our strategy is to use:
+
+  * a single tree (latency-optimal for small messages) to handle `Reduce` on small messages
+  * multiple trees (bandwidth-optimal for large messages) to handle `Reduce` on large messages
+
+More details can be found here: [Topology-aware AllReduce](https://cwiki.apache.org/confluence/display/MXNET/Single+machine+All+Reduce+Topology-aware+Communication)
+Note: This is an experimental feature and has known problems - see [13341](https://github.com/apache/incubator-mxnet/issues/13341). Please help to contribute to improve the robustness of the feature.
+
+#### MKLDNN backend: Graph optimization and Quantization (experimental)
+
+Two advanced features, graph optimization (operator fusion) and reduced-precision (INT8) computation, are introduced to MKLDNN backend in this release ([#12530](https://github.com/apache/incubator-mxnet/pull/12530), [#13297](https://github.com/apache/incubator-mxnet/pull/13297), [#13260](https://github.com/apache/incubator-mxnet/pull/13260)).
+These features significantly boost the inference performance on CPU (up to 4X) for a broad range of deep learning topologies. Currently, this feature is only available for inference on platforms with [supported Intel CPUs](https://github.com/intel/mkl-dnn#system-requirements).
+
+##### Graph Optimization
+The MKLDNN backend takes advantage of the MXNet subgraph API to implement most of the possible operator fusions for inference, such as Convolution + ReLU, Batch Normalization folding, etc. When using the mxnet-mkl package, users can easily enable this feature by running `export MXNET_SUBGRAPH_BACKEND=MKLDNN`.
+
+##### Quantization
+Performance of reduced-precision (INT8) computation is also dramatically improved after the graph optimization feature is applied on CPU Platforms. Various models are supported and can benefit from reduced-precision computation, including symbolic models, Gluon models and even custom models. Users can run most of the pre-trained models with only a few lines of commands and a new quantization script imagenet_gen_qsym_mkldnn.py. The observed accuracy loss is less than 0.5% for popular CNN networks, like ResNet-50, Inception-BN, MobileNet, etc.
+
+Please find detailed information and performance/accuracy numbers here: [MKLDNN README](https://github.com/apache/incubator-mxnet/blob/master/MKLDNN_README.md), [quantization README](https://github.com/apache/incubator-mxnet/tree/master/example/quantization#1) and [design proposal](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Graph+Optimization+and+Quantization+based+on+subgraph+and+MKL-DNN)
+
+### New Operators
+
+* Add trigonometric operators (#12424)
+* [MXNET-807] Support integer label type in ctc_loss operator (#12468)
+* [MXNET-876] make CachedOp a normal operator (#11641)
+* Add index_copy() operator (#12810)
+* Fix getnnz operator for CSR matrix (#12908) - issue #12872
+* [MXNET-1173] Debug operators - isfinite, isinf and isnan (#12967)
+* Add sample_like operators (#13034)
+* Add gauss err function operator (#13229)
+* [MXNET-1030] Enhanced Cosine Embedding Loss (#12750)
+* Add bytearray support back to imdecode (#12855, #12868) (#12912)
+* Add Psroipooling CPU implementation (#12738)
+
+### Feature improvements
+#### Operator
+* [MXNET-912] Refactoring ctc loss operator (#12637)
+* Refactor L2_normalization (#13059)
+* Customized and faster `TakeOpForward` operator on CPU (#12997)
+* Allow stop of arange operator to be inferred from dims. (#12064)
+* Make check_isfinite, check_scale optional in clip_global_norm (#12042) add FListInputNames attribute to softmax_cross_entropy (#12701) [MXNET-867] Pooling1D with same padding (#12594)
+* Add support for more req patterns for bilinear sampler backward (#12386) [MXNET-882] Support for N-d arrays added to diag op. (#12430)
+
+#### Optimizer
+* Add a special version of Adagrad optimizer with row-wise learning rate (#12365)
+* Add a Python SVRGModule for performing SVRG Optimization Logic (#12376)
+
+#### Sparse
+
+* Fall back when sparse arrays are passed to MKLDNN-enabled operators (#11664)
+* Add Sparse support for logic operators (#12860)
+* Add Sparse support for take(csr, axis=0)  (#12889)
+
+#### ONNX
+
+* ONNX export - Clip operator (#12457)
+* ONNX version update from 1.2.1 to 1.3 in CI (#12633)
+* Use modern ONNX API to load a model from file (#12777)
+* [MXNET-892] ONNX export/import: DepthToSpace, SpaceToDepth operators (#12731)
+* ONNX export: Fully connected operator w/o bias, ReduceSum, Square (#12646)
+* ONNX export/import: Selu (#12785)
+* ONNX export: Cleanup (#12878)
+* [MXNET-892] ONNX export/import: DepthToSpace, SpaceToDepth operators (#12731)
+* ONNX export: Scalar, Reshape - Set appropriate tensor type (#13067)
+* [MXNET-886] ONNX export: HardSigmoid, Less, Greater, Equal (#12812)
+
+#### MKLDNN
+
+* MKLDNN Forward FullyConnected  op cache (#11611)
+* [MXNET-753] Fallback when using non-MKLDNN supported operators (#12019)
+* MKLDNN Backward op cache (#11301)
+* Implement mkldnn convolution fusion and quantization. (#12530)
+* Improve mkldnn fallback. (#12663)
+* Update MKL-DNN dependency (#12953)
+* Update MKLML dependency (#13181)
+* [MXNET-33] Enhance mkldnn pooling to support full convention (#11047)
+
+#### Inference
+* [MXNET-910] Multithreading inference. (#12456)
+* Tweaked the copy in c_predict_api.h (#12600)
+
+#### Other
+* support for upper triangular matrices in linalg (#12904)
+* Introduce Random module / Refactor code generation (#13038)
+* [MXNET-779]Add DLPack Transformation API (#12047)
+* Draw label name next to corresponding bounding boxes when the mapping of id to names is specified (#9496)
+* Track epoch metric separately (#12182)
+* Set correct update on kvstore flag in dist_device_sync mode (#12786)
+
+### Frontend API updates
+
+#### Gluon
+
+* Update basic_layers.py (#13299)
+* Gluon LSTM Projection and Clipping Support (#13056)
+* Make Gluon download function to be atomic (#12572)
+* [MXNET-1004] Poisson Negative Log Likelihood loss (#12697)
+* Add activation information for `mxnet.gluon.nn._Conv` (#12354)
+* Gluon DataLoader: avoid recursionlimit error (#12622)
+
+#### Symbol
+* Addressed duplicate object reference issues (#13214)
+* Throw exception if MXSymbolInferShape fails (#12733)
+* Infer dtype in SymbolBlock import from input symbol (#12412)
+
+### Language API updates
+#### Java
+* [MXNET-1198] MXNet Java API (#13162)
+
+#### R
+* Refactor R Optimizers to fix memory leak - 11374
+* Add new Vignettes to the R package
+  * Char-level Language modeling - 12670
+  * Multidimensional Time series forecasting - 12664
+* Fix broken Examples and tutorials
+  * Tutorial on neural network introduction - 12117
+  * CGAN example - 12283
+  * Test classification with LSTMs - 12263
+
+#### Scala
+* Explain the details for Scala Experimental (#12348)
+* [MXNET-716] Adding Scala Inference Benchmarks (#12721)
+* [MXNET-716][MIRROR #12723] Scala Benchmark Extension pack (#12758)
+* NativeResource Management in Scala (#12647)
+* Ignore generated Scala files (#12928)
+* Use ResourceScope in Model/Trainer/FeedForward.scala (#12882)
+* [MXNET-1180] Scala Image API (#12995)
+* Update log4j version of Scala package (#13131)
+* Review require() usages to add meaningful messages (#12570)
+* Fix Scala readme (#13082)
+
+#### Clojure
+* Introduction to Clojure-MXNet video link (#12754)
+* Improve the Clojure Package README to Make it Easier to Get Started (#12881)
+* MXNET-873 - Bring Clojure Package Inline with New DataDesc and Layout in Scala Package (#12387)
+* Port of Scala Image API to Clojure (#13107)
+
+#### Perl
+* [MXNET-1026] [Perl] Sync with recent changes in Python's API (#12739)
+
+#### Julia
+* Import Julia binding (#10149); usage instructions are available at https://github.com/apache/incubator-mxnet/tree/master/julia
+
+### Performance benchmarks and improvements
+* Update mshadow for omp acceleration when nvcc is not present  (#12674)
+* [MXNET-860] Avoid implicit double conversions (#12361)
+* Add more models to benchmark_score (#12780)
+* Add resnet50-v1 to benchmark_score (#12595)
+
+### Bug fixes
+* Fix for #10920 -  increase tolerance for sparse dot (#12527)
+* [MXNET-1234] Fix shape inference problems in Activation backward (#13409)
+* Fix a bug in `where` op with 1-D input (#12325)
+* [MXNET-825] Fix CGAN R Example with MNIST dataset (#12283)
+* [MXNET-535] Fix bugs in LR Schedulers and add warmup (#11234)
+* Fix speech recognition example (#12291)
+* Fix bug in 'device' type kvstore (#12350)
+* fix search result 404s (#12414)
+* Fix help in imread (#12420)
+* Fix render issue on &lt; and &gt; (#12482)
+* [MXNET-853] Fix for smooth_l1 operator scalar default value (#12284)
+* Fix subscribe links, remove disabled icons (#12474)
+* Fix broken URLs (#12508)
+* Fix/public internal header (#12374)
+* Fix lazy record io when used with dataloader and multi_worker > 0 (#12554)
+* Fix error in try/finally block for blc (#12561)
+* Add cudnn_off parameter to SpatialTransformer Op and fix the inconsistency between CPU & GPU code (#12557)
+* [MXNET-798] Fix the dtype cast from non float32 in Gradient computation (#12290)
+* Fix CodeCovs proper commit detection (#12551)
+* Add TensorRT tutorial to index and fix ToC (#12587)
+* Fixed typo in c_predict_api.cc (#12601)
+* Fix typo in profiler.h (#12599)
+* Fixed NoSuchMethodError for Jenkins Job for MBCC (#12618)
+* [MXNET-922] Fix memleak in profiler (#12499)
+* [MXNET-969] Fix buffer overflow in RNNOp (#12603)
+*  Fixed param coercion of clojure executor/forward (#12627) (#12630)
+* Fix version dropdown behavior (#12632)
+* Fix reference to wrong function (#12644)
+* Fix the location of the tutorial of control flow operators (#12638)
+* Fix issue 12613 (#12614)
+* [MXNET-780] Fix exception handling bug (#12051)
+* Fix bug in prelu, issue 12061 (#12660)
+* [MXNET-833] [R] Char-level RNN tutorial fix (#12670)
+* Fix static / dynamic linking of gperftools and jemalloc (#12714)
+* Fix #12672, importing numpy scalars (zero-dimensional arrays) (#12678)
+* [MXNET-623] Fixing an integer overflow bug in large NDArray (#11742)
+* Fix benchmark on control flow operators (#12693)
+* Fix regression in MKLDNN caused by PR 12019 (#12740)
+* Fixed broken link for Baidu's WARP CTC (#12774)
+* Fix CNN visualization tutorial (#12719)
+* [MXNET-979] Add fix_beta support in BatchNorm (#12625)
+* R fix metric shape (#12776)
+* Revert [MXNET-979] Add fix_beta support in BatchNorm (#12625) (#12789)
+* Fix mismatch shapes (#12793)
+* Fixed symbols naming in RNNCell, LSTMCell, GRUCell (#12794)
+* Fixed `__setattr__` method of `_MXClassPropertyMetaClass` (#12811)
+* Fixed regex for matching platform type in Scala Benchmark scripts (#12826)
+* Fix broken links (#12856)
+* Fix Flaky Topk (#12798)
+* [MXNET-1033] Fix a bug in MultiboxTarget GPU implementation (#12840)
+* [MXNET-1107] Fix CPUPinned unexpected behaviour (#12031)
+* Fix `__all__` in optimizer/optimizer.py (#12886)
+* Fix Batch input issue with Scala Benchmark (#12848)
+* fix type inference in index_copy. (#12890)
+* Fix the paths issue for downloading script (#12913)
+* Fix indpt[0] for take(csr) (#12927)
+* Fix the bug of assigning large integer to NDArray (#12921)
+* Fix Sphinx errors for tutorials and install ToCs (#12945)
+* Fix variable name in tutorial code snippet (#13052)
+* Fix example for mxnet.nd.contrib.cond and fix typo in src/engine (#12954)
+* Fix a typo in operator guide (#13115)
+* Fix variational autoencoder example (#12880)
+* Fix problem with some OSX not handling the cast on imDecode (#13207)
+* [MXNET-953] Fix oob memory read (#12631)
+* Fix Sphinx error in ONNX file (#13251)
+* [Example] Fixing Gradcam implementation (#13196)
+* Fix train mnist for inception-bn and resnet (#13239)
+* Fix a bug in index_copy (#13218)
+* Fix Sphinx errors in box_nms (#13261)
+* Fix Sphinx errors (#13252)
+* Fix the cpp example compiler flag (#13293)
+* Made fixes to sparse.py and sparse.md (#13305)
+* [Example] Gradcam- Fixing a link (#13307)
+* Manually track num_max_thread (#12380)
+* [Issue #11912] throw mxnet exceptions when decoding invalid images. (#12999)
+* Undefined name: load_model() --> utils.load_model() (#12867)
+* Change the way NDArrayIter handle the last batch (#12545)
+* Add embedding to print_summary (#12796)
+* Allow foreach on input with 0 length (#12471)
+* [MXNET-360]auto convert str to bytes in img.imdecode when py3 (#10697)
+* Fix unpicklable transform_first on windows (#13686)
+
+### Licensing updates
+* Add license headers to R-package (#12559)
+* License header (#13178)
+* add url and license to clojure package project (#13304)
+
+### Improvements
+#### Tutorial
+* [MXNET-422] Distributed training tutorial (#10955)
+* Add a tutorial for control flow operators. (#12340)
+* Add tutorial Gotchas using NumPy (#12007)
+* Updated Symbol tutorial with Gluon (#12190)
+* Improve tutorial redirection (#12607)
+* Include missing import in TensorRT tutorial (#12609)
+* Update Operator Implementation Tutorial (#12230)
+* Add a tutorial for the subgraph API. (#12698)
+* Improve clojure tutorial (#12974)
+* Update scala intellij tutorial (#12827)
+* [Example] Gradcam consolidation in tutorial (#13255)
+* [MXNET-1203] Tutorial infogan  (#13144)
+* [MXNET-703] Add a TensorRT walkthrough (#12548)
+
+#### Example
+* Update C++ example so it is easier to run (#12397)
+* [MXNET-580] Add SN-GAN example (#12419)
+* [MXNET-637] Multidimensional LSTM example for MXNetR (#12664)
+* [MXNET-982] Provide example to illustrate usage of CSVIter in C++ API (#12636)
+* [MXNET-947] Expand scala imclassification example with resnet (#12639)
+* MKL-DNN Quantization Examples and README (#12808)
+* Extending the DCGAN example implemented by gluon API to provide a more straight-forward evaluation on the generated image (#12790)
+* [MXNET-1017] Updating the readme file for cpp-package and adding readme file for example directory. (#12773)
+* Update tree lstm example (#12960)
+* Update bilstm integer array sorting example (#12929)
+* Updated / Deleted some examples (#12968)
+* Update module example (#12961)
+* Update adversary attack generation example (#12918)
+* Update Gluon example folder (#12951)
+* Update dec example (#12950)
+* Updated capsnet example (#12934)
+* Updates to several examples (#13068)
+* Update multi-task learning example (#12964)
+* Remove obsolete memory cost example (#13235)
+* [Example] Update cpp example README (#13280)
+* [Example]update NER example readme on module prediction (#13184)
+* Update proposal_target.py (#12709)
+* Removing the re-size for validation data, which was breaking the validation accuracy of CIFAR training (#12362)
+* Update the README with instruction to redirect the user to gluon-cv (#13186)
+
+#### Documentation
+* Update ONNX API docs references (#12317)
+* Documentation update related to sparse support (#12367)
+* Edit shape.array doc and some style improvements (#12162)
+* Fixed docs/website build checkout bug (#12413)
+* Add Python API docs for test_utils and visualization (#12455)
+* Fix the installation doc for MKL-DNN backend (#12534)
+* Added comment to docs regarding ToTensor transform (#12186)
+* Pinned dockcross to a tag with fixed ABI for RPi (#12588)
+* Refine the documentation of im2rec (#12606)
+* Update and modify Windows docs (#12620)
+* update docs to list cmake required for build from source page (#12592)
+* update the distributed_training document (#12626)
+* Add docstring in im2rec.py (#12621)
+* [Doc] Change the description for pip packages (#12584)
+* Change dependencies documentation opencv2-->opencv (#12654)
+* Add documents for two new environment variables for memory pool. (#12668)
+* Scala Docs - Replace old Symbol api usages (#12759)
+* add/update infer_range docs (#12879)
+* Fix typo in formula in docstring for GRU cell and layer and add clarification to description (gluon.rnn) (#12896)
+* Fix the operator API documentation (#12942)
+* fix broken docs (#12871)
+* fix mac r install and windows python build from source docs (#12919)
+* Document the newly added env variable (#13049)
+* Add documentation on GPU performance on Quantization example (#13145)
+* Fix Sphinx python docstring formatting error. (#13177)
+* [Doc] Fix repo paths in Ubuntu build doc (#13101)
+* Fix Sphinx document parsing error. (#13195)
+* Fix #13090, Add image.imread to python API doc. (#13176)
+* Fix Sphinx docstring formatting error. (#13004, #13005, #13006) (#13175)
+* Fix #12944, Fix Sphinx python docstring formatting error. (#13174)
+* Fix #13013, Fix Sphinx python docstring error. (#13173)
+* Fixed Sparse astype doc string formatting error (#13171)
+* Fixed Documentation issues (#13215)
+* update the doc (#13205)
+* Fix Sphinx doc errors (#13170)
+* Fix Sphinx python docstring error: initializer.InitDesc (#12939) (#13148)
+* Fix Sphinx python docstring error: text contrib module (#12949) (#13149)
+* Fix Sphinx python docstrings (#13160)
+* Add Java API docs generation (#13071)
+* Fix scaladoc build errors (#13189)
+* Add missing documentations for getnnz (#13128)
+* Addressed ONNX module documentation warnings and added notes for short-form representation (#13259)
+* Doc fixes (#13256)
+* Addressed doc issues (#13165)
+* stop gap fix to let website builds through; scaladoc fix pending (#13298)
+* Fix Sphinx python docstring formatting error. (#13194)
+* Visualization doc fix. Added notes for shortform (#13291)
+* [Example] Add docstring for test optimizer and test score (#13286)
+* Fix descriptions in scaladocs for macro ndarray/symbol APIs (#13210)
+* Sphinx error reduction (#12323)
+* Sphinx errors in Gluon (#13275)
+* Update env_var.md (#12702)
+* Updated the Instructions for use of the label bot (#13192)
+* Added/changed file_name, brief description comments in some files (#13033)
+
+#### Website
+* adding apache conf promo to home page (#12347)
+* Consistent website theme and custom 404 (#12426)
+* update apachecon links to https (#12521)
+* [HOLD] 1.3.0 release website updates (#12509)
+* add mentions of the gluon toolkits and links to resources (#12667)
+* remove apachecon promo (#12695)
+* [MXNet-1002] Add GluonCV and NLP toolkits, Keras, and developer wiki to navigation (#12704)
+
+#### MXNet Distributions
+* Make the output of ci/docker/install/ubuntu_mklml.sh less verbose (#12422)
+* Fix tvm dependency for docker (#12479)
+* [MXNET-703] Add TensorRT runtime Dockerfile (#12549)
+* [MXNET-951] Python dockerfiles built on pip binaries and build/release script (#12556)
+* Change numpy version to 1.15.2 in python and docker install requirements (#12711)
+* Add mkl-dnn to docker install method (#12643)
+* Fix docker cleanup race condition (#13092)
+* Bugfix in ci/docker_cache.py (#13249)
+* Update PyPI version number (#11773)
+* update download links to apache distros (#12617)
+
+#### Installation
+* Installation instructions consolidation (#12388)
+* Refine mxnet python installation (#12696)
+* R install instructions update for macOS (#12832)
+* remove legacy installation of Roxygen2 5.0 and add R-specific clean target (#12993) (#12998)
+* Force APT cache update before executing install (#13285)
+* Make the Ubuntu scripts executable after download. (#12180)
+* replacing windows setup with newer instructions (#12504)
+* Updated download links and verification instructions (#12651)
+* Remove pip overwrites (#12604)
+
+#### Build and CI
+* [MXNET-908] Enable minimal OSX Travis build (#12462)
+* Use jom for parallel Windows builds (#12533)
+* [MXNET-950] Enable parallel R dep builds in CI (#12552)
+* Speed up CI Windows builds (#12563)
+* [MXNET-908] Speed up travis builds to avoid timeouts (#12706)
+* Simplify mac MKLDNN build (#12724)
+* [MXNET-674] Speed up GPU builds in CI (#12782)
+* Improved git reset for CI builds (#12784)
+* Improve cpp-package example project build files. (#13093)
+* Add --no-cache option to build.py when building containers (#13182)
+* Addressed sphinx build issue (#13246)
+* Tighten up PyLint directives again (#12322)
+* [MXNET-859] Add a clang-tidy stage to CI (#12282)
+* A solution to prevent zombie containers locally and in CI (#12381)
+*  [MXNET-696][PYTHON][UNDEFINED NAME] import logging in ci/util.py (#12488)
+* [MXNET-703] Static linking for libprotobuf with TensorRT (#12475)
+* Remove regression checks for website links (#12507)
+* [MXNET-953] - Add ASAN sanitizer, Enable in CI (#12370)
+* Allow custom path and static linking for custom mallocs in make (#12645)
+* Correct PR branch detection in code coverage (#12615)
+* Update osx.mk - Added apple to USE_BLAS comment (#12819)
+* [MXNET-953] Correct ASAN cflags flag (#12659)
+* [MXNET-1025] Add Jetpack 3.3 support to Jetson (#12735)
+* Fail the broken link job when broken links are found (#12905)
+* Removed unused header (#13066)
+* Maven Surefire bug workaround (#13081)
+* Add Turing and Volta support to arch_name (#13168)
+* Moves f16c autodetection to its own cmake module (#12331)
+* la_op_inline.h to la_op-inl.h for consistency (#13045)
+* [MXNET-793] Virtualized ARMv7 with Qemu CI integration (#13203)
+* Remove unused variable `rotateM_` (#10803)
+* Separate refactoring from #12276 in a prior PR (#12296)
+* [MXNET-860] Remove std::moves that have no effect (#12730)
+* [MXNET-860] Use emplace where helpful (#12694)
+* Enable C++ coverage (#12642)
+* [MXNET-860] Update to modern nullptr usage (#12352)
+* [MXNET-860] Reduce redundant copies, check for regressions with clang-tidy (#12355)
+
+
+#### 3rd party
+##### TVM:
+* Updated tvm submodule head (#12764)
+* Updated tvm submodule head (#12448)
+##### CUDNN:
+* [MXNET-1179] Enforce deterministic algorithms in convolution layers (#12992)
+* CudnnFind() usage improvements (#12804)
+* Add option for automatic downcasting dtype for cudnn to allow using Tensorcore for fp32  (#12722)
+##### Horovod:
+* [MXNET-1111] Remove CPUPinned in ImageRecordIter (#12666)
+
+### Deprications
+* Add a deprecate message (#13042) contrib_CTCLoss is deprecated. Added a message in command
+### Other
+* Updating news, readme files and bumping master version to 1.3.1 (#12525)
+* Add new name to CONTRIBUTORS.md (#12763)
+* Update contribute.md (#12685)
+* Updated CONTRIBUTORS.md to include lebeg and gigasquid, moved mabreu to committers section (#12766)
+* Update CONTRIBUTORS.md (#12996)
+* Updated CONTRIBUTORS.md to include mxnet-label-bot  (#13048)
+
+### How to build MXNet
+Please follow the instructions at https://mxnet.incubator.apache.org/install/index.html
+
+### List of submodules used by Apache MXNet (Incubating) and when they were updated last
+Submodule@commit ID::Last updated by MXNet:: Last update in submodule
+
+* cub@05eb57f::Jul 31, 2017 :: Jul 31, 2017
+* dlpack@10892ac:: Oct 30, 2017 :: Aug 23, 2018
+* dmlc-core@0a0e8ad:: Aug 15, 2018 :: Nov 15, 2018
+* googletest@ec44c6c:: July 14, 2016 :: July 14, 2016
+* mkldnn@a7c5f53:: Nov 7, 2018 :: Nov 5, 2018
+* mshadow@696803b:: Sep 28, 2018 :: Nov 7,  2018
+* onnx-tensorrt@3d8ee04:: Aug 22, 2018 :: Nov 10, 2018
+* openmp@37c7212:: Nov 22, 2017 :: Nov 13, 2018
+* ps-lite@8a76389:: April 25, 2018 :: Oct 9, 2018
+* tvm@0f053c8:: Oct 10, 2018 :: Oct 8, 2018
+
 ## 1.3.1
 
 ### Bug fixes
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index c710a915bd88..70aa66e36b7e 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -29,7 +29,7 @@ Suggests:
     imager,
     covr
 Depends:
-    R (>= 3.3.0)
+    R (>= 3.4.4)
 LinkingTo: Rcpp
 VignetteBuilder: knitr
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy
index f82c238ed075..8291bae1f7b7 100644
--- a/ci/Jenkinsfile_utils.groovy
+++ b/ci/Jenkinsfile_utils.groovy
@@ -64,7 +64,7 @@ def init_git_win() {
 
 // pack libraries for later use
 def pack_lib(name, libs, include_gcov_data = false) {
-  sh """
+  sh returnStatus: true, script: """
 set +e
 echo "Packing ${libs} into ${name}"
 echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
@@ -83,7 +83,7 @@ return 0
 def unpack_and_init(name, libs, include_gcov_data = false) {
   init_git()
   unstash name
-  sh """
+  sh returnStatus: true, script: """
 set +e
 echo "Unpacked ${libs} from ${name}"
 echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
@@ -147,8 +147,9 @@ def collect_test_results_windows(original_file_name, new_file_name) {
 }
 
 
-def docker_run(platform, function_name, use_nvidia, shared_mem = '500m') {
-  def command = "ci/build.py --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
+def docker_run(platform, function_name, use_nvidia, shared_mem = '500m', env_vars = "") {
+  def command = "ci/build.py %ENV_VARS% --docker-registry ${env.DOCKER_CACHE_REGISTRY} %USE_NVIDIA% --platform %PLATFORM% --docker-build-retries 3 --shm-size %SHARED_MEM% /work/runtime_functions.sh %FUNCTION_NAME%"
+  command = command.replaceAll('%ENV_VARS%', env_vars.length() > 0 ? "-e ${env_vars}" : '')
   command = command.replaceAll('%USE_NVIDIA%', use_nvidia ? '--nvidiadocker' : '')
   command = command.replaceAll('%PLATFORM%', platform)
   command = command.replaceAll('%FUNCTION_NAME%', function_name)
diff --git a/ci/build.py b/ci/build.py
index 0069392d9a2a..1c7a4f8b3231 100755
--- a/ci/build.py
+++ b/ci/build.py
@@ -92,22 +92,24 @@ def get_dockerfiles_path():
 
 def get_platforms(path: str = get_dockerfiles_path()) -> List[str]:
     """Get a list of architectures given our dockerfiles"""
-    dockerfiles = glob.glob(os.path.join(path, "Dockerfile.build.*"))
+    dockerfiles = glob.glob(os.path.join(path, "Dockerfile.*"))
     dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles))
-    files = list(map(lambda x: re.sub(r"Dockerfile.build.(.*)", r"\1", x), dockerfiles))
+    files = list(map(lambda x: re.sub(r"Dockerfile.(.*)", r"\1", x), dockerfiles))
     platforms = list(map(lambda x: os.path.split(x)[1], sorted(files)))
     return platforms
 
 
 def get_docker_tag(platform: str, registry: str) -> str:
     """:return: docker tag to be used for the container"""
+    platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform)
     if not registry:
         registry = "mxnet_local"
-    return "{0}/build.{1}".format(registry, platform)
+    return "{0}/{1}".format(registry, platform)
 
 
 def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
-    return os.path.join(path, "Dockerfile.build.{0}".format(platform))
+    platform = platform if any(x in platform for x in ['build.', 'publish.']) else 'build.{}'.format(platform)
+    return os.path.join(path, "Dockerfile.{0}".format(platform))
 
 
 def get_docker_binary(use_nvidia_docker: bool) -> str:
@@ -215,20 +217,21 @@ def container_run(platform: str,
                   local_ccache_dir: str,
                   command: List[str],
                   cleanup: Cleanup,
+                  environment: Dict[str, str],
                   dry_run: bool = False) -> int:
     """Run command in a container"""
     container_wait_s = 600
     #
     # Environment setup
     #
-    environment = {
+    environment.update({
         'CCACHE_MAXSIZE': '500G',
         'CCACHE_TEMPDIR': '/tmp/ccache',  # temp dir should be local and not shared
         'CCACHE_DIR': '/work/ccache',  # this path is inside the container as /work/ccache is
                                        # mounted
         'CCACHE_LOGFILE': '/tmp/ccache.log',  # a container-scoped log, useful for ccache
                                               # verification.
-    }
+    })
     # These variables are passed to the container to the process tree killer can find runaway
     # process inside the container
     # https://wiki.jenkins.io/display/JENKINS/ProcessTreeKiller
@@ -446,6 +449,10 @@ def main() -> int:
     parser.add_argument("--no-cache", action="store_true",
                         help="passes --no-cache to docker build")
 
+    parser.add_argument("-e", "--environment", nargs="*", default=[],
+                        help="Environment variables for the docker container. "
+                        "Specify with a list containing either names or name=value")
+
     parser.add_argument("command",
                         help="command to run in the container",
                         nargs='*', action='append', type=str)
@@ -474,6 +481,9 @@ def signal_handler(signum, _):
     signal.signal(signal.SIGTERM, signal_handler)
     signal.signal(signal.SIGINT, signal_handler)
 
+    # "name=value" is split on the first '=' only, so values may contain '='; a bare "name" is read from os.environ.
+    environment = dict(e.split('=', 1) if '=' in e else (e, os.environ[e]) for e in args.environment)
+
     if args.list:
         print(list_platforms())
     elif args.platform:
@@ -493,13 +503,13 @@ def signal_handler(signum, _):
             ret = container_run(
                 platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
         elif args.print_docker_run:
             command = []
             ret = container_run(
                 platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, dry_run=True, cleanup=cleanup)
+                local_ccache_dir=args.ccache_dir, dry_run=True, cleanup=cleanup, environment=environment)
         else:
             # With no commands, execute a build function for the target platform
             command = ["/work/mxnet/ci/docker/runtime_functions.sh", "build_{}".format(platform)]
@@ -507,7 +517,7 @@ def signal_handler(signum, _):
             ret = container_run(
                 platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
 
         if ret != 0:
             logging.critical("Execution of %s failed with status: %d", command, ret)
@@ -515,6 +525,7 @@ def signal_handler(signum, _):
 
     elif args.all:
         platforms = get_platforms()
+        platforms = [platform for platform in platforms if 'build.' in platform]
         logging.info("Building for all architectures: %s", platforms)
         logging.info("Artifacts will be produced in the build/ directory.")
         for platform in platforms:
@@ -535,7 +546,7 @@ def signal_handler(signum, _):
             container_run(
                 platform=platform, nvidia_runtime=args.nvidiadocker,
                 shared_memory_size=args.shared_memory_size, command=command, docker_registry=args.docker_registry,
-                local_ccache_dir=args.ccache_dir, cleanup=cleanup)
+                local_ccache_dir=args.ccache_dir, cleanup=cleanup, environment=environment)
             shutil.move(buildir(), plat_buildir)
             logging.info("Built files left in: %s", plat_buildir)
 
diff --git a/ci/docker/Dockerfile.build.android_armv7 b/ci/docker/Dockerfile.build.android_armv7
index c601fc5e5ff7..a2e98cd2efe1 100644
--- a/ci/docker/Dockerfile.build.android_armv7
+++ b/ci/docker/Dockerfile.build.android_armv7
@@ -75,6 +75,11 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 
 WORKDIR /work
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
 
diff --git a/ci/docker/Dockerfile.build.android_armv8 b/ci/docker/Dockerfile.build.android_armv8
index 60376b8efda2..f7de86763457 100644
--- a/ci/docker/Dockerfile.build.android_armv8
+++ b/ci/docker/Dockerfile.build.android_armv8
@@ -74,6 +74,12 @@ ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++
 COPY install/android_arm64_openblas.sh /work/
 RUN /work/android_arm64_openblas.sh
 ENV CPLUS_INCLUDE_PATH /work/deps/OpenBLAS
-WORKDIR /work/build
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
 
 COPY runtime_functions.sh /work/
+
+WORKDIR /work/build
\ No newline at end of file
diff --git a/ci/docker/Dockerfile.build.armv6 b/ci/docker/Dockerfile.build.armv6
index 6f16d8c77a0a..60e223b7a60f 100644
--- a/ci/docker/Dockerfile.build.armv6
+++ b/ci/docker/Dockerfile.build.armv6
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv7 b/ci/docker/Dockerfile.build.armv7
index 5f0223448f12..0b557d5839e9 100644
--- a/ci/docker/Dockerfile.build.armv7
+++ b/ci/docker/Dockerfile.build.armv7
@@ -38,5 +38,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.build.armv8 b/ci/docker/Dockerfile.build.armv8
index 27bd425ae9b7..ef9c95865590 100644
--- a/ci/docker/Dockerfile.build.armv8
+++ b/ci/docker/Dockerfile.build.armv8
@@ -42,5 +42,10 @@ ENV OpenBLAS_DIR=${CROSS_ROOT}
 COPY install/deb_ubuntu_ccache.sh /work/
 RUN /work/deb_ubuntu_ccache.sh
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/build
diff --git a/ci/docker/Dockerfile.build.jetson b/ci/docker/Dockerfile.build.jetson
index d128ebc7e2a7..07097887f87d 100644
--- a/ci/docker/Dockerfile.build.jetson
+++ b/ci/docker/Dockerfile.build.jetson
@@ -77,10 +77,16 @@ RUN JETPACK_DOWNLOAD_PREFIX=https://developer.download.nvidia.com/devzone/devcen
     dpkg -i --force-architecture  $ARM_NVINFER_INSTALLER_PACKAGE && \
     dpkg -i --force-architecture  $ARM_NVINFER_DEV_INSTALLER_PACKAGE && \
     apt update -y || true && apt install -y cuda-libraries-dev-9-0 libcudnn7-dev libnvinfer-dev
+RUN ln -s /usr/include/aarch64-linux-gnu/cudnn_v7.h /usr/include/aarch64-linux-gnu/cudnn.h
 ENV PATH $PATH:/usr/local/cuda/bin
 ENV NVCCFLAGS "-m64"
 ENV CUDA_ARCH "-gencode arch=compute_53,code=sm_53 -gencode arch=compute_62,code=sm_62"
 ENV NVCC /usr/local/cuda/bin/nvcc
 
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
 COPY runtime_functions.sh /work/
 WORKDIR /work/mxnet
diff --git a/ci/docker/Dockerfile.publish.test.centos7_cpu b/ci/docker/Dockerfile.publish.test.centos7_cpu
new file mode 100644
index 000000000000..7d284452971b
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.centos7_cpu
@@ -0,0 +1,38 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on CentOS 7 for CPU
+
+FROM centos:7
+
+WORKDIR /work/deps
+
+COPY install/centos7_base.sh /work/
+RUN /work/centos7_base.sh
+
+COPY install/centos7_scala.sh /work/
+RUN /work/centos7_scala.sh
+
+ARG USER_ID=0
+COPY install/centos7_adduser.sh /work/
+RUN /work/centos7_adduser.sh
+
+ENV PYTHONPATH=./python/
+WORKDIR /work/mxnet
+
+COPY runtime_functions.sh /work/
diff --git a/ci/docker/Dockerfile.publish.test.centos7_gpu b/ci/docker/Dockerfile.publish.test.centos7_gpu
new file mode 100644
index 000000000000..e7f584683109
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.centos7_gpu
@@ -0,0 +1,38 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on CentOS 7 for GPU
+
+FROM nvidia/cuda:9.2-cudnn7-devel-centos7
+
+WORKDIR /work/deps
+
+COPY install/centos7_base.sh /work/
+RUN /work/centos7_base.sh
+
+COPY install/centos7_scala.sh /work/
+RUN /work/centos7_scala.sh
+
+ARG USER_ID=0
+COPY install/centos7_adduser.sh /work/
+RUN /work/centos7_adduser.sh
+
+ENV PYTHONPATH=./python/
+WORKDIR /work/mxnet
+
+COPY runtime_functions.sh /work/
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1404_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1404_cpu
new file mode 100644
index 000000000000..035837686554
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1404_cpu
@@ -0,0 +1,39 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on Ubuntu 14.04 for CPU
+
+FROM ubuntu:14.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1404_gpu b/ci/docker/Dockerfile.publish.test.ubuntu1404_gpu
new file mode 100644
index 000000000000..854dd68a63c1
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1404_gpu
@@ -0,0 +1,40 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to run MXNet on Ubuntu 14.04 for GPU
+
+# Use CPU with setup_gpu script
+FROM ubuntu:14.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu
new file mode 100644
index 000000000000..bbb7b6a0d7bd
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu
@@ -0,0 +1,39 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on Ubuntu 16.04 for CPU
+
+FROM ubuntu:16.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu
new file mode 100644
index 000000000000..660461dc0cfa
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu
@@ -0,0 +1,39 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to run MXNet on Ubuntu 16.04 for GPU
+
+FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu
new file mode 100644
index 000000000000..e3a8c193f234
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu
@@ -0,0 +1,41 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on Ubuntu 18.04 for CPU
+
+FROM ubuntu:18.04
+
+WORKDIR /work/deps
+
+ENV DEBIAN_FRONTEND noninteractive
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1804_gpu b/ci/docker/Dockerfile.publish.test.ubuntu1804_gpu
new file mode 100644
index 000000000000..99f7e0d3eff9
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.test.ubuntu1804_gpu
@@ -0,0 +1,41 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to run MXNet on Ubuntu 18.04 for GPU
+
+FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu18.04
+
+WORKDIR /work/deps
+
+ENV DEBIAN_FRONTEND noninteractive
+
+COPY install/ubuntu_base.sh /work/
+RUN /work/ubuntu_base.sh
+
+COPY install/ubuntu_scala.sh /work/
+RUN /work/ubuntu_scala.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.ubuntu1404_cpu b/ci/docker/Dockerfile.publish.ubuntu1404_cpu
new file mode 100644
index 000000000000..04ce94f95eae
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.ubuntu1404_cpu
@@ -0,0 +1,36 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to build and run MXNet on Ubuntu 14.04 for CPU
+
+FROM ubuntu:14.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_publish.sh /work/
+RUN /work/ubuntu_publish.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/Dockerfile.publish.ubuntu1404_gpu b/ci/docker/Dockerfile.publish.ubuntu1404_gpu
new file mode 100644
index 000000000000..9855986a2891
--- /dev/null
+++ b/ci/docker/Dockerfile.publish.ubuntu1404_gpu
@@ -0,0 +1,36 @@
+# -*- mode: dockerfile -*-
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Dockerfile to run MXNet on Ubuntu 14.04 for GPU
+
+FROM ubuntu:14.04
+
+WORKDIR /work/deps
+
+COPY install/ubuntu_publish.sh /work/
+RUN /work/ubuntu_publish.sh
+
+ARG USER_ID=0
+ARG GROUP_ID=0
+COPY install/ubuntu_adduser.sh /work/
+RUN /work/ubuntu_adduser.sh
+
+COPY runtime_functions.sh /work/
+
+WORKDIR /work/mxnet
+ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
diff --git a/ci/docker/install/centos7_adduser.sh b/ci/docker/install/centos7_adduser.sh
index ba72c9b92281..f9d2402c9554 100755
--- a/ci/docker/install/centos7_adduser.sh
+++ b/ci/docker/install/centos7_adduser.sh
@@ -34,4 +34,9 @@ then
     mkdir /work/mxnet
     mkdir /work/build
     chown -R jenkins_slave /work/
+
+    # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+    # we have to give the process the proper permission to these files. This is hacky, but unfortunately 
+    # there's no better way to do this without patching all our submodules.
+    chown -R jenkins_slave /usr/local/bin
 fi
diff --git a/ci/docker/install/centos7_base.sh b/ci/docker/install/centos7_base.sh
new file mode 100755
index 000000000000..3b84aeb57b06
--- /dev/null
+++ b/ci/docker/install/centos7_base.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# build and install are separated so changes to build don't invalidate
+# the whole docker cache for the image
+
+set -ex
+
+# Install packages one at a time: yum does not fail when a package in a multi-package install is missing
+yum -y install epel-release
+yum -y install git
+yum -y install wget
+yum -y install make
+yum -y install cmake
+yum -y install unzip
+yum -y install ninja-build
+yum -y install gcc-gfortran
diff --git a/ci/docker/install/centos7_scala.sh b/ci/docker/install/centos7_scala.sh
index ea46de9b9311..5c43f011cbf1 100755
--- a/ci/docker/install/centos7_scala.sh
+++ b/ci/docker/install/centos7_scala.sh
@@ -23,9 +23,17 @@
 set -ex
 
 yum install -y java-1.8.0-openjdk-devel
+export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk
+export PATH=$JAVA_HOME/bin:$PATH
 # Build from source with Maven
-wget http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
+wget -q http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
 tar xzf apache-maven-3.3.9-bin.tar.gz
 mkdir /usr/local/maven
 mv apache-maven-3.3.9/ /usr/local/maven/
 alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
+# Persist the Java/Maven environment; single quotes defer variable expansion until the profile script is sourced.
+echo 'export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk' >> /etc/profile.d/maven.sh
+echo 'export M3_HOME=/usr/local/maven/apache-maven-3.3.9' >> /etc/profile.d/maven.sh
+echo 'export PATH=$M3_HOME/bin:$JAVA_HOME/bin:$PATH' >> /etc/profile.d/maven.sh
+chmod +x /etc/profile.d/maven.sh
+source /etc/profile.d/maven.sh
diff --git a/ci/docker/install/ubuntu_adduser.sh b/ci/docker/install/ubuntu_adduser.sh
index 515a80f63b07..a7668bac2ab6 100755
--- a/ci/docker/install/ubuntu_adduser.sh
+++ b/ci/docker/install/ubuntu_adduser.sh
@@ -40,4 +40,9 @@ then
     mkdir /work/mxnet
     mkdir /work/build
     chown -R jenkins_slave /work/
+
+    # Later on, we have to override the links because underlying build systems ignore our compiler settings. Thus,
+    # we have to give the process the proper permission to these files. This is hacky, but unfortunately 
+    # there's no better way to do this without patching all our submodules.
+    chown -R jenkins_slave /usr/local/bin
 fi
diff --git a/ci/docker/install/ubuntu_base.sh b/ci/docker/install/ubuntu_base.sh
new file mode 100755
index 000000000000..b34c0b3e18f1
--- /dev/null
+++ b/ci/docker/install/ubuntu_base.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# build and install are separated so changes to build don't invalidate
+# the whole docker cache for the image
+
+set -ex
+apt-get update || true
+apt-get install -y \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    git \
+    ninja-build \
+    libgfortran3 \
+    software-properties-common \
+    sudo \
+    unzip \
+    wget
diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh
index 64f8af3e0444..4382aa6aefd0 100755
--- a/ci/docker/install/ubuntu_core.sh
+++ b/ci/docker/install/ubuntu_core.sh
@@ -26,7 +26,6 @@ apt-get install -y \
     apt-transport-https \
     build-essential \
     ca-certificates \
-    cmake \
     curl \
     git \
     libatlas-base-dev \
@@ -41,3 +40,11 @@ apt-get install -y \
     sudo \
     unzip \
     wget
+
+
+# On Ubuntu 14.04 install the cmake3 package; every other release uses the stock cmake package.
+if lsb_release -r | grep -qF '14.04'; then
+    apt-get install -y cmake3
+else
+    apt-get install -y cmake
+fi
diff --git a/ci/docker/install/ubuntu_nvidia.sh b/ci/docker/install/ubuntu_nvidia.sh
index 3d8de9d0d7dd..7012b897ff91 100755
--- a/ci/docker/install/ubuntu_nvidia.sh
+++ b/ci/docker/install/ubuntu_nvidia.sh
@@ -18,11 +18,6 @@
 # under the License.
 
 set -ex
-apt-get update || true
-apt install -y software-properties-common
-
-# Adding ppas frequently fails due to busy gpg servers, retry 5 times with 5 minute delays.
-for i in 1 2 3 4 5; do add-apt-repository -y ppa:graphics-drivers && break || sleep 300; done
 
 # Retrieve ppa:graphics-drivers and install nvidia-drivers.
 # Note: DEBIAN_FRONTEND required to skip the interactive setup steps
diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh
new file mode 100644
index 000000000000..1ad6ab947842
--- /dev/null
+++ b/ci/docker/install/ubuntu_publish.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Build on Ubuntu 14.04 LTS for LINUX CPU/GPU
+
+# Abort on the first failing command and trace every command, like the other
+# install scripts do; without this a failed step would go unnoticed.
+set -ex
+
+# Register the PPAs needed below, then install the build tool chain and OpenJDK 8.
+apt-get update
+apt-get install -y software-properties-common
+add-apt-repository ppa:ubuntu-toolchain-r/test -y
+add-apt-repository ppa:openjdk-r/ppa -y # Java lib
+apt-get update
+apt-get install -y git \
+    cmake3 \
+    libcurl4-openssl-dev \
+    unzip \
+    gcc-4.8 \
+    g++-4.8 \
+    gfortran \
+    gfortran-4.8 \
+    binutils \
+    nasm \
+    libtool \
+    curl \
+    wget \
+    sudo \
+    gnupg \
+    gnupg2 \
+    gnupg-agent \
+    pandoc \
+    python3-pip \
+    automake \
+    pkg-config \
+    openjdk-8-jdk
+
+# Install Maven 3.3.9 from the binary tarball and register it as the mvn alternative.
+curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
+tar xzf apache-maven-3.3.9-bin.tar.gz
+mkdir -p /usr/local/maven
+mv apache-maven-3.3.9/ /usr/local/maven/
+update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
+update-ca-certificates -f
+
+apt-get install -y python python3
+
+# the version of pip shipped with ubuntu may be too old, install a recent version here
+wget -nv https://bootstrap.pypa.io/get-pip.py
+python3 get-pip.py
+python2 get-pip.py
+
+apt-get remove -y python3-urllib3
+
+pip2 install nose cpplint==1.3.0 pylint==1.9.3 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
+pip3 install nose cpplint==1.3.0 pylint==2.1.1 'numpy<=1.15.2,>=1.8.2' nose-timer 'requests<2.19.0,>=2.18.4' h5py==2.8.0rc1 scipy==1.0.1 boto3
diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh
index 6ecb8d801186..5bade47463e2 100755
--- a/ci/docker/install/ubuntu_scala.sh
+++ b/ci/docker/install/ubuntu_scala.sh
@@ -24,13 +24,31 @@ set -ex
 cd "$(dirname "$0")"
 # install libraries for mxnet's scala package on ubuntu
 echo 'Installing Scala...'
-apt-get update || true
-apt-get install -y software-properties-common
-apt-get update || true
-apt-get install -y openjdk-8-jdk
-apt-get install -y openjdk-8-jre
 
+# Ubuntu 14.04: register the openjdk-r PPA before the JDK packages are installed.
+if lsb_release -r | grep -qF '14.04'; then
+    add-apt-repository -y ppa:openjdk-r/ppa
+fi
+
+# All Ubuntu releases
 apt-get update || true
 apt-get install -y \
-    maven \
+    openjdk-8-jdk \
+    openjdk-8-jre \
+    software-properties-common \
+    gnupg \
+    gnupg2 \
+    gnupg-agent \
     scala
+
+# Ubuntu 14.04: install Maven 3.3.9 from the binary tarball; other releases use the maven package.
+if lsb_release -r | grep -qF '14.04'; then
+    curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
+    tar xzf apache-maven-3.3.9-bin.tar.gz
+    mkdir -p /usr/local/maven
+    mv apache-maven-3.3.9/ /usr/local/maven/
+    update-alternatives --install /usr/bin/mvn mvn /usr/local/maven/apache-maven-3.3.9/bin/mvn 1
+    update-ca-certificates -f
+else
+    apt-get install -y maven
+fi
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 82e6feb2a728..a6bb1064a589 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -36,35 +36,67 @@ clean_repo() {
     git submodule update --init --recursive
 }
 
+scala_prepare() {
+    # Keep the Maven logs short: the CLI transfer listener only logs at warn level.
+    export MAVEN_OPTS="-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"
+}
+
 build_ccache_wrappers() {
     set -ex
 
-    rm -f cc
-    rm -f cxx
-
-    touch cc
-    touch cxx
-
     if [ -z ${CC+x} ]; then
         echo "No \$CC set, defaulting to gcc";
         export CC=gcc
     fi
 
     if [ -z ${CXX+x} ]; then
        echo "No \$CXX set, defaulting to g++";
        export CXX=g++
     fi
 
-    # this function is nessesary for cuda enabled make based builds, since nvcc needs just an executable for -ccbin
-
-    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CC} \"\$@\"\n" >> cc
-    echo -e "#!/bin/sh\n/usr/local/bin/ccache ${CXX} \"\$@\"\n" >> cxx
-
-    chmod +x cc
-    chmod +x cxx
-
-    export CC=`pwd`/cc
-    export CXX=`pwd`/cxx
+    # Route compiler invocations through ccache by means of symbolic links, the run
+    # mode recommended by ccache: https://ccache.samba.org/manual.html#_run_modes
+    #
+    # This hacky approach is required because the build systems of our submodules
+    # ignore our CMake settings. If they use Makefiles we can't influence them at
+    # all in general, and NVCC prefers its hard-coded compiler over our settings.
+    # Thus every compiler name listed below is redirected to ccache once this
+    # function has been called.
+    #
+    # The links are created in a directory that is put at the beginning of PATH,
+    # so that they are picked up instead of the real compilers, and additionally
+    # in /usr/local/bin. The link targets are absolute on purpose: a relative
+    # target like "ccache" is resolved against the directory containing the link
+    # and would dangle inside /tmp/ccache-redirects.
+    #
+    # In future, these links could be created at image build time of the container,
+    # but for now this is opt-in. Processes like the scala make step, numpy
+    # compilation and other pip package builds could be sped up by ccache as well.
+
+    local ccache_bin=/usr/local/bin/ccache
+    local redirect_dir=/tmp/ccache-redirects
+    local compiler
+    # -p and -f keep this function safe to call more than once under "set -e".
+    mkdir -p "${redirect_dir}"
+    export PATH="${redirect_dir}:${PATH}"
+    for compiler in \
+        gcc gcc-8 g++ g++-8 nvcc \
+        clang++-3.9 clang-3.9 \
+        clang++-5.0 clang-5.0 \
+        clang++-6.0 clang-6.0
+    do
+        ln -sf "${ccache_bin}" "${redirect_dir}/${compiler}"
+        ln -sf "${ccache_bin}" "/usr/local/bin/${compiler}"
+    done
+
+    # NOTE(review): NVCC presumably tells the make based CUDA builds which compiler
+    # command to run; confirm which build files actually read it.
+    export NVCC=ccache
+
+    # Uncomment if you would like to debug CCache hit rates.
+    # You can monitor using tail -f ccache-log
+    # export CCACHE_LOGFILE=/work/mxnet/ccache-log
+    # export CCACHE_DEBUG=1
 }
 
 build_wheel() {
@@ -106,6 +138,8 @@ build_jetson() {
     set -ex
     pushd .
 
+    #build_ccache_wrappers
+
     cp make/crosscompile.jetson.mk ./config.mk
     make -j$(nproc)
 
@@ -129,6 +163,7 @@ build_armv6() {
 
     # We do not need OpenMP, since most armv6 systems have only 1 core
 
+    build_ccache_wrappers
     cmake \
         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -159,6 +194,7 @@ build_armv7() {
     # file tries to add -llapack. Lapack functionality though, requires -lgfortran
     # to be linked additionally.
 
+    build_ccache_wrappers
     cmake \
         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DCMAKE_CROSSCOMPILING=ON \
@@ -181,6 +217,7 @@ build_armv7() {
 }
 
 build_armv8() {
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -205,6 +242,7 @@ build_armv8() {
 build_android_armv7() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DANDROID=ON\
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -225,6 +263,7 @@ build_android_armv7() {
 build_android_armv8() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake\
         -DANDROID=ON \
         -DUSE_CUDA=OFF\
@@ -244,19 +283,21 @@ build_centos7_cpu() {
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
-
+    build_ccache_wrappers
     make \
         DEV=1 \
         USE_LAPACK=1 \
         ENABLE_TESTCOVERAGE=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
         USE_BLAS=openblas \
+        USE_MKLDNN=0 \
         USE_DIST_KVSTORE=1 \
         -j$(nproc)
 }
 
 build_amzn_linux_cpu() {
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -273,19 +314,17 @@ build_amzn_linux_cpu() {
     ninja -v
 }
 
-
 build_centos7_mkldnn() {
     set -ex
     cd /work/mxnet
     export CC="ccache gcc"
     export CXX="ccache g++"
-
+    build_ccache_wrappers
     make \
         DEV=1 \
         ENABLE_TESTCOVERAGE=1 \
         USE_LAPACK=1 \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so \
-        USE_MKLDNN=1 \
         USE_BLAS=openblas \
         -j$(nproc)
 }
@@ -294,13 +333,14 @@ build_centos7_gpu() {
     set -ex
     cd /work/mxnet
     # unfortunately this build has problems in 3rdparty dependencies with ccache and make
-    # build_ccache_wrappers
+    build_ccache_wrappers
     make \
         DEV=1                                     \
         ENABLE_TESTCOVERAGE=1                     \
         USE_LAPACK=1                              \
         USE_LAPACK_PATH=/usr/lib64/liblapack.so   \
         USE_BLAS=openblas                         \
+        USE_MKLDNN=0                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
@@ -315,13 +355,15 @@ build_ubuntu_cpu() {
 
 build_ubuntu_cpu_openblas() {
     set -ex
-    export CC="ccache gcc"
-    export CXX="ccache g++"
+    export CC="gcc"
+    export CXX="g++"
+    build_ccache_wrappers
     make \
         DEV=1                         \
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
+        USE_MKLDNN=0                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
 }
@@ -335,6 +377,7 @@ build_ubuntu_cpu_mkl() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=mkl                  \
+        USE_MKLDNN=0                  \
         USE_INTEL_PATH=/opt/intel     \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
@@ -344,6 +387,7 @@ build_ubuntu_cpu_cmake_debug() {
     set -ex
     pushd .
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -352,6 +396,7 @@ build_ubuntu_cpu_cmake_debug() {
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_OPENMP=OFF \
         -DUSE_OPENCV=ON \
+        -DUSE_SIGNAL_HANDLER=ON \
         -DCMAKE_BUILD_TYPE=Debug \
         -G Ninja \
         /work/mxnet
@@ -365,13 +410,15 @@ build_ubuntu_cpu_cmake_asan() {
 
     pushd .
     cd /work/build
+    export CXX=g++-8
+    export CC=gcc-8
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DCMAKE_CXX_COMPILER=/usr/bin/g++-8 \
-        -DCMAKE_C_COMPILER=/usr/bin/gcc-8 \
         -DUSE_CUDA=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
+        -DUSE_MKLDNN=OFF \
         -DUSE_OPENMP=OFF \
         -DUSE_OPENCV=OFF \
         -DCMAKE_BUILD_TYPE=Debug \
@@ -391,13 +438,14 @@ build_ubuntu_cpu_cmake_asan() {
 
 build_ubuntu_cpu_clang39() {
     set -ex
-     export CXX=clang++-3.9
+    export CXX=clang++-3.9
     export CC=clang-3.9
-     build_ccache_wrappers
-     make \
+    build_ccache_wrappers
+    make \
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
+        USE_MKLDNN=0                  \
         USE_OPENMP=0                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
@@ -415,6 +463,7 @@ build_ubuntu_cpu_clang60() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
+        USE_MKLDNN=0                  \
         USE_OPENMP=1                  \
         USE_DIST_KVSTORE=1            \
         -j$(nproc)
@@ -429,10 +478,12 @@ build_ubuntu_cpu_clang_tidy() {
 
     pushd .
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DUSE_CUDA=OFF \
+        -DUSE_MKLDNN=OFF \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_OPENCV=ON \
         -DCMAKE_BUILD_TYPE=Debug \
@@ -458,7 +509,6 @@ build_ubuntu_cpu_clang39_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         USE_OPENMP=0                  \
         -j$(nproc)
 }
@@ -475,7 +525,6 @@ build_ubuntu_cpu_clang60_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         USE_OPENMP=1                  \
         -j$(nproc)
 }
@@ -490,7 +539,6 @@ build_ubuntu_cpu_mkldnn() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=openblas             \
-        USE_MKLDNN=1                  \
         -j$(nproc)
 }
 
@@ -504,7 +552,6 @@ build_ubuntu_cpu_mkldnn_mkl() {
         ENABLE_TESTCOVERAGE=1         \
         USE_CPP_PACKAGE=1             \
         USE_BLAS=mkl                  \
-        USE_MKLDNN=1                  \
         -j$(nproc)
 }
 
@@ -526,6 +573,8 @@ build_ubuntu_gpu_tensorrt() {
     mkdir -p build
     cd build
     cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
         -DCMAKE_CXX_FLAGS=-I/usr/include/python${PYVER}\
         -DBUILD_SHARED_LIBS=ON ..\
         -G Ninja
@@ -540,7 +589,10 @@ build_ubuntu_gpu_tensorrt() {
     cd 3rdparty/onnx-tensorrt/
     mkdir -p build
     cd build
-    cmake ..
+    cmake \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        ..
     make -j$(nproc)
     export LIBRARY_PATH=`pwd`:$LIBRARY_PATH
     popd
@@ -559,6 +611,7 @@ build_ubuntu_gpu_tensorrt() {
         USE_CUDA_PATH=/usr/local/cuda                        \
         USE_CUDNN=1                                          \
         USE_OPENCV=0                                         \
+        USE_MKLDNN=0                                         \
         USE_DIST_KVSTORE=0                                   \
         USE_TENSORRT=1                                       \
         USE_JEMALLOC=0                                       \
@@ -578,7 +631,6 @@ build_ubuntu_gpu_mkldnn() {
         ENABLE_TESTCOVERAGE=1                     \
         USE_CPP_PACKAGE=1                         \
         USE_BLAS=openblas                         \
-        USE_MKLDNN=1                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
@@ -595,7 +647,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
         DEV=1                                     \
         ENABLE_TESTCOVERAGE=1                     \
         USE_BLAS=openblas                         \
-        USE_MKLDNN=1                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=0                               \
@@ -611,6 +662,7 @@ build_ubuntu_gpu_cuda91_cudnn7() {
         DEV=1                                     \
         ENABLE_TESTCOVERAGE=1                     \
         USE_BLAS=openblas                         \
+        USE_MKLDNN=0                              \
         USE_CUDA=1                                \
         USE_CUDA_PATH=/usr/local/cuda             \
         USE_CUDNN=1                               \
@@ -623,6 +675,7 @@ build_ubuntu_gpu_cuda91_cudnn7() {
 build_ubuntu_amalgamation() {
     set -ex
     # Amalgamation can not be run with -j nproc
+    build_ccache_wrappers
     make -C amalgamation/ clean
     make -C amalgamation/     \
         USE_BLAS=openblas     \
@@ -632,6 +685,7 @@ build_ubuntu_amalgamation() {
 build_ubuntu_amalgamation_min() {
     set -ex
     # Amalgamation can not be run with -j nproc
+    build_ccache_wrappers
     make -C amalgamation/ clean
     make -C amalgamation/     \
         USE_BLAS=openblas     \
@@ -642,14 +696,16 @@ build_ubuntu_amalgamation_min() {
 build_ubuntu_gpu_cmake_mkldnn() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache    \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache      \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache   \
+        -DUSE_SIGNAL_HANDLER=ON                 \
         -DENABLE_TESTCOVERAGE=ON                \
         -DUSE_CUDA=1                            \
         -DUSE_CUDNN=1                           \
         -DUSE_MKLML_MKL=1                       \
-        -DUSE_MKLDNN=1                          \
         -DCMAKE_BUILD_TYPE=Release              \
         -DCUDA_ARCH_NAME=Manual                 \
         -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
@@ -657,20 +713,27 @@ build_ubuntu_gpu_cmake_mkldnn() {
         /work/mxnet
 
     ninja -v
+    # libmkldnn.so.0 is a symbolic link. Replace it with a copy of the library it points to, so that a real binary has that name.
+    cp 3rdparty/mkldnn/src/libmkldnn.so.0 3rdparty/mkldnn/src/libmkldnn.so.0.tmp
+    mv 3rdparty/mkldnn/src/libmkldnn.so.0.tmp 3rdparty/mkldnn/src/libmkldnn.so.0
 }
 
 build_ubuntu_gpu_cmake() {
     set -ex
     cd /work/build
+    build_ccache_wrappers
     cmake \
         -DCMAKE_CXX_COMPILER_LAUNCHER=ccache    \
         -DCMAKE_C_COMPILER_LAUNCHER=ccache      \
+        -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache   \
+        -DUSE_SIGNAL_HANDLER=ON                 \
         -DENABLE_TESTCOVERAGE=ON                \
-        -DUSE_CUDA=1                            \
-        -DUSE_CUDNN=1                           \
-        -DUSE_MKLML_MKL=0                       \
-        -DUSE_MKLDNN=0                          \
-        -DUSE_DIST_KVSTORE=1                    \
+        -DUSE_CUDA=ON                           \
+        -DUSE_CUDNN=ON                          \
+        -DUSE_MKL_IF_AVAILABLE=OFF              \
+        -DUSE_MKLML_MKL=OFF                     \
+        -DUSE_MKLDNN=OFF                        \
+        -DUSE_DIST_KVSTORE=ON                   \
         -DCMAKE_BUILD_TYPE=Release              \
         -DCUDA_ARCH_NAME=Manual                 \
         -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \
@@ -783,21 +846,25 @@ unittest_ubuntu_python3_quantization_gpu() {
 
 unittest_ubuntu_cpu_scala() {
     set -ex
-    make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
-    make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
+    scala_prepare
+    cd scala-package
+    mvn -B integration-test
 }
 
 unittest_centos7_cpu_scala() {
     set -ex
     cd /work/mxnet
-    make scalapkg USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
-    make scalaunittest USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
+    scala_prepare
+    cd scala-package
+    mvn -B integration-test
 }
 
 unittest_ubuntu_cpu_clojure() {
     set -ex
-    make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
-    make scalainstall USE_OPENCV=1 USE_BLAS=openblas USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
+    scala_prepare
+    cd scala-package
+    mvn -B install
+    cd ..
     ./contrib/clojure-package/ci-test.sh
 }
 
@@ -806,7 +873,7 @@ unittest_ubuntu_cpugpu_perl() {
     ./perl-package/test.sh
 }
 
-unittest_ubuntu_gpu_cpp() {
+unittest_cpp() {
     set -ex
     build/tests/mxnet_unit_tests
 }
@@ -858,6 +925,7 @@ unittest_ubuntu_cpu_julia06() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'
@@ -886,11 +954,10 @@ unittest_centos7_gpu() {
 integrationtest_ubuntu_cpu_onnx() {
 	set -ex
 	export PYTHONPATH=./python/
-	pytest tests/python-pytest/onnx/import/mxnet_backend_test.py
-	pytest tests/python-pytest/onnx/import/onnx_import_test.py
-	pytest tests/python-pytest/onnx/import/gluon_backend_test.py
-	pytest tests/python-pytest/onnx/export/onnx_backend_test.py
-	python tests/python-pytest/onnx/export/mxnet_export_test.py
+	python tests/python-pytest/onnx/backend_test.py
+	pytest tests/python-pytest/onnx/mxnet_export_test.py
+	pytest tests/python-pytest/onnx/test_models.py
+	pytest tests/python-pytest/onnx/test_node.py
 }
 
 integrationtest_ubuntu_gpu_python() {
@@ -941,8 +1008,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
 
 integrationtest_ubuntu_gpu_scala() {
     set -ex
-    make scalapkg USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_DIST_KVSTORE=1 SCALA_ON_GPU=1 ENABLE_TESTCOVERAGE=1
-    make scalaintegrationtest USE_OPENCV=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 SCALA_TEST_ON_GPU=1 USE_DIST_KVSTORE=1 ENABLE_TESTCOVERAGE=1
+    scala_prepare
+    cd scala-package
+    export SCALA_TEST_ON_GPU=1
+    mvn -B integration-test -DskipTests=false
 }
 
 integrationtest_ubuntu_gpu_dist_kvstore() {
@@ -1006,7 +1075,6 @@ build_docs() {
     popd
 }
 
-
 # Functions that run the nightly Tests:
 
 #Runs Apache RAT Check on MXNet Source for License Headers
@@ -1135,7 +1203,7 @@ nightly_straight_dope_python3_multi_gpu_tests() {
 nightly_tutorial_test_ubuntu_python3_gpu() {
     set -ex
     cd /work/mxnet/docs
-    export BUILD_VER=tutorial 
+    export BUILD_VER=tutorial
     export MXNET_DOCS_BUILD_MXNET=0
     make html
     export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
@@ -1158,6 +1226,14 @@ nightly_tutorial_test_ubuntu_python2_gpu() {
     nosetests-3.4 $NOSE_TIMER_ARGUMENTS --with-xunit --xunit-file nosetests_tutorials.xml test_tutorials.py --nologcapture
 }
 
+nightly_java_demo_test_cpu() {
+    set -ex
+    cd /work/mxnet/scala-package/mxnet-demo/java-demo
+    make javademo
+    ./bin/java_sample.sh
+    ./bin/run_od.sh
+}
+
 
 # Deploy
 
@@ -1165,7 +1241,7 @@ deploy_docs() {
     set -ex
     pushd .
 
-    make docs
+    make docs SPHINXOPTS=-W
 
     popd
 }
@@ -1184,6 +1260,7 @@ deploy_jl_docs() {
 
     # FIXME
     export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so'
+    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
 
     # use the prebuilt binary from $MXNET_HOME/lib
     julia -e 'Pkg.build("MXNet")'
@@ -1195,6 +1272,30 @@ deploy_jl_docs() {
     # ...
 }
 
+publish_scala_build() {
+    set -ex
+    pushd .
+    scala_prepare
+    ./ci/publish/scala/build.sh
+    popd
+}
+
+publish_scala_test() {
+    set -ex
+    pushd .
+    scala_prepare
+    ./ci/publish/scala/test.sh
+    popd
+}
+
+publish_scala_deploy() {
+    set -ex
+    pushd .
+    scala_prepare
+    ./ci/publish/scala/deploy.sh
+    popd
+}
+
 # broken_link_checker
 
 broken_link_checker() {
@@ -1221,5 +1322,3 @@ EOF
     declare -F | cut -d' ' -f3
     echo
 fi
-
-
diff --git a/ci/docker_cache.py b/ci/docker_cache.py
index fe1882a567aa..f906b0eba66c 100755
--- a/ci/docker_cache.py
+++ b/ci/docker_cache.py
@@ -30,12 +30,16 @@
 import sys
 import subprocess
 import json
-import time
 from typing import *
 import build as build_util
+from util import retry
 
 DOCKERHUB_LOGIN_NUM_RETRIES = 5
 DOCKERHUB_RETRY_SECONDS = 5
+DOCKER_CACHE_NUM_RETRIES = 3
+DOCKER_CACHE_TIMEOUT_MINS = 15
+PARALLEL_BUILDS = 10
+
 
 def build_save_containers(platforms, registry, load_cache) -> int:
     """
@@ -49,7 +53,7 @@ def build_save_containers(platforms, registry, load_cache) -> int:
     if len(platforms) == 0:
         return 0
 
-    platform_results = Parallel(n_jobs=len(platforms), backend="multiprocessing")(
+    platform_results = Parallel(n_jobs=PARALLEL_BUILDS, backend="multiprocessing")(
         delayed(_build_save_container)(platform, registry, load_cache)
         for platform in platforms)
 
@@ -107,6 +111,8 @@ def _upload_image(registry, docker_tag, image_id) -> None:
     subprocess.check_call(push_cmd)
 
 
+@retry(target_exception=subprocess.CalledProcessError, tries=DOCKERHUB_LOGIN_NUM_RETRIES,
+       delay_s=DOCKERHUB_RETRY_SECONDS)
 def _login_dockerhub():
     """
     Login to the Docker Hub account
@@ -114,30 +120,19 @@ def _login_dockerhub():
     """
     dockerhub_credentials = _get_dockerhub_credentials()
 
-    for i in range(DOCKERHUB_LOGIN_NUM_RETRIES):
-        logging.info('Logging in to DockerHub')
-        # We use password-stdin instead of --password to avoid leaking passwords in case of an error.
-        # This method will produce the following output:
-        # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json.
-        # > Configure a credential helper to remove this warning. See
-        # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store
-        # Since we consider the restricted slaves a secure environment, that's fine. Also, using this will require
-        # third party applications which would need a review first as well.
-        p = subprocess.run(['docker', 'login', '--username', dockerhub_credentials['username'], '--password-stdin'],
-                           stdout=subprocess.PIPE, input=str.encode(dockerhub_credentials['password']))
-        logging.info(p.stdout)
-        if p.returncode != 0:
-            logging.error('Error logging in to DockerHub')
-            logging.error(p.stderr)
-
-            # Linear backoff
-            time.sleep(1000 * DOCKERHUB_RETRY_SECONDS * (i + 1))
-        else:
-            logging.info('Successfully logged in to DockerHub')
-            break
-    else:
-        logging.error('DockerHub login not possible after %d retries, aborting', DOCKERHUB_LOGIN_NUM_RETRIES)
-        raise Exception('Unable to log in to DockerHub')
+    logging.info('Logging in to DockerHub')
+    # We use password-stdin instead of --password to avoid leaking passwords in case of an error.
+    # This method will produce the following output:
+    # > WARNING! Your password will be stored unencrypted in /home/jenkins_slave/.docker/config.json.
+    # > Configure a credential helper to remove this warning. See
+    # > https://docs.docker.com/engine/reference/commandline/login/#credentials-store
+    # That's fine: the restricted slaves are a secure environment and a credential helper would need a review first.
+    # check=True raises CalledProcessError on a failed login, the exception the retry decorator waits for.
+    p = subprocess.run(['docker', 'login', '--username', dockerhub_credentials['username'], '--password-stdin'],
+                       stdout=subprocess.PIPE, input=str.encode(dockerhub_credentials['password']), check=True)
+    logging.info(p.stdout)
+    logging.info('Successfully logged in to DockerHub')
+
 
 def _logout_dockerhub():
     """
@@ -149,6 +144,8 @@ def _logout_dockerhub():
     logging.info('Successfully logged out of DockerHub')
 
 
+@retry(target_exception=subprocess.TimeoutExpired, tries=DOCKER_CACHE_NUM_RETRIES,
+       delay_s=DOCKERHUB_RETRY_SECONDS)
 def load_docker_cache(registry, docker_tag) -> None:
     """
     Load the precompiled docker cache from the registry
@@ -163,7 +160,10 @@ def load_docker_cache(registry, docker_tag) -> None:
 
     logging.info('Loading Docker cache for %s from %s', docker_tag, registry)
     pull_cmd = ['docker', 'pull', docker_tag]
-    subprocess.call(pull_cmd)  # Don't throw an error if the image does not exist
+    # Don't throw an error if the image does not exist, but only report success if the pull succeeded
+    pull_result = subprocess.run(pull_cmd, timeout=DOCKER_CACHE_TIMEOUT_MINS*60)
+    if pull_result.returncode == 0:
+        logging.info('Successfully pulled docker cache')
 
 
 def delete_local_docker_cache(docker_tag):
@@ -211,8 +211,7 @@ def _get_dockerhub_credentials():  # pragma: no cover
             logging.exception("The request was invalid due to:")
         elif client_error.response['Error']['Code'] == 'InvalidParameterException':
             logging.exception("The request had invalid params:")
-        else:
-            raise
+        raise
     else:
         secret = get_secret_value_response['SecretString']
         secret_dict = json.loads(secret)
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index d5cbd97683ed..33d76aa1668a 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -23,19 +23,17 @@
 utils = load('ci/Jenkinsfile_utils.groovy')
 
 // mxnet libraries
-mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 
 // Python wheels
 mx_pip = 'build/*.whl'
 
-// for scala build, need to pass extra libs when run with dist_kvstore
-mx_dist_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, lib/libmkldnn.a'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
 // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default.
 mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests'
-mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so'
-mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
+mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0'
+mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a'
 mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so'
 mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*'
 mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*'
@@ -100,7 +98,7 @@ def compile_unix_cpu_openblas() {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
             utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_openblas', false)
-            utils.pack_lib('cpu', mx_dist_lib, true)
+            utils.pack_lib('cpu', mx_lib, true)
           }
         }
       }
@@ -108,7 +106,7 @@ def compile_unix_cpu_openblas() {
 }
 
 def compile_unix_openblas_debug_cpu() {
-    return ['CPU: Openblas, debug': {
+    return ['CPU: Openblas, cmake, debug': {
       node(NODE_LINUX_CPU) {
         ws('workspace/build-cpu-openblas') {
           timeout(time: max_time, unit: 'MINUTES') {
@@ -128,7 +126,7 @@ def compile_unix_mkl_cpu() {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
             utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_mkl', false)
-            utils.pack_lib('cpu_mkl', mx_dist_lib, true)
+            utils.pack_lib('cpu_mkl', mx_mkldnn_lib, true)
           }
         }
       }
@@ -254,7 +252,7 @@ def compile_centos7_cpu() {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
             utils.docker_run('centos7_cpu', 'build_centos7_cpu', false)
-            utils.pack_lib('centos7_cpu', mx_dist_lib, true)
+            utils.pack_lib('centos7_cpu', mx_lib, true)
           }
         }
       }
@@ -268,7 +266,7 @@ def compile_centos7_cpu_mkldnn() {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.init_git()
             utils.docker_run('centos7_cpu', 'build_centos7_mkldnn', false)
-            utils.pack_lib('centos7_mkldnn', mx_lib, true)
+            utils.pack_lib('centos7_mkldnn', mx_mkldnn_lib, true)
           }
         }
       }
@@ -825,7 +823,21 @@ def test_unix_scala_cpu() {
       node(NODE_LINUX_CPU) {
         ws('workspace/ut-scala-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_dist_lib, true)
+            utils.unpack_and_init('cpu', mx_lib, true)
+            utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
+            utils.publish_test_coverage()
+          }
+        }
+      }
+    }]
+}
+
+def test_unix_scala_mkldnn_cpu(){
+  return ['Scala: MKLDNN-CPU': {
+      node(NODE_LINUX_CPU) {
+        ws('workspace/ut-scala-mkldnn-cpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
             utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_scala', false)
             utils.publish_test_coverage()
           }
@@ -839,7 +851,7 @@ def test_unix_scala_gpu() {
       node(NODE_LINUX_GPU) {
         ws('workspace/ut-scala-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('gpu', mx_dist_lib, true)
+            utils.unpack_and_init('gpu', mx_lib, true)
             utils.docker_run('ubuntu_gpu', 'integrationtest_ubuntu_gpu_scala', true)
             utils.publish_test_coverage()
           }
@@ -853,7 +865,7 @@ def test_unix_clojure_cpu() {
       node(NODE_LINUX_CPU) {
         ws('workspace/ut-clojure-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('cpu', mx_dist_lib, true)
+            utils.unpack_and_init('cpu', mx_lib, true)
             utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure', false)
             utils.publish_test_coverage()
           }
@@ -882,7 +894,7 @@ def test_unix_cpp_gpu() {
         ws('workspace/ut-cpp-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('cmake_gpu', mx_cmake_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
+            utils.docker_run('ubuntu_gpu', 'unittest_cpp', true)
             utils.publish_test_coverage()
           }
         }
@@ -896,7 +908,21 @@ def test_unix_cpp_mkldnn_gpu() {
         ws('workspace/ut-cpp-mkldnn-gpu') {
           timeout(time: max_time, unit: 'MINUTES') {
             utils.unpack_and_init('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true)
-            utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_cpp', true)
+            utils.docker_run('ubuntu_gpu', 'unittest_cpp', true)
+            utils.publish_test_coverage()
+          }
+        }
+      }
+    }]
+}
+
+def test_unix_cpp_cpu() {
+    return ['Cpp: CPU': {
+      node(NODE_LINUX_CPU) {
+        ws('workspace/ut-cpp-cpu') {
+          timeout(time: max_time, unit: 'MINUTES') {
+            utils.unpack_and_init('cpu_debug', mx_cmake_lib_debug, true)
+            utils.docker_run('ubuntu_cpu', 'unittest_cpp', false)
             utils.publish_test_coverage()
           }
         }
@@ -1015,7 +1041,7 @@ def test_centos7_scala_cpu() {
       node(NODE_LINUX_CPU) {
         ws('workspace/ut-scala-centos7-cpu') {
           timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init('centos7_cpu', mx_dist_lib, true)
+            utils.unpack_and_init('centos7_cpu', mx_lib, true)
             utils.docker_run('centos7_cpu', 'unittest_centos7_cpu_scala', false)
             utils.publish_test_coverage()
           }
diff --git a/ci/jenkins/Jenkinsfile_centos_cpu b/ci/jenkins/Jenkinsfile_centos_cpu
index 3b66f8100173..a47ab3de7fb7 100644
--- a/ci/jenkins/Jenkinsfile_centos_cpu
+++ b/ci/jenkins/Jenkinsfile_centos_cpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_centos_gpu b/ci/jenkins/Jenkinsfile_centos_gpu
index aec3b9054f2f..cad77a9a7dd8 100644
--- a/ci/jenkins/Jenkinsfile_centos_gpu
+++ b/ci/jenkins/Jenkinsfile_centos_gpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_clang b/ci/jenkins/Jenkinsfile_clang
index 61920cf9865b..029c7208107b 100644
--- a/ci/jenkins/Jenkinsfile_clang
+++ b/ci/jenkins/Jenkinsfile_clang
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_edge b/ci/jenkins/Jenkinsfile_edge
index 275a0c96de94..9d8e01399d7c 100644
--- a/ci/jenkins/Jenkinsfile_edge
+++ b/ci/jenkins/Jenkinsfile_edge
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
@@ -34,7 +34,7 @@ utils.assign_node_labels(utility: 'utility', linux_cpu: 'mxnetlinux-cpu', linux_
 utils.main_wrapper(
 core_logic: {
   utils.parallel_stage('Build', [
-//    custom_steps.compile_armv8_jetson_gpu(),
+    custom_steps.compile_armv8_jetson_gpu(),
     custom_steps.compile_armv7_cpu(),
     custom_steps.compile_armv6_cpu(),
     custom_steps.compile_armv8_cpu(),
diff --git a/ci/jenkins/Jenkinsfile_miscellaneous b/ci/jenkins/Jenkinsfile_miscellaneous
index c02cc991b864..dbf2a9e41c76 100644
--- a/ci/jenkins/Jenkinsfile_miscellaneous
+++ b/ci/jenkins/Jenkinsfile_miscellaneous
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 
 node('utility') {
diff --git a/ci/jenkins/Jenkinsfile_sanity b/ci/jenkins/Jenkinsfile_sanity
index 123fedfdab79..ed4d16ec47db 100644
--- a/ci/jenkins/Jenkinsfile_sanity
+++ b/ci/jenkins/Jenkinsfile_sanity
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu
index e581bcf65dc5..00b1aa9f68d9 100644
--- a/ci/jenkins/Jenkinsfile_unix_cpu
+++ b/ci/jenkins/Jenkinsfile_unix_cpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
@@ -50,11 +50,12 @@ core_logic: {
     custom_steps.test_unix_python3_mkldnn_cpu(),
     custom_steps.test_unix_python3_mkldnn_mkl_cpu(),
     custom_steps.test_unix_scala_cpu(),
+    custom_steps.test_unix_scala_mkldnn_cpu(),
     custom_steps.test_unix_clojure_cpu(),
     custom_steps.test_unix_r_cpu(),
     custom_steps.test_unix_julia_cpu(),
     custom_steps.test_unix_onnx_cpu(),
-
+    custom_steps.test_unix_cpp_cpu(),
     /*  Disabled due to master build failure:
      *  http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1221/pipeline/
      *  /~https://github.com/apache/incubator-mxnet/issues/11801
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index cf92836e96e4..bd884904d596 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_website b/ci/jenkins/Jenkinsfile_website
index 050f509e68e9..acdd2be4d00e 100644
--- a/ci/jenkins/Jenkinsfile_website
+++ b/ci/jenkins/Jenkinsfile_website
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_windows_cpu b/ci/jenkins/Jenkinsfile_windows_cpu
index 9e70df38dca5..a8746db73d34 100644
--- a/ci/jenkins/Jenkinsfile_windows_cpu
+++ b/ci/jenkins/Jenkinsfile_windows_cpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/jenkins/Jenkinsfile_windows_gpu b/ci/jenkins/Jenkinsfile_windows_gpu
index 69fd07343859..2319f25942de 100644
--- a/ci/jenkins/Jenkinsfile_windows_gpu
+++ b/ci/jenkins/Jenkinsfile_windows_gpu
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/ci/publish/Jenkinsfile b/ci/publish/Jenkinsfile
new file mode 100644
index 000000000000..9a360c6b5bed
--- /dev/null
+++ b/ci/publish/Jenkinsfile
@@ -0,0 +1,107 @@
+// -*- mode: groovy -*-
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// Artifacts passed to utils.pack_lib / utils.unpack_and_init by the Scala publish stages
+mx_scala_pub = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, config.mk, scala-package/pom.xml, scala-package/**/pom.xml, scala-package/*/target/**, scala-package/local-snapshot/**'
+
+// timeout in minutes
+max_time = 120
+
+node('restricted-utility') {
+  // Loading the utilities requires a node context unfortunately
+  checkout scm
+  utils = load('ci/Jenkinsfile_utils.groovy')
+}
+utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu')
+
+// CPU and GPU. OSX nodes are not currently supported by Jenkins
+def nodeMap = ['cpu': NODE_LINUX_CPU, 'gpu': NODE_LINUX_GPU]
+def scalaOSMap = ['cpu': 'linux-x86_64-cpu', 'gpu': 'linux-x86_64-gpu']
+
+def wrapStep(nodeToRun, workspaceName, step) { // returns a closure; nothing runs until the closure is invoked
+  return {
+    node(nodeToRun) {
+      ws("workspace/${workspaceName}") {
+        timeout(time: max_time, unit: 'MINUTES') { // max_time is the script-level limit set above
+          step()
+        }
+      }
+    }
+  }
+}
+
+def toBuild = [:] // branch name -> closure from wrapStep; handed to `parallel` in 'Build Packages'
+def labels = ['cpu'] // 'gpu' is currently not in the list
+for (x in labels) {
+  def label = x // per-iteration copy used by the closure below (presumably so it does not see the shared loop variable - confirm)
+  toBuild["Scala Build ${label}"] = wrapStep(nodeMap[label], "build-scala-${label}") {
+    withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}"]) {
+      utils.init_git()
+      utils.docker_run("ubuntu_${label}", 'publish_scala_build', label == 'gpu', '500m', 'MAVEN_PUBLISH_OS_TYPE')
+      utils.pack_lib("scala_${label}", mx_scala_pub, false)
+    }
+  }
+}
+
+def toTest = [:] // branch name -> closure from wrapStep; handed to `parallel` in 'Test Packages'
+def systems = ['ubuntu1604', 'ubuntu1804', 'centos7'] // used to build the "publish.test.<system>_<label>" name below
+for (x in labels) {
+  def label = x // per-iteration copy used by the closure below (presumably so it does not see the shared loop variable - confirm)
+  for (y in systems) {
+    def system = y // per-iteration copy, same reason as `label`
+    toTest["Scala Test ${system} ${label}"] = wrapStep(nodeMap[label], "test-scala-${system}-${label}") {
+      utils.unpack_and_init("scala_${label}", mx_scala_pub, false)
+      utils.docker_run("publish.test.${system}_${label}", 'publish_scala_test', label == 'gpu')
+    }
+  }
+}
+
+def toDeploy = [:] // branch name -> closure from wrapStep; handed to `parallel` in 'Deploy Packages'
+for (x in labels) {
+  def label = x // per-iteration copy used by the closure below (presumably so it does not see the shared loop variable - confirm)
+  toDeploy["Scala Deploy ${label}"] = wrapStep(nodeMap[label], "deploy-scala-${label}") {
+    withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}"]) {
+      utils.unpack_and_init("scala_${label}", mx_scala_pub, false)
+      utils.docker_run("ubuntu_${label}", 'publish_scala_deploy', label == 'gpu', '500m', 'MAVEN_PUBLISH_OS_TYPE MAVEN_PUBLISH_SECRET_ENDPOINT_URL MAVEN_PUBLISH_SECRET_NAME_CREDENTIALS MAVEN_PUBLISH_SECRET_NAME_GPG DOCKERHUB_SECRET_ENDPOINT_REGION')
+    }
+  }
+}
+
+utils.main_wrapper(
+core_logic: {
+  stage('Build Packages') {
+    parallel toBuild
+  }
+  stage('Test Packages') {
+    parallel toTest
+  }
+  stage('Deploy Packages') {
+    parallel toDeploy
+  }
+}
+,
+failure_handler: {
+  if (currentBuild.result == "FAILURE") {
+    // emailext body: 'Generating the nightly maven has failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY MAVEN FAILED] Build ${BUILD_NUMBER}', to: '${EMAIL}'
+  }
+}
+)
diff --git a/ci/publish/scala/build.sh b/ci/publish/scala/build.sh
new file mode 100755
index 000000000000..17f969afe142
--- /dev/null
+++ b/ci/publish/scala/build.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+
+# Setup Environment Variables
+# MAVEN_PUBLISH_OS_TYPE: linux-x86_64-cpu|linux-x86_64-gpu|osx-x86_64-cpu
+# e.g. export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu (the next command aborts if it is unset or empty)
+
+bash scala-package/dev/compile-mxnet-backend.sh "${MAVEN_PUBLISH_OS_TYPE:?must be set, see above}" ./
+
+# Compile tests for discovery later
+cd scala-package
+mvn -B deploy
diff --git a/ci/publish/scala/buildkey.py b/ci/publish/scala/buildkey.py
new file mode 100644
index 000000000000..8a1b7bf63286
--- /dev/null
+++ b/ci/publish/scala/buildkey.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import json
+import logging
+import subprocess
+
+HOME = os.environ['HOME']
+KEY_PATH = os.path.join(HOME, ".m2")
+
+
+'''
+This file would do the following items:
+    Import keys from AWS Credential services
+    Create settings.xml in .m2 with pass phrase
+    Create security-settings.xml in .m2 with master password
+    Import keys.asc the encrypted keys in gpg
+'''
+
+
+def getCredentials():
+    """Fetch the Maven credentials and the GPG key from AWS Secrets Manager.
+
+    The endpoint URL, both secret names and the region are read from the
+    environment. Returns (dict parsed from the credentials secret's JSON,
+    GPG key secret string). A botocore ClientError is logged and re-raised.
+    """
+    import boto3
+    import botocore
+    endpoint_url = os.environ['MAVEN_PUBLISH_SECRET_ENDPOINT_URL']
+    secret_creds_name = os.environ['MAVEN_PUBLISH_SECRET_NAME_CREDENTIALS']
+    secret_key_name = os.environ['MAVEN_PUBLISH_SECRET_NAME_GPG']
+    region_name = os.environ['DOCKERHUB_SECRET_ENDPOINT_REGION']
+
+    client = boto3.Session().client(service_name='secretsmanager',
+                                    region_name=region_name,
+                                    endpoint_url=endpoint_url)
+    # Pre-bind so the handler can tell which lookup failed; without this a
+    # failure of the first call raised UnboundLocalError inside the handler.
+    creds_response = None
+    try:
+        creds_response = client.get_secret_value(SecretId=secret_creds_name)
+        key_response = client.get_secret_value(SecretId=secret_key_name)
+    except botocore.exceptions.ClientError as client_error:
+        error_code = client_error.response['Error']['Code']
+        if error_code == 'ResourceNotFoundException':
+            name = secret_key_name if creds_response else secret_creds_name
+            logging.exception("The requested secret %s was not found", name)
+        elif error_code == 'InvalidRequestException':
+            logging.exception("The request was invalid due to:")
+        elif error_code == 'InvalidParameterException':
+            logging.exception("The request had invalid params:")
+        raise
+    secret_dict = json.loads(creds_response['SecretString'])
+    secret_key = key_response['SecretString']
+    return secret_dict, secret_key
+
+
+def importASC(key, gpgPassphrase):
+    """Write `key` to KEY_PATH/key.asc and import it with gpg2, passphrase on stdin."""
+    asc_path = os.path.join(KEY_PATH, "key.asc")
+    with open(asc_path, 'w') as key_file:
+        key_file.write(key)
+    gpg_cmd = ['gpg2', '--batch', '--yes', '--passphrase-fd', '0',
+               "--import", "{}".format(asc_path)]
+    subprocess.check_output(gpg_cmd, input=str.encode(gpgPassphrase))
+
+
+def encryptMasterPSW(password):
+    exp_path = os.path.join(KEY_PATH, "encryptMasterPassword.exp")
+    with open(exp_path, 'w') as exp_file:
+        exp_file.write('''
+        spawn mvn --encrypt-master-password
+        expect -exact "Master password: "
+        send -- "{}\r"
+        expect eof
+        '''.format(password))
+    # NOTE(review): str() of bytes is the b'...' repr with CR/LF escaped, so this CRLF split may never match - confirm.
+    return str(subprocess.check_output(['expect', exp_path])).split('\r\n')[-1][2:-3]
+
+
+def encryptPSW(password):
+    exp_path = os.path.join(KEY_PATH, "encryptPassword.exp")
+    with open(exp_path, 'w') as exp_file:
+        exp_file.write('''
+        spawn mvn --encrypt-password
+        expect -exact "Password: "
+        send -- "{}\r"
+        expect eof
+        '''.format(password))
+    # NOTE(review): str() of bytes is the b'...' repr with CR/LF escaped, so this CRLF split may never match - confirm.
+    return str(subprocess.check_output(['expect', exp_path])).split('\r\n')[-1][2:-3]
+
+
+def masterPSW(password):
+    """Write `password` as the <master> entry of KEY_PATH/settings-security.xml."""
+    with open(os.path.join(KEY_PATH, "settings-security.xml"), "w") as security_file:
+        security_file.write("<settingsSecurity>\n <master>{}</master>\n</settingsSecurity>".format(password))
+
+
+def serverPSW(username, password, gpgPassphrase):  # writes KEY_PATH/settings.xml with both servers and the gpg profile
+    from xml.sax.saxutils import escape  # values go into XML text nodes; a raw &, < or > would corrupt the file
+    with open(os.path.join(KEY_PATH, "settings.xml"), "w") as f:
+        f.write('''<?xml version="1.0" encoding="UTF-8"?>
+<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
+<pluginGroups></pluginGroups>
+<proxies></proxies>
+<servers>
+<server>
+        <id>apache.snapshots.https</id>
+        <username>{}</username>
+        <password>{}</password>
+</server>
+<!-- To stage a release of some part of Maven -->
+<server>
+        <id>apache.releases.https</id>
+        <username>{}</username>
+        <password>{}</password>
+</server>
+</servers>
+<mirrors></mirrors>
+<profiles>
+<profile>
+        <id>gpg</id>
+        <properties>
+        <gpg.executable>gpg2</gpg.executable>
+        <gpg.passphrase>{}</gpg.passphrase>
+        <gpg.skip>true</gpg.skip>
+        </properties>
+</profile>
+</profiles>
+<activeProfiles>
+        <activeProfile>gpg</activeProfile>
+</activeProfiles>
+</settings> '''.format(*(escape(str(v)) for v in (username, password, username, password, gpgPassphrase))))
+
+
+if __name__ == "__main__":
+    if not os.path.exists(KEY_PATH):
+        os.makedirs(KEY_PATH)
+    creds, gpg_key = getCredentials()
+    # settings-security.xml is written before the server password is encrypted.
+    master_encrypted = encryptMasterPSW(creds['masterpass'])
+    masterPSW(master_encrypted)
+    server_encrypted = encryptPSW(creds['password'])
+    serverPSW(creds['user'], server_encrypted, creds['gpgPassphrase'])
+    importASC(gpg_key, creds['gpgPassphrase'])
diff --git a/ci/publish/scala/deploy.sh b/ci/publish/scala/deploy.sh
new file mode 100755
index 000000000000..4eb33907eeb5
--- /dev/null
+++ b/ci/publish/scala/deploy.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+
+# Setup Environment Variables
+# MAVEN_PUBLISH_OS_TYPE: linux-x86_64-cpu|linux-x86_64-gpu|osx-x86_64-cpu
+# export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu
+
+# Clear all password .xml files, exp files, and gpg key files on exit, even
+# when a step below fails and `set -e` aborts the script
+trap 'rm -rf ~/.m2/*.xml ~/.m2/key.asc ~/.m2/*.exp' EXIT
+
+# Run python to configure keys
+python3 ci/publish/scala/buildkey.py
+
+# Updating cache
+mkdir -p ~/.gnupg
+echo "default-cache-ttl 14400" > ~/.gnupg/gpg-agent.conf
+echo "max-cache-ttl 14400" >> ~/.gnupg/gpg-agent.conf
+echo "allow-loopback-pinentry" >> ~/.gnupg/gpg-agent.conf
+echo "pinentry-mode loopback" >> ~/.gnupg/gpg-agent.conf
+export GPG_TTY=$(tty)
+
+cd scala-package
+mvn -B deploy -Pnightly
diff --git a/ci/publish/scala/fullDeploy.sh b/ci/publish/scala/fullDeploy.sh
new file mode 100644
index 000000000000..69d674a97497
--- /dev/null
+++ b/ci/publish/scala/fullDeploy.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex  # stop at the first failing stage and echo every command
+
+./ci/publish/scala/build.sh   # compiles the backend, then runs `mvn -B deploy` in scala-package
+./ci/publish/scala/test.sh    # runs `make testsnapshot` in scala-package/packageTest
+./ci/publish/scala/deploy.sh  # sets up keys via buildkey.py, then runs `mvn -B deploy -Pnightly`
diff --git a/ci/publish/scala/test.sh b/ci/publish/scala/test.sh
new file mode 100755
index 000000000000..5cef35ca3c2b
--- /dev/null
+++ b/ci/publish/scala/test.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -ex
+
+if [ -z "$JAVA_HOME" ]; then  # JAVA_HOME unset or empty: load the system-wide profile
+    source /etc/profile
+fi
+
+# Run the `testsnapshot` target of the packageTest Makefile
+cd scala-package/packageTest
+# make testlocal CI=1
+make testsnapshot UNIT=1 CI=1
diff --git a/ci/test_docker_cache.py b/ci/test_docker_cache.py
index 358d54985aca..0a3bc4640c05 100644
--- a/ci/test_docker_cache.py
+++ b/ci/test_docker_cache.py
@@ -135,7 +135,7 @@ def test_full_cache(self):
                 """
         platform = 'test_full_cache'
         docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH)
-        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' + platform)
+        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform)
         try:
             with open(dockerfile_path, 'w') as dockerfile_handle:
                 dockerfile_handle.write(dockerfile_content)
@@ -196,7 +196,7 @@ def test_partial_cache(self):
                 """
         platform = 'test_partial_cache'
         docker_tag = build_util.get_docker_tag(platform=platform, registry=DOCKER_REGISTRY_PATH)
-        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.build.' + platform)
+        dockerfile_path = os.path.join(DOCKERFILE_DIR, 'Dockerfile.' + platform)
         try:
             # Write initial Dockerfile
             with open(dockerfile_path, 'w') as dockerfile_handle:
diff --git a/ci/util.py b/ci/util.py
index 4d68b57a3af4..9a8d52eb1716 100644
--- a/ci/util.py
+++ b/ci/util.py
@@ -18,7 +18,6 @@
 import os
 import contextlib
 import logging
-import requests
 
 def get_mxnet_root() -> str:
     curpath = os.path.abspath(os.path.dirname(__file__))
@@ -89,6 +88,7 @@ def under_ci() -> bool:
 
 
 def ec2_instance_id_hostname() -> str:
+    import requests
     if under_ci():
         result = []
         try:
diff --git a/cmake/cmake_options.yml b/cmake/cmake_options.yml
index 01446f7b8f28..a4323feb92d4 100644
--- a/cmake/cmake_options.yml
+++ b/cmake/cmake_options.yml
@@ -16,7 +16,7 @@
 # under the License.
 
 --- # CMake configuration
-USE_CUDA: "ON" # Build with CUDA support
+USE_CUDA: "OFF" # Build with CUDA support
 USE_OLDCMAKECUDA: "OFF" # Build with old cmake cuda
 USE_NCCL: "OFF" # Use NVidia NCCL with CUDA
 USE_OPENCV: "ON" # Build with OpenCV support
@@ -48,3 +48,6 @@ USE_TENSORRT: "OFF" # Enable infeference optimization with TensorRT.
 USE_ASAN: "OFF" # Enable Clang/GCC ASAN sanitizers.
 ENABLE_TESTCOVERAGE: "OFF" # Enable compilation with test coverage metric output
 CMAKE_BUILD_TYPE: "Debug"
+CMAKE_CUDA_COMPILER_LAUNCHER: "ccache"
+CMAKE_C_COMPILER_LAUNCHER: "ccache"
+CMAKE_CXX_COMPILER_LAUNCHER: "ccache"
diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md
index 152c9c635e6d..ba6160aed5c8 100644
--- a/contrib/clojure-package/README.md
+++ b/contrib/clojure-package/README.md
@@ -105,9 +105,12 @@ brew install opencv
 - Create a new project with `lein new my-mxnet`
 - Edit your `project.clj` and add one of the following entries to `:dependencies`, based on your system and the compute device you want to use:
 
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu "1.3.1"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu "1.3.1"]`
-  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu "1.3.1"]`
+
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-cpu <latest-version>]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-linux-gpu <latest-version>]`
+  - `[org.apache.mxnet.contrib.clojure/clojure-mxnet-osx-cpu <latest-version>]`
+
+You can find the latest version on [Maven Central - clojure-mxnet latest](https://search.maven.org/search?q=clojure-mxnet).
 
 After making this change and running `lein deps`, you should be able to run example code like this [NDArray Tutorial](/~https://github.com/apache/incubator-mxnet/blob/master/contrib/clojure-package/examples/tutorial/src/tutorial/ndarray.clj).
 
@@ -116,38 +119,58 @@ After making this change and running `lein deps`, you should be able to run exam
 With this option, you will install a Git revision of the Clojure package source and a [Scala package jar from Maven](https://search.maven.org/search?q=g:org.apache.mxnet) with native dependencies baked in.
 
 - Install additional dependencies as described in [the corresponding section for Option 1](#installing-additional-dependencies),
-- Recursively clone the MXNet repository and checkout the desired revision. Here we assume the `1.3.1` tag and a clone into the `~/mxnet` directory:
+
+- Recursively clone the MXNet repository and check out the desired version (for example, `1.3.1`; you should use the latest [version](https://search.maven.org/search?q=clojure-mxnet)). Here we assume a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive /~https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
   git tag --list  # Find the tag that matches the Scala package version
-  git checkout tags/1.3.1 -b my_mxnet
+
+  git checkout tags/<version> -b my_mxnet
   git submodule update --init --recursive
   cd contrib/clojure
   ```
 
 - Edit `project.clj` to include the desired Scala jar from Maven:
 
-      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.3.1”]
+
+      [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu <latest-version>]
 
 - Run `lein test`. All the tests should run without error.
 - At this point you can run `lein install` to build and install the Clojure jar locally.
 
 To run examples, you can now use `lein run` in any of the example directories, e.g., `examples/imclassification`. You can also specify the compute device, e.g., `lein run :cpu 2` (for 2 CPUs) or `lein run :gpu` (for 1 GPU).
 
-**Note:** Instead of a release tag, you can also use a development version of the Clojure package, e.g., Git `master`, together with the prebuilt Scala jar. In that case, however, breakage can happen at any point, for instance when the Scala development version adds, changes or removes an interface and the Clojure development version moves along. If you really need the most recent version, you should consider [installation option 3](#option-3-everything-from-source).
+#### Experimental: Using Scala Snapshot Jars
+**Note:** Instead of a release tag, you can also use a development version of the Clojure package, e.g., Git `master`, together with the prebuilt Scala jar. There is a repo of nightly built snapshots of Scala jars. You can use them in your `project.clj` by adding a repository:
+
+```
+["snapshots" {:url "https://repository.apache.org/content/repositories/snapshots"
+                              :snapshots true
+                              :sign-releases false
+                              :checksum :fail
+                              :update :always
+                              :releases {:checksum :fail :update :always}}]
+```
+
+Then you should be able to run with your dependency:
+
+    [org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "latest-version-SNAPSHOT"]
+
+
+In that case, however, breakage can happen at any point, for instance when the Scala development version adds, changes or removes an interface and the Clojure development version moves along. If you really need the most recent version, you should consider [installation option 3](#option-3-everything-from-source).
 
 ### Option 3: Everything from Source
 
 With this option, you will compile the core MXNet C++ package and jars for both Scala and Clojure language bindings from source. If you intend to make changes to the code in any of the parts, or if you simply want the latest and greatest features, this choice is for you.
 
-The first step is to recursively clone the MXNet repository and checkout the desired revision. Here we assume a clone into the `~/mxnet` directory:
+The first step is to recursively clone the MXNet repository and check out the desired version (for example, `1.3.1`; you should use the latest [version](https://search.maven.org/search?q=clojure-mxnet)). Here we assume a clone into the `~/mxnet` directory:
 
   ```bash
   git clone --recursive /~https://github.com/apache/incubator-mxnet.git ~/mxnet
   cd ~/mxnet
-  git checkout tags/1.3.1 -b my_mxnet  # this is optional
+  git checkout tags/<version> -b my_mxnet  # this is optional
   git submodule update --init --recursive
   ```
 
@@ -170,13 +193,13 @@ The outcome of this step will be a shared library `lib/libmxnet.so` that is used
 - Build and install the Scala package in your local Maven directory using the following commands:
 
   ```bash
-  make scalapkg
-  make scalainstall
+  cd scala-package
+  mvn install
   ```
 
 #### Building the Clojure jar
  
-- Enter the `contrib/clojure` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.5.0-SNAPSHOT"]`, to the `:dependencies`.
+- Enter the `contrib/clojure-package` directory and edit the `project.clj` file. Add the Scala jar that was just created and installed, e.g., `[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "latest-version-SNAPSHOT"]`, to the `:dependencies`.
 - Run `lein test`. All the tests should run without an error.
 - Run `lein install` to build and install the Clojure jar locally.
 
diff --git a/contrib/clojure-package/examples/captcha/.gitignore b/contrib/clojure-package/examples/captcha/.gitignore
new file mode 100644
index 000000000000..e1569bd89020
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/.gitignore
@@ -0,0 +1,3 @@
+/.lein-*
+/.nrepl-port
+images/*
diff --git a/contrib/clojure-package/examples/captcha/README.md b/contrib/clojure-package/examples/captcha/README.md
new file mode 100644
index 000000000000..6b593b2f1c65
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/README.md
@@ -0,0 +1,61 @@
+# Captcha
+
+This is the clojure version of [captcha recognition](/~https://github.com/xlvector/learning-dl/tree/master/mxnet/ocr)
+example by xlvector and mirrors the R captcha example. It can be used as an
+example of multi-label training. For the following captcha example, we consider it as an
+image with 4 labels and train a CNN over the data set.
+
+![captcha example](captcha_example.png)
+
+## Installation
+
+Before you run this example, make sure that you have the clojure package
+installed. In the main clojure package directory, do `lein install`.
+Then you can run `lein install` in this directory.
+
+## Usage
+
+### Training
+
+First the OCR model needs to be trained based on [labeled data](https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip).
+The training can be started using the following:
+```
+$ lein train [:cpu|:gpu] [num-devices]
+```
+This downloads the training/evaluation data using the `get_data.sh` script
+before starting training.
+
+You may run into out-of-memory errors when training with `:gpu` on Ubuntu
+Linux (18.04). In that case, running plain `lein train` (training on one CPU) may work around the issue.
+
+The training runs for 10 epochs by default and saves the model with the
+prefix `ocr`. The model achieved an exact-match accuracy of ~0.954 and
+~0.628 on training and validation data, respectively.
+
+### Inference
+
+Once the model has been saved, it can be used for prediction. This can be done
+by running:
+```
+$ lein infer
+INFO  MXNetJVM: Try loading mxnet-scala from native path.
+INFO  MXNetJVM: Try loading mxnet-scala-linux-x86_64-gpu from native path.
+INFO  MXNetJVM: Try loading mxnet-scala-linux-x86_64-cpu from native path.
+WARN  MXNetJVM: MXNet Scala native library not found in path. Copying native library from the archive. Consider installing the library somewhere in the path (for Windows: PATH, for Linux: LD_LIBRARY_PATH), or specifying by Java cmd option -Djava.library.path=[lib path].
+WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
+INFO  org.apache.mxnet.infer.Predictor: Latency increased due to batchSize mismatch 8 vs 1
+WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
+WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
+CAPTCHA output: 6643
+INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865/libmxnet.so
+INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865/mxnet-scala
+INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865
+```
+The model runs on `captcha_example.png` by default.
+
+It can be run on other generated captcha images as well. Each run of the script
+`gen_captcha.py` generates one random captcha image with 4 digits.
+Before running the python script, you will need to install the [captcha](https://pypi.org/project/captcha/)
+library using `pip3 install --user captcha`. The captcha images are generated
+in the `images/` folder and we can run the prediction using
+`lein infer images/7534.png`.
diff --git a/contrib/clojure-package/examples/captcha/captcha_example.png b/contrib/clojure-package/examples/captcha/captcha_example.png
new file mode 100644
index 000000000000..09b84f7190fa
Binary files /dev/null and b/contrib/clojure-package/examples/captcha/captcha_example.png differ
diff --git a/contrib/clojure-package/examples/captcha/gen_captcha.py b/contrib/clojure-package/examples/captcha/gen_captcha.py
new file mode 100755
index 000000000000..43e0d26fb961
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/gen_captcha.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from captcha.image import ImageCaptcha
+import os
+import random
+
+length = 4
+width = 160
+height = 60
+IMAGE_DIR = "images"
+
+
+def random_text():
+    return ''.join(str(random.randint(0, 9))
+                   for _ in range(length))
+
+
+if __name__ == '__main__':
+    image = ImageCaptcha(width=width, height=height)
+    captcha_text = random_text()
+    if not os.path.exists(IMAGE_DIR):
+        os.makedirs(IMAGE_DIR)
+    image.write(captcha_text, os.path.join(IMAGE_DIR, captcha_text + ".png"))
diff --git a/contrib/clojure-package/examples/captcha/get_data.sh b/contrib/clojure-package/examples/captcha/get_data.sh
new file mode 100755
index 000000000000..baa7f9eb818f
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/get_data.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -evx  # -e: abort on the first failing command; -v/-x: echo script lines and executed commands
+
+EXAMPLE_ROOT=$(cd "$(dirname "$0")" && pwd)  # absolute directory of this script; quoted so paths with spaces work
+
+data_path=$EXAMPLE_ROOT  # the archive is downloaded and unpacked next to the script
+
+if [ ! -f "$data_path/captcha_example.zip" ]; then  # skip the download when the archive is already present
+  wget https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip -P "$data_path"
+fi
+
+if [ ! -f "$data_path/captcha_example/captcha_train.rec" ]; then  # skip extraction when the record file already exists
+  unzip "$data_path/captcha_example.zip" -d "$data_path"
+fi
diff --git a/contrib/clojure-package/examples/captcha/project.clj b/contrib/clojure-package/examples/captcha/project.clj
new file mode 100644
index 000000000000..fa37fecbe035
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/project.clj
@@ -0,0 +1,28 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(defproject captcha "0.1.0-SNAPSHOT"
+  :description "Captcha recognition via multi-label classification"
+  :plugins [[lein-cljfmt "0.5.7"]]
+  :dependencies [[org.clojure/clojure "1.9.0"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]]
+  :main ^:skip-aot captcha.train-ocr
+  :profiles {:train {:main captcha.train-ocr}
+             :infer {:main captcha.infer-ocr}
+             :uberjar {:aot :all}}
+  :aliases {"train" ["with-profile" "train" "run"]
+            "infer" ["with-profile" "infer" "run"]})
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/consts.clj b/contrib/clojure-package/examples/captcha/src/captcha/consts.clj
new file mode 100644
index 000000000000..318e0d806873
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/src/captcha/consts.clj
@@ -0,0 +1,27 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns captcha.consts)
+
+(def batch-size 8)
+(def channels 3)
+(def height 30)
+(def width 80)
+(def data-shape [channels height width])
+(def num-labels 10)
+(def label-width 4)
+(def model-prefix "ocr")
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj b/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj
new file mode 100644
index 000000000000..f6a648e9867b
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj
@@ -0,0 +1,56 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns captcha.infer-ocr
+  (:require [captcha.consts :refer :all]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]))
+
+(defn create-predictor
+  []
+  (let [data-desc {:name "data"
+                   :shape [batch-size channels height width]
+                   :layout layout/NCHW
+                   :dtype dtype/FLOAT32}
+        label-desc {:name "label"
+                    :shape [batch-size label-width]
+                    :layout layout/NT
+                    :dtype dtype/FLOAT32}
+        factory (infer/model-factory model-prefix
+                                     [data-desc label-desc])]
+    (infer/create-predictor factory)))
+
+(defn -main
+  [& args]
+  (let [[filename] args
+        image-fname (or filename "captcha_example.png")
+        image-ndarray (-> image-fname
+                          infer/load-image-from-file
+                          (infer/reshape-image width height)
+                          (infer/buffered-image-to-pixels [channels height width])
+                          (ndarray/expand-dims 0))
+        label-ndarray (ndarray/zeros [1 label-width])
+        predictor (create-predictor)
+        predictions (-> (infer/predict-with-ndarray
+                         predictor
+                         [image-ndarray label-ndarray])
+                        first
+                        (ndarray/argmax 1)
+                        ndarray/->vec)]
+    (println "CAPTCHA output:" (apply str (mapv int predictions)))))
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj b/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj
new file mode 100644
index 000000000000..91ec2fff3af7
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj
@@ -0,0 +1,156 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns captcha.train-ocr
+  (:require [captcha.consts :refer :all]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [org.apache.clojure-mxnet.callback :as callback]
+            [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
+            [org.apache.clojure-mxnet.initializer :as initializer]
+            [org.apache.clojure-mxnet.io :as mx-io]
+            [org.apache.clojure-mxnet.module :as m]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [org.apache.clojure-mxnet.optimizer :as optimizer]
+            [org.apache.clojure-mxnet.symbol :as sym])
+  (:gen-class))
+
+(when-not (.exists (io/file "captcha_example/captcha_train.lst"))
+  (sh "./get_data.sh"))
+
+(defonce train-data
+  (mx-io/image-record-iter {:path-imgrec "captcha_example/captcha_train.rec"
+                            :path-imglist "captcha_example/captcha_train.lst"
+                            :batch-size batch-size
+                            :label-width label-width
+                            :data-shape data-shape
+                            :shuffle true
+                            :seed 42}))
+
+(defonce eval-data
+  (mx-io/image-record-iter {:path-imgrec "captcha_example/captcha_test.rec"
+                            :path-imglist "captcha_example/captcha_test.lst"
+                            :batch-size batch-size
+                            :label-width label-width
+                            :data-shape data-shape}))
+
+(defn accuracy  ; fraction of rows of `label` matched in full by `pred`, or of single labels when :by-character is true
+  [label pred & {:keys [by-character]
+                 :or {by-character false} :as opts}]  ; by-character defaults to false (whole-row matching)
+  (let [[nr nc] (ndarray/shape-vec label)  ; first two dimensions of label: nr rows, nc columns
+        pred-context (ndarray/context pred)
+        label-t (-> label  ; transpose, then flatten, so the labels are laid out column after column
+                    ndarray/transpose
+                    (ndarray/reshape [-1])
+                    (ndarray/as-in-context pred-context))  ; placed in pred's context before the comparison
+        pred-label (ndarray/argmax pred 1)  ; index of the largest score along axis 1 of pred
+        matches (ndarray/equal label-t pred-label)  ; element-wise comparison; presumably 1 on a match, 0 otherwise
+        [digit-matches] (-> matches  ; total number of matching single labels
+                            ndarray/sum
+                            ndarray/->vec)
+        [complete-matches] (-> matches  ; number of rows in which every position matched
+                               (ndarray/reshape [nc nr])  ; back to one column per original row of label
+                               (ndarray/sum 0)  ; matches per original row
+                               (ndarray/equal label-width)  ; full match = label-width hits (the constant, not nc)
+                               ndarray/sum
+                               ndarray/->vec)]
+    (if by-character
+      (float (/ digit-matches nr nc))  ; digit-matches / (nr * nc)
+      (float (/ complete-matches nr)))))  ; fully matched rows / nr
+
+(defn get-data-symbol  ; builds the score symbol: 4 conv/pool/relu stages, one 256-unit layer, 4 output layers
+  []
+  (let [data (sym/variable "data")  ; input variable named "data"
+        ;; normalize the input pixels
+        scaled (sym/div (sym/- data 127) 128)  ; (data - 127) / 128
+
+        conv1 (sym/convolution {:data scaled :kernel [5 5] :num-filter 32})
+        pool1 (sym/pooling {:data conv1 :pool-type "max" :kernel [2 2] :stride [1 1]})  ; only this stage uses max pooling
+        relu1 (sym/activation {:data pool1 :act-type "relu"})
+
+        conv2 (sym/convolution {:data relu1 :kernel [5 5] :num-filter 32})
+        pool2 (sym/pooling {:data conv2 :pool-type "avg" :kernel [2 2] :stride [1 1]})
+        relu2 (sym/activation {:data pool2 :act-type "relu"})
+
+        conv3 (sym/convolution {:data relu2 :kernel [3 3] :num-filter 32})  ; stages 3 and 4 use 3x3 kernels
+        pool3 (sym/pooling {:data conv3 :pool-type "avg" :kernel [2 2] :stride [1 1]})
+        relu3 (sym/activation {:data pool3 :act-type "relu"})
+
+        conv4 (sym/convolution {:data relu3 :kernel [3 3] :num-filter 32})
+        pool4 (sym/pooling {:data conv4 :pool-type "avg" :kernel [2 2] :stride [1 1]})
+        relu4 (sym/activation {:data pool4 :act-type "relu"})
+
+        flattened (sym/flatten {:data relu4})
+        fc1 (sym/fully-connected {:data flattened :num-hidden 256})  ; shared by the four output layers below
+        fc21 (sym/fully-connected {:data fc1 :num-hidden num-labels})  ; one num-labels-wide score layer per label position
+        fc22 (sym/fully-connected {:data fc1 :num-hidden num-labels})
+        fc23 (sym/fully-connected {:data fc1 :num-hidden num-labels})
+        fc24 (sym/fully-connected {:data fc1 :num-hidden num-labels})]
+    (sym/concat "concat" nil [fc21 fc22 fc23 fc24] {:dim 0})))  ; the four score layers are concatenated along dimension 0
+
+(defn get-label-symbol  ; builds the label symbol: the "label" variable, transposed and then flattened
+  []
+  (as-> (sym/variable "label") label  ; `label` is rebound to the result of each following step
+    (sym/transpose {:data label})
+    (sym/reshape {:data label :shape [-1]})))  ; same transpose-then-flatten order that `accuracy` applies to its labels
+
+(defn create-captcha-net
+  []
+  (let [scores (get-data-symbol)
+        labels (get-label-symbol)]
+    (sym/softmax-output {:data scores :label labels})))
+
+(def optimizer
+  (optimizer/adam
+   {:learning-rate 0.0002
+    :wd 0.00001
+    :clip-gradient 10}))
+
+(defn train-ocr
+  [devs]
+  (println "Starting the captcha training ...")
+  (let [model (m/module
+               (create-captcha-net)
+               {:data-names ["data"] :label-names ["label"]
+                :contexts devs})]
+    (m/fit model {:train-data train-data
+                  :eval-data eval-data
+                  :num-epoch 10
+                  :fit-params (m/fit-params
+                               {:kvstore "local"
+                                :batch-end-callback
+                                (callback/speedometer batch-size 100)
+                                :initializer
+                                (initializer/xavier {:factor-type "in"
+                                                     :magnitude 2.34})
+                                :optimizer optimizer
+                                :eval-metric (eval-metric/custom-metric
+                                              #(accuracy %1 %2)
+                                              "accuracy")})})
+    (println "Finished the fit")
+    model))
+
+(defn -main
+  [& args]
+  (let [[dev dev-num] args
+        num-devices (Integer/parseInt (or dev-num "1"))
+        devs (if (= dev ":gpu")
+               (mapv #(context/gpu %) (range num-devices))
+               (mapv #(context/cpu %) (range num-devices)))
+        model (train-ocr devs)]
+    (m/save-checkpoint model {:prefix model-prefix :epoch 0})))
diff --git a/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj b/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj
new file mode 100644
index 000000000000..ab785f7fedf2
--- /dev/null
+++ b/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj
@@ -0,0 +1,119 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns captcha.train-ocr-test
+  (:require [clojure.test :refer :all]
+            [captcha.consts :refer :all]
+            [captcha.train-ocr :refer :all]
+            [org.apache.clojure-mxnet.io :as mx-io]
+            [org.apache.clojure-mxnet.module :as m]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [org.apache.clojure-mxnet.shape :as shape]
+            [org.apache.clojure-mxnet.util :as util]))
+
+(deftest test-consts
+  (is (= 8 batch-size))
+  (is (= [3 30 80] data-shape))
+  (is (= 4 label-width))
+  (is (= 10 num-labels)))
+
+(deftest test-labeled-data
+  (let [train-batch (mx-io/next train-data)
+        eval-batch (mx-io/next eval-data)
+        allowed-labels (into #{} (map float (range 10)))]
+    (is (= 8 (-> train-batch mx-io/batch-index count)))
+    (is (= 8 (-> eval-batch mx-io/batch-index count)))
+    (is (= [8 3 30 80] (-> train-batch
+                           mx-io/batch-data
+                           first
+                           ndarray/shape-vec)))
+    (is (= [8 3 30 80] (-> eval-batch
+                           mx-io/batch-data
+                           first
+                           ndarray/shape-vec)))
+    (is (every? #(<= 0 % 255) (-> train-batch
+                                  mx-io/batch-data
+                                  first
+                                  ndarray/->vec)))
+    (is (every? #(<= 0 % 255) (-> eval-batch
+                                  mx-io/batch-data
+                                  first
+                                  ndarray/->vec)))
+    (is (= [8 4] (-> train-batch
+                     mx-io/batch-label
+                     first
+                     ndarray/shape-vec)))
+    (is (= [8 4] (-> eval-batch
+                     mx-io/batch-label
+                     first
+                     ndarray/shape-vec)))
+    (is (every? allowed-labels (-> train-batch
+                                   mx-io/batch-label
+                                   first
+                                   ndarray/->vec)))
+    (is (every? allowed-labels (-> eval-batch
+                                   mx-io/batch-label
+                                   first
+                                   ndarray/->vec)))))
+
+(deftest test-model
+  (let [batch (mx-io/next train-data)
+        model (m/module (create-captcha-net)
+                        {:data-names ["data"] :label-names ["label"]})
+        _ (m/bind model
+                  {:data-shapes (mx-io/provide-data-desc train-data)
+                   :label-shapes (mx-io/provide-label-desc train-data)})
+        _ (m/init-params model)
+        _ (m/forward-backward model batch)
+        output-shapes (-> model
+                          m/output-shapes
+                          util/coerce-return-recursive)
+        outputs (-> model
+                    m/outputs-merged
+                    first)
+        grads (->> model m/grad-arrays (map first))]
+    (is (= [["softmaxoutput0_output" (shape/->shape [8 10])]]
+           output-shapes))
+    (is (= [32 10] (-> outputs ndarray/shape-vec)))
+    (is (every? #(<= 0.0 % 1.0) (-> outputs ndarray/->vec)))
+    (is (= [[32 3 5 5] [32]   ; convolution1 weights+bias
+            [32 32 5 5] [32]  ; convolution2 weights+bias
+            [32 32 3 3] [32]  ; convolution3 weights+bias
+            [32 32 3 3] [32]  ; convolution4 weights+bias
+            [256 28672] [256] ; fully-connected1 weights+bias
+            [10 256] [10]     ; 1st label scores
+            [10 256] [10]     ; 2nd label scores
+            [10 256] [10]     ; 3rd label scores
+            [10 256] [10]]    ; 4th label scores
+           (map ndarray/shape-vec grads)))))
+
+(deftest test-accuracy
+  (let [labels (ndarray/array [1 2 3 4,
+                               5 6 7 8]
+                              [2 4])
+        pred-labels (ndarray/array [1 0,
+                                    2 6,
+                                    3 0,
+                                    4 8]
+                                   [8])
+        preds (ndarray/one-hot pred-labels 10)]
+    (is (float? (accuracy labels preds)))
+    (is (float? (accuracy labels preds :by-character false)))
+    (is (float? (accuracy labels preds :by-character true)))
+    (is (= 0.5 (accuracy labels preds)))
+    (is (= 0.5 (accuracy labels preds :by-character false)))
+    (is (= 0.75 (accuracy labels preds :by-character true)))))
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/.gitignore b/contrib/clojure-package/examples/infer/imageclassifier/.gitignore
new file mode 100644
index 000000000000..35491f1a084a
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/.gitignore
@@ -0,0 +1,12 @@
+/target
+/classes
+/checkouts
+/images
+pom.xml
+pom.xml.asc
+*.jar
+*.class
+/.lein-*
+/.nrepl-port
+.hgignore
+.hg/
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/README.md b/contrib/clojure-package/examples/infer/imageclassifier/README.md
new file mode 100644
index 000000000000..a8328607c9a2
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/README.md
@@ -0,0 +1,24 @@
+# imageclassifier
+
+Run image classification using clojure infer package.
+
+## Installation
+
+Before you run this example, make sure that you have the clojure package installed.
+In the main clojure package directory, do `lein install`. Then you can run
+`lein install` in this directory.
+
+## Usage
+
+```
+$ chmod +x scripts/get_resnet_18_data.sh
+$ ./scripts/get_resnet_18_data.sh
+$
+$ lein run -- --help
+$ lein run -- -m models/resnet-18/resnet-18 -i images/kitten.jpg -d images/
+$
+$ lein uberjar
+$ java -jar target/imageclassifier-0.1.0-SNAPSHOT-standalone.jar --help
+$ java -jar target/imageclassifier-0.1.0-SNAPSHOT-standalone.jar \
+    -m models/resnet-18/resnet-18 -i images/kitten.jpg -d images/
+```
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/project.clj b/contrib/clojure-package/examples/infer/imageclassifier/project.clj
new file mode 100644
index 000000000000..2d5b171d9ab7
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/project.clj
@@ -0,0 +1,25 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(defproject imageclassifier "0.1.0-SNAPSHOT"
+  :description "Image classification using infer with MXNet"
+  :plugins [[lein-cljfmt "0.5.7"]]
+  :dependencies [[org.clojure/clojure "1.9.0"]
+                 [org.clojure/tools.cli "0.4.1"] ; parse-opts, used by the example's -main
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] ; SNAPSHOT: `lein install` the clojure package first (see README)
+  :main ^:skip-aot infer.imageclassifier-example ; namespace that defines -main
+  :profiles {:uberjar {:aot :all}})
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh
new file mode 100755
index 000000000000..1a142e8edbfd
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -evx
+
+MXNET_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+data_path=$MXNET_ROOT/models/resnet-18/
+
+image_path=$MXNET_ROOT/images/
+
+if [ ! -d "$data_path" ]; then
+    mkdir -p "$data_path"
+fi
+
+if [ ! -d "$image_path" ]; then
+    mkdir -p "$image_path"
+fi
+
+if [ ! -f "$data_path/resnet-18-0000.params" ]; then
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -P $data_path
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -P $data_path
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -P $data_path
+fi
+
+if [ ! -f "$image_path/kitten.jpg" ]; then
+    wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
+    wget https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg -P $image_path
+fi
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh
new file mode 100755
index 000000000000..fcef59bacc6f
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+MXNET_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+data_path=$MXNET_ROOT/models/resnet-152/
+
+image_path=$MXNET_ROOT/images/
+
+if [ ! -d "$data_path" ]; then
+  mkdir -p "$data_path"
+fi
+
+if [ ! -d "$image_path" ]; then
+  mkdir -p "$image_path"
+fi
+
+if [ ! -f "$data_path/resnet-152-0000.params" ]; then
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-0000.params -P $data_path
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-symbol.json -P $data_path
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/synset.txt -P $data_path
+fi
+
+if [ ! -f "$image_path/kitten.jpg" ]; then
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
+fi
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj b/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj
new file mode 100644
index 000000000000..4ec7ff7f1490
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj
@@ -0,0 +1,112 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.imageclassifier-example
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.string :refer [join]]
+            [clojure.tools.cli :refer [parse-opts]])
+  (:gen-class))
+
+(defn check-valid-dir
+  "Returns true when `input-dir` names an existing directory, false otherwise."
+  [input-dir]
+  (let [candidate (io/file input-dir)]
+    (if (.exists candidate)
+      (.isDirectory candidate)
+      false)))
+
+(defn check-valid-file
+  "Returns true when the path `input-file` exists on disk, false otherwise."
+  [input-file]
+  (-> input-file io/file .exists))
+
+(def cli-options ; option specs handed to parse-opts in -main
+  [["-m" "--model-path-prefix PREFIX" "Model path prefix"
+    :default "models/resnet-18/resnet-18"
+    :validate [#(check-valid-file (str % "-symbol.json")) ; prefix is accepted when <prefix>-symbol.json exists
+               "Model path prefix is invalid"]]
+   ["-i" "--input-image IMAGE" "Input image"
+    :default "images/kitten.jpg"
+    :validate [check-valid-file "Input file not found"]]
+   ["-d" "--input-dir IMAGE_DIR" "Input directory"
+    :default "images/"
+    :validate [check-valid-dir "Input directory not found"]]
+   ["-h" "--help"]]) ; -main prints the usage summary when this flag is set
+
+(defn print-predictions
+  "Writes each [label probability] pair to stdout between two 80-char `=` rules."
+  [predictions]
+  (let [rule (apply str (repeat 80 "="))]
+    (println rule)
+    (run! (fn [[label p]] (println (format "Class: %s Probability=%.8f" label p))) predictions)
+    (println rule)))
+
+(defn classify-single-image
+  "Loads `input-image` from disk, classifies it with `classifier`, and returns
+  the first element of the top-5 result. Nothing is printed here."
+  [classifier input-image]
+  (-> classifier
+      (infer/classify-image (infer/load-image-from-file input-image) 5)
+      (nth 0 nil)))
+
+(defn classify-images-in-dir
+  "Recursively collects the `.jpg` files under `input-dir`, classifies them in
+  batches of up to 20 paths, and returns the batch results concatenated into a
+  single lazy sequence (top-5 requested for every image)."
+  [classifier input-dir]
+  ;; The path vector is built eagerly; classification of each batch stays lazy.
+  (let [jpg-paths (into []
+                        (comp (filter #(.isFile %))
+                              (map #(.getPath %))
+                              (filter #(re-matches #".*\.jpg$" %)))
+                        (file-seq (io/file input-dir)))
+        classify-batch (fn [paths]
+                         (infer/classify-image-batch
+                          classifier
+                          (infer/load-image-paths paths)
+                          5))]
+    (mapcat classify-batch (partition-all 20 jpg-paths))))
+
+(defn run-classifier
+  "Builds an image classifier from `options`, then prints the predictions for
+  :input-image and for each image found under :input-dir."
+  [options]
+  (let [input-descriptor {:name "data"
+                          :shape [1 3 224 224]
+                          :layout layout/NCHW
+                          :dtype dtype/FLOAT32}
+        classifier (-> (:model-path-prefix options)
+                       (infer/model-factory [input-descriptor])
+                       (infer/create-image-classifier
+                        {:contexts [(context/default-context)]}))]
+    (println "Classifying a single image")
+    (print-predictions (classify-single-image classifier (:input-image options)))
+    (println "Classifying images in a directory")
+    (run! print-predictions (classify-images-in-dir classifier (:input-dir options)))))
+
+(defn -main
+  "CLI entry point: prints the usage summary for --help, prints any option
+  parsing errors (one per line), and otherwise runs the classifier."
+  [& args]
+  (let [{:keys [options summary errors]} (parse-opts args cli-options)]
+    (if (:help options)
+      (println summary)
+      (if (nil? errors) (run-classifier options) (println (join "\n" errors))))))
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj b/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj
new file mode 100644
index 000000000000..5b3e08d134f8
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj
@@ -0,0 +1,69 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.imageclassifier-example-test
+  (:require [infer.imageclassifier-example :refer [classify-single-image
+                                                   classify-images-in-dir]]
+            [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.test :refer :all]))
+
+(def model-dir "models/")
+(def image-dir "images/")
+(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
+(def image-file (str image-dir "kitten.jpg"))
+;; Runs at namespace load: invoke the download script when the model's symbol file is missing (paths are relative to the working directory).
+(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
+  (sh "./scripts/get_resnet_18_data.sh"))
+
+(defn create-classifier
+  "Builds an image classifier for the model at `model-path-prefix` with a single 1x3x224x224 input named \"data\"."
+  []
+  (->> [{:name "data" :shape [1 3 224 224]
+         :layout layout/NCHW :dtype dtype/FLOAT32}]
+       (infer/model-factory model-path-prefix)
+       infer/create-image-classifier))
+
+(deftest test-single-classification ; classifies images/kitten.jpg and checks shape, types and labels
+  (let [classifier (create-classifier)
+        predictions (classify-single-image classifier image-file)]
+    (is (some? predictions))
+    (is (= 5 (count predictions))) ; classify-single-image requests the top 5
+    (is (every? #(= 2 (count %)) predictions)) ; each prediction is a [label probability] pair
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(float? (second %)) predictions))
+    (is (every? #(< 0 (second %) 1) predictions)) ; probabilities lie strictly between 0 and 1
+    (is (= ["n02123159 tiger cat" ; expected labels, in the order returned
+            "n02124075 Egyptian cat"
+            "n02123045 tabby, tabby cat"
+            "n02127052 lynx, catamount"
+            "n02128757 snow leopard, ounce, Panthera uncia"]
+           (map first predictions)))))
+
+(deftest test-batch-classification ; classifies every jpg under image-dir and checks the first result's shape
+  (let [classifier (create-classifier)
+        batch-predictions (classify-images-in-dir classifier image-dir)
+        predictions (first batch-predictions)] ; presumably the predictions for the first image found
+    (is (some? batch-predictions))
+    (is (= 5 (count predictions))) ; classify-images-in-dir requests the top 5
+    (is (every? #(= 2 (count %)) predictions)) ; each prediction is a [label probability] pair
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(float? (second %)) predictions))
+    (is (every? #(< 0 (second %) 1) predictions)))) ; probabilities lie strictly between 0 and 1
diff --git a/contrib/clojure-package/examples/infer/objectdetector/.gitignore b/contrib/clojure-package/examples/infer/objectdetector/.gitignore
new file mode 100644
index 000000000000..35491f1a084a
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/.gitignore
@@ -0,0 +1,12 @@
+/target
+/classes
+/checkouts
+/images
+pom.xml
+pom.xml.asc
+*.jar
+*.class
+/.lein-*
+/.nrepl-port
+.hgignore
+.hg/
diff --git a/contrib/clojure-package/examples/infer/objectdetector/README.md b/contrib/clojure-package/examples/infer/objectdetector/README.md
new file mode 100644
index 000000000000..921c53e046d3
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/README.md
@@ -0,0 +1,24 @@
+# objectdetector
+
+Run object detection on images using the Clojure infer package.
+
+## Installation
+
+Before running this example, make sure that the Clojure package is installed:
+run `lein install` in the main Clojure package directory, and then run
+`lein install` in this directory.
+
+## Usage
+
+```
+$ chmod +x scripts/get_ssd_data.sh
+$ ./scripts/get_ssd_data.sh
+$
+$ lein run -- --help
+$ lein run -- -m models/resnet50_ssd/resnet50_ssd_model -i images/dog.jpg -d images/
+$
+$ lein uberjar
+$ java -jar target/objectdetector-0.1.0-SNAPSHOT-standalone.jar --help
+$ java -jar target/objectdetector-0.1.0-SNAPSHOT-standalone.jar \
+    -m models/resnet50_ssd/resnet50_ssd_model -i images/dog.jpg -d images/
+```
diff --git a/contrib/clojure-package/examples/infer/objectdetector/project.clj b/contrib/clojure-package/examples/infer/objectdetector/project.clj
new file mode 100644
index 000000000000..4501f14a358e
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/project.clj
@@ -0,0 +1,25 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(defproject objectdetector "0.1.0-SNAPSHOT"
+  :description "Object detection using infer with MXNet"
+  :plugins [[lein-cljfmt "0.5.7"]]
+  :dependencies [[org.clojure/clojure "1.9.0"]
+                 [org.clojure/tools.cli "0.4.1"] ; parse-opts, used by the example's -main
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] ; SNAPSHOT: `lein install` the clojure package first (see README)
+  :main ^:skip-aot infer.objectdetector-example ; namespace that defines -main
+  :profiles {:uberjar {:aot :all}})
diff --git a/contrib/clojure-package/examples/infer/objectdetector/scripts/get_ssd_data.sh b/contrib/clojure-package/examples/infer/objectdetector/scripts/get_ssd_data.sh
new file mode 100755
index 000000000000..06440a28452e
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/scripts/get_ssd_data.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+set -e
+
+MXNET_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+data_path=$MXNET_ROOT/models/resnet50_ssd
+
+image_path=$MXNET_ROOT/images
+
+if [ ! -d "$data_path" ]; then
+  mkdir -p "$data_path"
+fi
+
+if [ ! -d "$image_path" ]; then
+  mkdir -p "$image_path"
+fi
+
+if [ ! -f "$data_path/resnet50_ssd_model-0000.params" ]; then
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-symbol.json -P $data_path
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-0000.params -P $data_path
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/synset.txt -P $data_path
+fi
+
+if [ ! -f "$image_path/000001.jpg" ]; then
+    cd $image_path
+    wget https://cloud.githubusercontent.com/assets/3307514/20012566/cbb53c76-a27d-11e6-9aaa-91939c9a1cd5.jpg -O 000001.jpg
+    wget https://cloud.githubusercontent.com/assets/3307514/20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg -O dog.jpg
+    wget https://cloud.githubusercontent.com/assets/3307514/20012563/cbb41382-a27d-11e6-92a9-18dab4fd1ad3.jpg -O person.jpg
+fi
+
diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj
new file mode 100644
index 000000000000..53172f0c8cad
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj
@@ -0,0 +1,121 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.objectdetector-example
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.string :refer [join]]
+            [clojure.tools.cli :refer [parse-opts]])
+  (:gen-class))
+
+(defn check-valid-dir
+  "Returns true when `input-dir` names an existing directory, false otherwise."
+  [input-dir]
+  (let [candidate (io/file input-dir)]
+    (if (.exists candidate)
+      (.isDirectory candidate)
+      false)))
+
+(defn check-valid-file
+  "Returns true when the path `input-file` exists on disk, false otherwise."
+  [input-file]
+  (-> input-file io/file .exists))
+
+(def cli-options ; option specs handed to parse-opts in -main
+  [["-m" "--model-path-prefix PREFIX" "Model path prefix"
+    :default "models/resnet50_ssd/resnet50_ssd_model"
+    :validate [#(check-valid-file (str % "-symbol.json")) ; prefix is accepted when <prefix>-symbol.json exists
+               "Model path prefix is invalid"]]
+   ["-i" "--input-image IMAGE" "Input image"
+    :default "images/dog.jpg"
+    :validate [check-valid-file "Input file not found"]]
+   ["-d" "--input-dir IMAGE_DIR" "Input directory"
+    :default "images/"
+    :validate [check-valid-dir "Input directory not found"]]
+   ["-h" "--help"]]) ; -main prints the usage summary when this flag is set
+
+(defn print-predictions
+  "Prints each [label prob-and-bounds] prediction between two 80-char `=` rules.
+  Element 0 of the array is printed as the probability; elements 1-4 are scaled
+  by width, height, width and height respectively and printed as coordinates."
+  [predictions width height]
+  (let [rule (apply str (repeat 80 "="))
+        scaled (fn [arr idx dim] (* (aget arr idx) dim))]
+    (println rule)
+    (doseq [[label pb] predictions]
+      (println (format "Class: %s Prob=%.5f Coords=(%.3f, %.3f, %.3f, %.3f)"
+                       label (aget pb 0)
+                       (scaled pb 1 width) (scaled pb 2 height)
+                       (scaled pb 3 width) (scaled pb 4 height))))
+    (println rule)))
+
+(defn detect-single-image
+  "Loads `input-image` from disk, runs `detector` on it, and returns the first
+  element of the top-5 result. Nothing is printed here."
+  [detector input-image]
+  (-> detector
+      (infer/detect-objects (infer/load-image-from-file input-image) 5)
+      (nth 0 nil)))
+
+(defn detect-images-in-dir
+  "Recursively collects the `.jpg` files under `input-dir`, runs detection on
+  them in batches of up to 20 paths, and returns the batch results concatenated
+  into a single lazy sequence (top-5 requested for every image)."
+  [detector input-dir]
+  ;; The path vector is built eagerly; detection on each batch stays lazy.
+  (let [jpg-paths (into []
+                        (comp (filter #(.isFile %))
+                              (map #(.getPath %))
+                              (filter #(re-matches #".*\.jpg$" %)))
+                        (file-seq (io/file input-dir)))
+        detect-batch (fn [paths]
+                       (infer/detect-objects-batch
+                        detector
+                        (infer/load-image-paths paths)
+                        5))]
+    (mapcat detect-batch (partition-all 20 jpg-paths))))
+
+(defn run-detector
+  "Builds an object detector for the model at :model-path-prefix and prints the
+  detections for :input-image, then for every image found under :input-dir."
+  [options]
+  (let [{:keys [model-path-prefix input-image input-dir]} options
+        ;; Input size declared to the model; also used to scale printed coordinates.
+        width 512 height 512
+        descriptors [{:name "data"
+                      :shape [1 3 height width]
+                      :layout layout/NCHW
+                      :dtype dtype/FLOAT32}]
+        factory (infer/model-factory model-path-prefix descriptors)
+        detector (infer/create-object-detector
+                  factory {:contexts [(context/default-context)]})]
+    (println "Object detection on a single image")
+    (print-predictions (detect-single-image detector input-image) width height)
+    (println "Object detection on images in a directory")
+    (doseq [predictions (detect-images-in-dir detector input-dir)]
+      (print-predictions predictions width height))))
+
+(defn -main
+  "CLI entry point: prints the usage summary for --help, prints any option
+  parsing errors (one per line), and otherwise runs the detector."
+  [& args]
+  (let [{:keys [options summary errors]} (parse-opts args cli-options)]
+    (if (:help options)
+      (println summary)
+      (if (nil? errors) (run-detector options) (println (join "\n" errors))))))
diff --git a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj
new file mode 100644
index 000000000000..90ed02f67a73
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj
@@ -0,0 +1,65 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.objectdetector-example-test
+  (:require [infer.objectdetector-example :refer [detect-single-image
+                                                  detect-images-in-dir]]
+            [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.test :refer :all]))
+
+(def model-dir "models/")
+(def image-dir "images/")
+(def model-path-prefix (str model-dir "resnet50_ssd/resnet50_ssd_model"))
+(def image-file (str image-dir "dog.jpg"))
+;; Runs at namespace load: invoke the download script when the model's symbol file is missing (paths are relative to the working directory).
+(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
+  (sh "./scripts/get_ssd_data.sh"))
+
+(defn create-detector
+  "Builds an object detector for the model at `model-path-prefix` with a single 1x3x512x512 input named \"data\"."
+  []
+  (->> [{:name "data" :shape [1 3 512 512]
+         :layout layout/NCHW :dtype dtype/FLOAT32}]
+       (infer/model-factory model-path-prefix)
+       infer/create-object-detector))
+
+(deftest test-single-detection ; runs detection on images/dog.jpg and checks shape, types and labels
+  (let [detector (create-detector)
+        predictions (detect-single-image detector image-file)]
+    (is (some? predictions))
+    (is (= 5 (count predictions))) ; detect-single-image requests the top 5
+    (is (every? #(= 2 (count %)) predictions)) ; each prediction is a [label prob-and-bounds] pair
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(= 5 (count (second %))) predictions)) ; five numbers: probability plus four coordinates
+    (is (every? #(< 0 (first (second %)) 1) predictions)) ; probability lies strictly between 0 and 1
+    (is (= ["car" "bicycle" "dog" "bicycle" "person"] ; expected labels, in the order returned
+           (map first predictions)))))
+
+(deftest test-batch-detection ; runs detection on every jpg under image-dir and checks the first result's shape
+  (let [detector (create-detector)
+        batch-predictions (detect-images-in-dir detector image-dir)
+        predictions (first batch-predictions)] ; presumably the detections for the first image found
+    (is (some? batch-predictions))
+    (is (= 5 (count predictions))) ; detect-images-in-dir requests the top 5
+    (is (every? #(= 2 (count %)) predictions)) ; each prediction is a [label prob-and-bounds] pair
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(= 5 (count (second %))) predictions)) ; five numbers: probability plus four coordinates
+    (is (every? #(< 0 (first (second %)) 1) predictions)))) ; probability lies strictly between 0 and 1
diff --git a/contrib/clojure-package/examples/infer/predictor/.gitignore b/contrib/clojure-package/examples/infer/predictor/.gitignore
new file mode 100644
index 000000000000..35491f1a084a
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/.gitignore
@@ -0,0 +1,12 @@
+/target
+/classes
+/checkouts
+/images
+pom.xml
+pom.xml.asc
+*.jar
+*.class
+/.lein-*
+/.nrepl-port
+.hgignore
+.hg/
diff --git a/contrib/clojure-package/examples/infer/predictor/README.md b/contrib/clojure-package/examples/infer/predictor/README.md
new file mode 100644
index 000000000000..9ca71cf469a0
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/README.md
@@ -0,0 +1,24 @@
+# predictor
+
+Run model prediction using the Clojure infer package.
+
+## Installation
+
+Before running this example, make sure that the Clojure package is installed:
+run `lein install` in the main Clojure package directory, and then run
+`lein install` in this directory.
+
+## Usage
+
+```
+$ chmod +x scripts/get_resnet_18_data.sh
+$ ./scripts/get_resnet_18_data.sh
+$
+$ lein run -- --help
+$ lein run -- -m models/resnet-18/resnet-18 -i images/kitten.jpg
+$
+$ lein uberjar
+$ java -jar target/predictor-0.1.0-SNAPSHOT-standalone.jar --help
+$ java -jar target/predictor-0.1.0-SNAPSHOT-standalone.jar \
+    -m models/resnet-18/resnet-18 -i images/kitten.jpg
+```
diff --git a/contrib/clojure-package/examples/infer/predictor/project.clj b/contrib/clojure-package/examples/infer/predictor/project.clj
new file mode 100644
index 000000000000..0bd1eaee671d
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/project.clj
@@ -0,0 +1,25 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(defproject predictor "0.1.0-SNAPSHOT"
+  :description "Model prediction using infer with MXNet"
+  :plugins [[lein-cljfmt "0.5.7"]]
+  :dependencies [[org.clojure/clojure "1.9.0"]
+                 [org.clojure/tools.cli "0.4.1"]
+                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] ; SNAPSHOT: `lein install` the clojure package first (see README)
+  :main ^:skip-aot infer.predictor-example ; NOTE(review): expected to define -main in src/infer/predictor_example.clj — confirm
+  :profiles {:uberjar {:aot :all}})
diff --git a/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_18_data.sh b/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_18_data.sh
new file mode 100755
index 000000000000..cf85355fae2d
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_18_data.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -evx
+
+MXNET_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+data_path=$MXNET_ROOT/models/resnet-18/
+
+image_path=$MXNET_ROOT/images/
+
+if [ ! -d "$data_path" ]; then
+  mkdir -p "$data_path"
+fi
+
+if [ ! -d "$image_path" ]; then
+  mkdir -p "$image_path"
+fi
+
+if [ ! -f "$data_path/resnet-18-0000.params" ]; then
+  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -P $data_path
+  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -P $data_path
+  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -P $data_path
+fi
+
+if [ ! -f "$image_path/kitten.jpg" ]; then
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
+fi
diff --git a/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_data.sh b/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_data.sh
new file mode 100755
index 000000000000..fcef59bacc6f
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/scripts/get_resnet_data.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -e
+
+# Example root = parent of this script's directory, independent of the caller's cwd.
+MXNET_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+data_path="$MXNET_ROOT/models/resnet-152/"
+image_path="$MXNET_ROOT/images/"
+
+if [ ! -d "$data_path" ]; then
+  mkdir -p "$data_path"
+fi
+
+if [ ! -d "$image_path" ]; then
+  mkdir -p "$image_path"
+fi
+
+# The params file serves as the marker that the model was already downloaded.
+if [ ! -f "$data_path/resnet-152-0000.params" ]; then
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-0000.params -P "$data_path"
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-symbol.json -P "$data_path"
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/synset.txt -P "$data_path"
+fi
+
+if [ ! -f "$image_path/kitten.jpg" ]; then
+  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P "$image_path"
+fi
diff --git a/contrib/clojure-package/examples/infer/predictor/src/infer/predictor_example.clj b/contrib/clojure-package/examples/infer/predictor/src/infer/predictor_example.clj
new file mode 100644
index 000000000000..498964128dd8
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/src/infer/predictor_example.clj
@@ -0,0 +1,101 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.predictor-example
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.image :as image]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [clojure.java.io :as io]
+            [clojure.string :refer [join split]]
+            [clojure.tools.cli :refer [parse-opts]])
+  (:gen-class))
+
+(defn check-valid-file
+  "Returns true when `input-file` names an existing path on disk"
+  [input-file]
+  (-> input-file io/file .exists))
+
+(def cli-options
+  "Command-line option specs handed to clojure.tools.cli/parse-opts"
+  [["-m" "--model-path-prefix PREFIX" "Model path prefix"
+    :default "models/resnet-18/resnet-18"
+    :validate [(fn [prefix] (check-valid-file (str prefix "-symbol.json")))
+               "Model path prefix is invalid"]]
+   ["-i" "--input-image IMAGE" "Image path" :default "images/kitten.jpg"
+    :validate [check-valid-file "Input image path not found"]]
+   ["-h" "--help"]])
+
+(defn print-prediction
+  "Prints `prediction` framed by two rules of 80 `=` characters"
+  [prediction]
+  (let [rule (apply str (repeat 80 "="))]
+    (doseq [line [rule prediction rule]] (println line))))
+
+(defn preprocess
+  "Loads, reshapes and converts an image into a [1 3 height width] NDArray"
+  [image-path width height]
+  (-> image-path
+      infer/load-image-from-file
+      (infer/reshape-image width height)
+      (infer/buffered-image-to-pixels [3 height width]) ; CHW, to match the NCHW descriptor
+      (ndarray/expand-dims 0)))
+
+(defn do-inference
+  "Feeds `image` to `predictor` and returns the first element of the result"
+  [predictor image]
+  (-> (infer/predict-with-ndarray predictor [image])
+      first))
+
+(defn postprocess
+  "Maps the arg-max of `predictions` to its line in the model's synset.txt"
+  [model-path-prefix predictions]
+  (let [model-dir (.getParent (io/file model-path-prefix))
+        labels (-> (io/file model-dir "synset.txt")
+                   slurp
+                   (split #"\n"))
+        [best-idx] (ndarray/->int-vec (ndarray/argmax predictions 1))]
+    (labels best-idx)))
+
+(defn run-predictor
+  "Builds a predictor from `options` and prints the best label for the image"
+  [options]
+  (let [model-path-prefix (:model-path-prefix options)
+        input-image (:input-image options)
+        [width height] [224 224]
+        descriptors [{:name "data"
+                      :shape [1 3 height width]
+                      :layout layout/NCHW
+                      :dtype dtype/FLOAT32}]
+        predictor (-> (infer/model-factory model-path-prefix descriptors)
+                      (infer/create-predictor
+                       {:contexts [(context/default-context)]}))
+        image-ndarray (preprocess input-image width height)]
+    (->> image-ndarray
+         (do-inference predictor)
+         (postprocess model-path-prefix)
+         print-prediction)))
+
+(defn -main
+  "Entry point: parses `args`, then prints the help text, the errors or a prediction"
+  [& args]
+  (let [{:keys [options summary errors]} (parse-opts args cli-options)]
+    (cond
+      (:help options) (println summary)
+      (nil? errors) (run-predictor options)
+      :else (println (join "\n" errors)))))
diff --git a/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj b/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj
new file mode 100644
index 000000000000..02f826fbb77f
--- /dev/null
+++ b/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj
@@ -0,0 +1,51 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns infer.predictor-example-test
+  (:require [infer.predictor-example :refer [preprocess
+                                             do-inference
+                                             postprocess]]
+            [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.test :refer :all]))
+
+(def model-dir "models/")
+(def image-file "images/kitten.jpg")
+(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
+(def width 224)
+(def height 224)
+;; Download fixtures when the model symbol file or the test image is missing
+(when-not (every? #(.exists (io/file %)) [(str model-path-prefix "-symbol.json") image-file])
+  (sh "./scripts/get_resnet_18_data.sh"))
+
+(defn create-predictor
+  "Builds a predictor for the test model using the factory's default options"
+  []
+  (-> (infer/model-factory model-path-prefix
+                           [{:name "data" :shape [1 3 height width]
+                             :layout layout/NCHW :dtype dtype/FLOAT32}])
+      infer/create-predictor))
+
+;; End-to-end check: the kitten image must be labelled as a tiger cat
+(deftest predictor-test
+  (let [label (->> (preprocess image-file width height)
+                   (do-inference (create-predictor))
+                   (postprocess model-path-prefix))]
+    (is (= "n02123159 tiger cat" label))))
diff --git a/contrib/clojure-package/integration-tests.sh b/contrib/clojure-package/integration-tests.sh
index 3297fdc2c329..6e5868712026 100755
--- a/contrib/clojure-package/integration-tests.sh
+++ b/contrib/clojure-package/integration-tests.sh
@@ -18,11 +18,11 @@
 
 set -evx
 
-MXNET_HOME=${PWD}
+MXNET_HOME=$(cd "$(dirname "$0")/../.." && pwd)  # repo root; aborts under set -e if cd fails
 EXAMPLES_HOME=${MXNET_HOME}/contrib/clojure-package/examples
 #cd ${MXNET_HOME}/contrib/clojure-package
 #lein test
 #lein cloverage --codecov
-for i in `find ${EXAMPLES_HOME} -name test` ; do
-cd ${i} && lein test
+for test_dir in `find "${EXAMPLES_HOME}" -type d -name test` ; do
+  (cd "${test_dir}" && lein test)  # subshell: a failed cd now aborts under set -e too
 done
diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj
index 12a0504e02d5..c4428ce6eff4 100644
--- a/contrib/clojure-package/project.clj
+++ b/contrib/clojure-package/project.clj
@@ -29,7 +29,7 @@
                  ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"]
 
                  ;;; CI
-                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.5.0-SNAPSHOT"]
+                 [org.apache.mxnet/mxnet-full_2.11 "INTERNAL"]
 
                  [org.clojure/tools.logging "0.4.0"]
                  [org.apache.logging.log4j/log4j-core "2.8.1"]
diff --git a/contrib/clojure-package/scripts/infer/get_resnet_18_data.sh b/contrib/clojure-package/scripts/infer/get_resnet_18_data.sh
new file mode 100755
index 000000000000..601f362c4159
--- /dev/null
+++ b/contrib/clojure-package/scripts/infer/get_resnet_18_data.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -evx
+# Store data under $MXNET_HOME when set, otherwise two directories above this script.
+if [ -n "$MXNET_HOME" ]; then
+  data_path="$MXNET_HOME/data"
+else
+  MXNET_ROOT=$(cd "$(dirname "$0")/../.." && pwd)
+  data_path="$MXNET_ROOT/data"
+fi
+
+if [ ! -d "$data_path" ]; then
+    mkdir -p "$data_path"
+fi
+
+resnet_18_data_path="$data_path/resnet-18"
+if [ ! -f "$resnet_18_data_path/resnet-18-0000.params" ]; then
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -P "$resnet_18_data_path"
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -P "$resnet_18_data_path"
+    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -P "$resnet_18_data_path"
+fi
diff --git a/contrib/clojure-package/scripts/infer/get_ssd_data.sh b/contrib/clojure-package/scripts/infer/get_ssd_data.sh
new file mode 100755
index 000000000000..96e27a12d280
--- /dev/null
+++ b/contrib/clojure-package/scripts/infer/get_ssd_data.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+set -evx
+# Store data under $MXNET_HOME when set, otherwise two directories above this script.
+if [ -n "$MXNET_HOME" ]; then
+  data_path="$MXNET_HOME/data"
+else
+  MXNET_ROOT=$(cd "$(dirname "$0")/../.." && pwd)
+  data_path="$MXNET_ROOT/data"
+fi
+
+if [ ! -d "$data_path" ]; then
+    mkdir -p "$data_path"
+fi
+
+resnet50_ssd_data_path="$data_path/resnet50_ssd"
+if [ ! -f "$resnet50_ssd_data_path/resnet50_ssd_model-0000.params" ]; then
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-symbol.json -P "$resnet50_ssd_data_path"
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-0000.params -P "$resnet50_ssd_data_path"
+  wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/synset.txt -P "$resnet50_ssd_data_path"
+fi
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj
index 6e726eba9da6..e2e87ed47e2f 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/image.clj
@@ -62,8 +62,8 @@
    (util/validate! ::optional-color-flag color-flag "Invalid color flag")
    (util/validate! ::optional-to-rgb to-rgb "Invalid conversion flag")
    (util/validate! ::output output "Invalid output")
-   (Image/imRead 
-    filename 
+   (Image/imRead
+    filename
     ($/option color-flag)
     ($/option to-rgb)
     ($/option output)))
@@ -89,7 +89,7 @@
 
 (defn apply-border
   "Pad image border"
-  ([input top bottom left right 
+  ([input top bottom left right
     {:keys [fill-type value values output]
      :or {fill-type nil value nil values nil output nil}
      :as opts}]
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/infer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/infer.clj
new file mode 100644
index 000000000000..224a39275dac
--- /dev/null
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/infer.clj
@@ -0,0 +1,345 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.infer
+  (:refer-clojure :exclude [type])
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.io :as mx-io]
+            [org.apache.clojure-mxnet.shape :as shape]
+            [org.apache.clojure-mxnet.util :as util]
+            [clojure.spec.alpha :as s])
+  (:import (java.awt.image BufferedImage)
+           (org.apache.mxnet NDArray)
+           (org.apache.mxnet.infer Classifier ImageClassifier
+                                   ObjectDetector Predictor)))
+
+(s/def ::predictor #(instance? Predictor %))
+(s/def ::classifier #(instance? Classifier %))
+(s/def ::image-classifier #(instance? ImageClassifier %))
+(s/def ::object-detector #(instance? ObjectDetector %))
+;; Records that wrap the Scala inference objects so the protocols can dispatch on them
+(defrecord WrappedPredictor [predictor])
+(defrecord WrappedClassifier [classifier])
+(defrecord WrappedImageClassifier [image-classifier])
+(defrecord WrappedObjectDetector [object-detector])
+;; Input specs; `some->` makes a nil element fail validation instead of throwing an NPE
+(s/def ::ndarray #(instance? NDArray %))
+(s/def ::float-array (s/and #(some-> % class .isArray) #(every? float? %)))
+(s/def ::vec-of-float-arrays (s/coll-of ::float-array :kind vector?))
+(s/def ::vec-of-ndarrays (s/coll-of ::ndarray :kind vector?))
+;; A wrapper is valid when it carries the matching Scala object under its key
+(s/def ::wrapped-predictor (s/keys :req-un [::predictor]))
+(s/def ::wrapped-classifier (s/keys :req-un [::classifier]))
+(s/def ::wrapped-image-classifier (s/keys :req-un [::image-classifier]))
+(s/def ::wrapped-detector (s/keys :req-un [::object-detector]))
+
+(defprotocol APredictor
+  (predict [wrapped-predictor inputs])
+  (predict-with-ndarray [wrapped-predictor input-arrays]))
+;; Classification of raw inputs; the longer arity takes an optional top-K count
+(defprotocol AClassifier
+  (classify
+    [wrapped-classifier inputs]
+    [wrapped-classifier inputs topk])
+  (classify-with-ndarray
+    [wrapped-classifier inputs]
+    [wrapped-classifier inputs topk]))
+;; Classification of images, one at a time or as a batch, with optional top-K and dtype
+(defprotocol AImageClassifier
+  (classify-image
+    [wrapped-image-classifier image]
+    [wrapped-image-classifier image topk]
+    [wrapped-image-classifier image topk dtype])
+  (classify-image-batch
+    [wrapped-image-classifier images]
+    [wrapped-image-classifier images topk]
+    [wrapped-image-classifier images topk dtype]))
+;; Object detection on a single image, an image batch, or NDArrays, with optional top-K
+(defprotocol AObjectDetector
+  (detect-objects
+    [wrapped-detector image]
+    [wrapped-detector image topk])
+  (detect-objects-batch
+    [wrapped-detector images]
+    [wrapped-detector images topk])
+  (detect-objects-with-ndarrays
+    [wrapped-detector input-arrays]
+    [wrapped-detector input-arrays topk]))
+
+;; APredictor for WrappedPredictor: validate the arguments, call the wrapped
+;; Scala predictor, then coerce the result back into Clojure data.
+(extend-protocol APredictor
+  WrappedPredictor
+  (predict [wrapped-predictor inputs]
+    (util/validate! ::wrapped-predictor wrapped-predictor
+                    "Invalid predictor")
+    (util/validate! ::vec-of-float-arrays inputs "Invalid inputs")
+    (-> (:predictor wrapped-predictor)
+        (.predict (util/vec->indexed-seq inputs))
+        util/coerce-return-recursive))
+  (predict-with-ndarray [wrapped-predictor input-arrays]
+    (util/validate! ::wrapped-predictor wrapped-predictor
+                    "Invalid predictor")
+    (util/validate! ::vec-of-ndarrays input-arrays
+                    "Invalid input arrays")
+    (-> (:predictor wrapped-predictor)
+        (.predictWithNDArray (util/vec->indexed-seq input-arrays))
+        util/coerce-return-recursive)))
+
+(s/def ::nil-or-int (s/nilable int?))
+
+;; AClassifier implementations. The full-arity methods validate their
+;; arguments, call the wrapped Scala classifier and coerce the result back
+;; to Clojure data; the two-argument arities delegate with a nil top-K.
+;; `topk` is handed to Scala through util/->int-option.
+(extend-protocol AClassifier
+  ;; Plain classifier wrapper
+  WrappedClassifier
+  (classify
+    ([wrapped inputs]
+     (classify wrapped inputs nil))
+    ([wrapped inputs topk]
+     (util/validate! ::wrapped-classifier wrapped "Invalid classifier")
+     (util/validate! ::vec-of-float-arrays inputs "Invalid inputs")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:classifier wrapped)
+         (.classify (util/vec->indexed-seq inputs)
+                    (util/->int-option topk))
+         util/coerce-return-recursive)))
+  (classify-with-ndarray
+    ([wrapped inputs]
+     (classify-with-ndarray wrapped inputs nil))
+    ([wrapped inputs topk]
+     (util/validate! ::wrapped-classifier wrapped "Invalid classifier")
+     (util/validate! ::vec-of-ndarrays inputs "Invalid inputs")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:classifier wrapped)
+         (.classifyWithNDArray (util/vec->indexed-seq inputs)
+                               (util/->int-option topk))
+         util/coerce-return-recursive)))
+  ;; An image classifier also accepts pre-built float arrays and NDArrays
+  WrappedImageClassifier
+  (classify
+    ([wrapped inputs]
+     (classify wrapped inputs nil))
+    ([wrapped inputs topk]
+     (util/validate! ::wrapped-image-classifier wrapped
+                     "Invalid classifier")
+     (util/validate! ::vec-of-float-arrays inputs "Invalid inputs")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:image-classifier wrapped)
+         (.classify (util/vec->indexed-seq inputs)
+                    (util/->int-option topk))
+         util/coerce-return-recursive)))
+  (classify-with-ndarray
+    ([wrapped inputs]
+     (classify-with-ndarray wrapped inputs nil))
+    ([wrapped inputs topk]
+     (util/validate! ::wrapped-image-classifier wrapped
+                     "Invalid classifier")
+     (util/validate! ::vec-of-ndarrays inputs "Invalid inputs")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:image-classifier wrapped)
+         (.classifyWithNDArray (util/vec->indexed-seq inputs)
+                               (util/->int-option topk))
+         util/coerce-return-recursive))))
+
+(s/def ::image #(instance? BufferedImage %))
+(s/def ::dtype #{dtype/UINT8 dtype/INT32 dtype/FLOAT16 dtype/FLOAT32 dtype/FLOAT64})
+
+;; AImageClassifier for WrappedImageClassifier. Shorter arities fill in a
+;; nil top-K and dtype/FLOAT32 before delegating to the full arity.
+(extend-protocol AImageClassifier
+  WrappedImageClassifier
+  (classify-image
+    ([wrapped image]
+     (classify-image wrapped image nil dtype/FLOAT32))
+    ([wrapped image topk]
+     (classify-image wrapped image topk dtype/FLOAT32))
+    ([wrapped image topk dtype]
+     (util/validate! ::wrapped-image-classifier wrapped "Invalid classifier")
+     (util/validate! ::image image "Invalid image")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (util/validate! ::dtype dtype "Invalid dtype")
+     (-> (:image-classifier wrapped)
+         (.classifyImage image
+                         (util/->int-option topk)
+                         dtype)
+         util/coerce-return-recursive)))
+  (classify-image-batch
+    ([wrapped images]
+     (classify-image-batch wrapped images nil dtype/FLOAT32))
+    ([wrapped images topk]
+     (classify-image-batch wrapped images topk dtype/FLOAT32))
+    ([wrapped images topk dtype]
+     (util/validate! ::wrapped-image-classifier wrapped "Invalid classifier")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (util/validate! ::dtype dtype "Invalid dtype")
+     (-> (:image-classifier wrapped)
+         (.classifyImageBatch images
+                              (util/->int-option topk)
+                              dtype)
+         util/coerce-return-recursive))))
+
+;; AObjectDetector for WrappedObjectDetector. Each full-arity method
+;; validates its arguments, calls the wrapped Scala detector and coerces the
+;; result back to Clojure data; the short arities delegate with a nil top-K.
+;; NOTE(review): `images` in the batch method is passed through unvalidated.
+(extend-protocol AObjectDetector
+  WrappedObjectDetector
+  (detect-objects
+    ([wrapped image]
+     (detect-objects wrapped image nil))
+    ([wrapped image topk]
+     (util/validate! ::wrapped-detector wrapped "Invalid object detector")
+     (util/validate! ::image image "Invalid image")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:object-detector wrapped)
+         (.imageObjectDetect image
+                             (util/->int-option topk))
+         util/coerce-return-recursive)))
+  (detect-objects-batch
+    ([wrapped images]
+     (detect-objects-batch wrapped images nil))
+    ([wrapped images topk]
+     (util/validate! ::wrapped-detector wrapped "Invalid object detector")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:object-detector wrapped)
+         (.imageBatchObjectDetect images
+                                  (util/->int-option topk))
+         util/coerce-return-recursive)))
+  (detect-objects-with-ndarrays
+    ([wrapped input-arrays]
+     (detect-objects-with-ndarrays wrapped input-arrays nil))
+    ([wrapped input-arrays topk]
+     (util/validate! ::wrapped-detector wrapped "Invalid object detector")
+     (util/validate! ::vec-of-ndarrays input-arrays "Invalid inputs")
+     (util/validate! ::nil-or-int topk "Invalid top-K")
+     (-> (:object-detector wrapped)
+         (.objectDetectWithNDArray (util/vec->indexed-seq input-arrays)
+                                   (util/->int-option topk))
+         util/coerce-return-recursive))))
+
+(defprotocol AInferenceFactory ; each method takes the factory and an optional `opts` argument
+  (create-predictor [factory] [factory opts])
+  (create-classifier [factory] [factory opts])
+  (create-image-classifier [factory] [factory opts])
+  (create-object-detector [factory] [factory opts]))
+
+(defn convert-descriptors
+  "Turns each descriptor map into a data-desc and returns them as an indexed seq"
+  [descriptors]
+  (util/vec->indexed-seq (mapv mx-io/data-desc descriptors)))
+
+;; Factory record holding the model location and its input descriptors.
+;; Every create-* method comes in two arities: without options, and with an
+;; `opts` map whose :contexts (default: one CPU context) and :epoch
+;; (default: 0) entries are forwarded to the Scala constructor.
+(defrecord InferenceFactory [model-path-prefix input-descriptors]
+  AInferenceFactory
+  ;; Predictor: wrapped in WrappedPredictor so that APredictor
+  ;; dispatches on the result
+  (create-predictor [factory]
+    (create-predictor factory {}))
+  (create-predictor [factory opts]
+    (let [{:keys [contexts epoch]
+           :or {epoch 0 contexts [(context/cpu)]}} opts]
+      (->WrappedPredictor
+       (Predictor. model-path-prefix
+                   (convert-descriptors input-descriptors)
+                   (into-array contexts)
+                   (util/->int-option epoch)))))
+  ;; Classifier: wrapped in WrappedClassifier so that AClassifier
+  ;; dispatches on the result
+  (create-classifier [factory]
+    (create-classifier factory {}))
+  (create-classifier [factory opts]
+    (let [{:keys [contexts epoch]
+           :or {epoch 0 contexts [(context/cpu)]}} opts]
+      (->WrappedClassifier
+       (Classifier. model-path-prefix
+                    (convert-descriptors input-descriptors)
+                    (into-array contexts)
+                    (util/->int-option epoch)))))
+  ;; Image classifier: wrapped in WrappedImageClassifier, which implements
+  ;; both AClassifier and AImageClassifier
+  (create-image-classifier [factory]
+    (create-image-classifier factory {}))
+  (create-image-classifier [factory opts]
+    (let [{:keys [contexts epoch]
+           :or {epoch 0 contexts [(context/cpu)]}} opts]
+      (->WrappedImageClassifier
+       (ImageClassifier. model-path-prefix
+                         (convert-descriptors input-descriptors)
+                         (into-array contexts)
+                         (util/->int-option epoch)))))
+  ;; Object detector: wrapped in WrappedObjectDetector so that
+  ;; AObjectDetector dispatches on the result
+  (create-object-detector [factory]
+    (create-object-detector factory {}))
+  (create-object-detector [factory opts]
+    (let [{:keys [contexts epoch]
+           :or {epoch 0 contexts [(context/cpu)]}} opts]
+      (->WrappedObjectDetector
+       (ObjectDetector. model-path-prefix
+                        (convert-descriptors input-descriptors)
+                        (into-array contexts)
+                        (util/->int-option epoch))))))
+
+(s/def ::model-path-prefix string?)
+(s/def ::input-descriptors (s/coll-of ::mx-io/data-desc))
+
+(defn model-factory
+  "Returns an InferenceFactory for the model stored under `model-path-prefix`.
+  The factory can create predictors, classifiers, image classifiers and
+  object detectors whose inputs are described by `input-descriptors`."
+  [model-path-prefix input-descriptors]
+  (util/validate! ::model-path-prefix model-path-prefix
+                  "Invalid model path prefix")
+  (util/validate! ::input-descriptors input-descriptors "Invalid input descriptors")
+  (->InferenceFactory model-path-prefix input-descriptors))
+
+(defn reshape-image
+  "Reshapes `image` to the given integer `width` and `height`"
+  [image width height]
+  (util/validate! ::image image "Invalid image")
+  (doseq [[dim message] [[width "Invalid width"] [height "Invalid height"]]]
+    (util/validate! int? dim message))
+  (ImageClassifier/reshapeImage image width height))
+
+(defn buffered-image-to-pixels
+  "Converts a BufferedImage into a FLOAT32 NDArray of shape `input-shape-vec`"
+  [image input-shape-vec]
+  (util/validate! ::image image "Invalid image")
+  (util/validate! (s/coll-of int?) input-shape-vec "Invalid shape vector")
+  (-> image (ImageClassifier/bufferedImageToPixels (shape/->shape input-shape-vec) dtype/FLOAT32)))
+
+(s/def ::image-path string?)
+(s/def ::image-paths (s/coll-of ::image-path))
+
+(defn load-image-from-file
+  "Loads the image stored at `image-path`, which must be a string"
+  [image-path]
+  (util/validate! ::image-path image-path "Invalid image path")
+  (-> image-path ImageClassifier/loadImageFromFile))
+
+(defn load-image-paths
+  "Loads one image per entry of `image-paths`, a collection of path strings"
+  [image-paths]
+  (util/validate! ::image-paths image-paths "Invalid image paths")
+  (-> image-paths util/convert-vector ImageClassifier/loadInputBatch))
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/primitives.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/primitives.clj
new file mode 100644
index 000000000000..0967df2289d8
--- /dev/null
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/primitives.clj
@@ -0,0 +1,46 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.primitives
+  (:import (org.apache.mxnet MX_PRIMITIVES$MX_FLOAT MX_PRIMITIVES$MX_Double
+                             MX_PRIMITIVES$MX_PRIMITIVE_TYPE)))
+
+
+;;; Defines custom MXNet primitives that can be used for mathematical computations
+;;; in NDArrays to control precision. Currently Float and Double are supported.
+
+;;; For automatic conversion in ndarray functions, numbers default to doubles;
+;;; to use single precision you must pass a Float explicitly.
+
+(defn mx-float
+  "Wraps `num` in an MXNet float primitive (MX_FLOAT)"
+  [num]
+  (MX_PRIMITIVES$MX_FLOAT. num))
+
+(defn mx-double
+  "Wraps `num` in an MXNet double primitive (MX_Double)"
+  [num]
+  (MX_PRIMITIVES$MX_Double. num))
+
+(defn ->num
+  "Extracts the number held by an MXNet primitive through its `data` member"
+  [primitive]
+  (. primitive data))
+
+(defn primitive?
+  "True when `x` is an instance of MX_PRIMITIVE_TYPE" [x]
+  (instance? MX_PRIMITIVES$MX_PRIMITIVE_TYPE x))
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj
index 6f22b0eb3a0f..43970c0abd79 100644
--- a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj
@@ -19,6 +19,7 @@
   (:require [clojure.spec.alpha :as s]
             [t6.from-scala.core :refer [$ $$] :as $]
             [clojure.string :as string]
+            [org.apache.clojure-mxnet.primitives :as primitives]
             [org.apache.clojure-mxnet.shape :as mx-shape])
   (:import (org.apache.mxnet NDArray)
            (scala Product Tuple2 Tuple3)
@@ -36,7 +37,8 @@
                            "byte<>" "byte-array"
                            "java.lang.String<>" "vec-or-strings"
                            "org.apache.mxnet.NDArray" "ndarray"
-                           "org.apache.mxnet.Symbol" "sym"})
+                           "org.apache.mxnet.Symbol" "sym"
+                           "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE" "double-or-float"})
 
 (def symbol-param-coerce {"java.lang.String" "sym-name"
                           "float" "num"
@@ -66,6 +68,9 @@
 (defn ->option [v]
   ($ Option v))
 
+(defn ->int-option [v]
+  (->option (when v (int v))))
+
 (defn option->value [opt]
   ($/view opt))
 
@@ -141,6 +146,8 @@
     (and (get targets "int<>") (vector? param)) (int-array param)
     (and (get targets "float<>") (vector? param)) (float-array param)
     (and (get targets "java.lang.String<>") (vector? param)) (into-array param)
+    (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (instance? Float param)) (primitives/mx-float param)
+    (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (number? param)) (primitives/mx-double param)
     :else param))
 
 (defn nil-or-coerce-param [param targets]
@@ -174,8 +181,15 @@
     (instance? Map return-val) (scala-map->map return-val)
     (instance? Tuple2 return-val) (tuple->vec return-val)
     (instance? Tuple3 return-val) (tuple->vec return-val)
+    (primitives/primitive? return-val) (primitives/->num return-val)
     :else return-val))
 
+(defn coerce-return-recursive [return-val]
+  (let [coerced-val (coerce-return return-val)]
+    (if (vector? coerced-val)
+      (into [] (map coerce-return-recursive coerced-val))
+      coerced-val)))
+
 (defmacro scala-fn
   "Creates a scala fn from an anonymous clojure fn of the form (fn [x] body)"
   [f]
diff --git a/contrib/clojure-package/test/good-test-ndarray.clj b/contrib/clojure-package/test/good-test-ndarray.clj
index 3b53b1906006..b048a819c642 100644
--- a/contrib/clojure-package/test/good-test-ndarray.clj
+++ b/contrib/clojure-package/test/good-test-ndarray.clj
@@ -27,11 +27,12 @@
 
 (defn
  div
- ([ndarray num-or-ndarray]
+ ([ndarray ndarray-or-double-or-float]
   (util/coerce-return
    (.$div
     ndarray
     (util/coerce-param
-     num-or-ndarray
-     #{"float" "org.apache.mxnet.NDArray"})))))
+     ndarray-or-double-or-float
+     #{"org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE"
+       "org.apache.mxnet.NDArray"})))))
 
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj
new file mode 100644
index 000000000000..b459b06132b2
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj
@@ -0,0 +1,76 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.infer.imageclassifier-test
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.test :refer :all]))
+
+(def model-dir "data/")
+(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
+
+(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
+  (sh "./scripts/infer/get_resnet_18_data.sh"))
+
+(defn create-classifier []
+  (let [descriptors [{:name "data"
+                      :shape [1 3 224 224]
+                      :layout layout/NCHW
+                      :dtype dtype/FLOAT32}]
+        factory (infer/model-factory model-path-prefix descriptors)]
+    (infer/create-image-classifier factory)))
+
+(deftest test-single-classification
+  (let [classifier (create-classifier)
+        image (infer/load-image-from-file "test/test-images/kitten.jpg")
+        [predictions-all] (infer/classify-image classifier image)
+        [predictions-with-default-dtype] (infer/classify-image classifier image 10)
+        [predictions] (infer/classify-image classifier image 5 dtype/FLOAT32)]
+    (is (= 1000 (count predictions-all)))
+    (is (= 10 (count predictions-with-default-dtype)))
+    (is (some? predictions))
+    (is (= 5 (count predictions)))
+    (is (every? #(= 2 (count %)) predictions))
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(float? (second %)) predictions))
+    (is (every? #(< 0 (second %) 1) predictions))
+    (is (= ["n02123159 tiger cat"
+            "n02124075 Egyptian cat"
+            "n02123045 tabby, tabby cat"
+            "n02127052 lynx, catamount"
+            "n02128757 snow leopard, ounce, Panthera uncia"]
+           (map first predictions)))))
+
+(deftest test-batch-classification
+  (let [classifier (create-classifier)
+        image-batch (infer/load-image-paths ["test/test-images/kitten.jpg"
+                                             "test/test-images/Pug-Cookie.jpg"])
+        batch-predictions-all (infer/classify-image-batch classifier image-batch)
+        batch-predictions-with-default-dtype (infer/classify-image-batch classifier image-batch 10)
+        batch-predictions (infer/classify-image-batch classifier image-batch 5 dtype/FLOAT32)
+        predictions (first batch-predictions)]
+    (is (= 1000 (count (first batch-predictions-all))))
+    (is (= 10 (count (first batch-predictions-with-default-dtype))))
+    (is (some? batch-predictions))
+    (is (= 5 (count predictions)))
+    (is (every? #(= 2 (count %)) predictions))
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(float? (second %)) predictions))
+    (is (every? #(< 0 (second %) 1) predictions))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/objectdetector_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/objectdetector_test.clj
new file mode 100644
index 000000000000..3a0e3d30a1d9
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/objectdetector_test.clj
@@ -0,0 +1,67 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.infer.objectdetector-test
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.test :refer :all]))
+
+(def model-dir "data/")
+(def model-path-prefix (str model-dir "resnet50_ssd/resnet50_ssd_model"))
+
+(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
+  (sh "./scripts/infer/get_ssd_data.sh"))
+
+(defn create-detector []
+  (let [descriptors [{:name "data"
+                      :shape [1 3 512 512]
+                      :layout layout/NCHW
+                      :dtype dtype/FLOAT32}]
+        factory (infer/model-factory model-path-prefix descriptors)]
+    (infer/create-object-detector factory)))
+
+(deftest test-single-detection
+  (let [detector (create-detector)
+        image (infer/load-image-from-file "test/test-images/kitten.jpg")
+        [predictions-all] (infer/detect-objects detector image)
+        [predictions] (infer/detect-objects detector image 5)]
+    (is (some? predictions))
+    (is (= 5 (count predictions)))
+    (is (= 13 (count predictions-all)))
+    (is (every? #(= 2 (count %)) predictions))
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(= 5 (count (second %))) predictions))
+    (is (every? #(< 0 (first (second %)) 1) predictions))
+    (is (= "cat" (first (first predictions))))))
+
+(deftest test-batch-detection
+  (let [detector (create-detector)
+        image-batch (infer/load-image-paths ["test/test-images/kitten.jpg"
+                                             "test/test-images/Pug-Cookie.jpg"])
+        batch-predictions-all (infer/detect-objects-batch detector image-batch)
+        batch-predictions (infer/detect-objects-batch detector image-batch 5)
+        predictions (first batch-predictions)]
+    (is (some? batch-predictions))
+    (is (= 13 (count (first batch-predictions-all))))
+    (is (= 5 (count predictions)))
+    (is (every? #(= 2 (count %)) predictions))
+    (is (every? #(string? (first %)) predictions))
+    (is (every? #(= 5 (count (second %))) predictions))
+    (is (every? #(< 0 (first (second %)) 1) predictions))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj
new file mode 100644
index 000000000000..0e7532bc2258
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj
@@ -0,0 +1,59 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.infer.predictor-test
+  (:require [org.apache.clojure-mxnet.context :as context]
+            [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.infer :as infer]
+            [org.apache.clojure-mxnet.layout :as layout]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [org.apache.clojure-mxnet.shape :as shape]
+            [clojure.java.io :as io]
+            [clojure.java.shell :refer [sh]]
+            [clojure.string :refer [split]]
+            [clojure.test :refer :all]))
+
+(def model-dir "data/")
+(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
+(def width 224)
+(def height 224)
+
+(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
+  (sh "./scripts/infer/get_resnet_18_data.sh"))
+
+(defn create-predictor []
+  (let [descriptors [{:name "data"
+                      :shape [1 3 height width]
+                      :layout layout/NCHW
+                      :dtype dtype/FLOAT32}]
+        factory (infer/model-factory model-path-prefix descriptors)]
+    (infer/create-predictor factory)))
+
+(deftest predictor-test
+  (let [predictor (create-predictor)
+        image-ndarray (-> "test/test-images/kitten.jpg"
+                          infer/load-image-from-file
+                          (infer/reshape-image width height)
+                          (infer/buffered-image-to-pixels [3 width height])
+                          (ndarray/expand-dims 0))
+        [predictions] (infer/predict-with-ndarray predictor [image-ndarray])
+        synset-file (-> (io/file model-path-prefix)
+                        (.getParent)
+                        (io/file "synset.txt"))
+        synset-names (split (slurp synset-file) #"\n")
+        [best-index] (ndarray/->int-vec (ndarray/argmax predictions 1))
+        best-prediction (synset-names best-index)]
+    (is (= "n02123159 tiger cat" best-prediction))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj
index 79e94412d0df..9ffd3abed2f9 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_test.clj
@@ -97,7 +97,7 @@
     (is (= [1.0 1.0] (->vec ndhalves)))))
 
 (deftest test-full
-  (let [nda (full [1 2] 3)]
+  (let [nda (full [1 2] 3.0)]
     (is (= (shape nda) (mx-shape/->shape [1 2])))
     (is (= [3.0 3.0] (->vec nda)))))
 
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/primitives_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/primitives_test.clj
new file mode 100644
index 000000000000..1a538e537b8b
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/primitives_test.clj
@@ -0,0 +1,45 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements.  See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License.  You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.primitives-test
+  (:require [org.apache.clojure-mxnet.primitives :as primitives]
+            [clojure.test :refer :all])
+  (:import (org.apache.mxnet MX_PRIMITIVES$MX_PRIMITIVE_TYPE
+                             MX_PRIMITIVES$MX_FLOAT
+                             MX_PRIMITIVES$MX_Double)))
+
+(deftest test-primitive-types
+  (is (not (primitives/primitive? 3)))
+  (is (primitives/primitive? (primitives/mx-float 3)))
+  (is (primitives/primitive? (primitives/mx-double 3))))
+
+(deftest test-float-primitives
+  (is (instance? MX_PRIMITIVES$MX_PRIMITIVE_TYPE (primitives/mx-float 3)))
+  (is (instance? MX_PRIMITIVES$MX_FLOAT (primitives/mx-float 3)))
+  (is (instance? Float (-> (primitives/mx-float 3)
+                           (primitives/->num))))
+  (is (= 3.0 (-> (primitives/mx-float 3)
+                 (primitives/->num)))))
+
+(deftest test-double-primitives
+  (is (instance? MX_PRIMITIVES$MX_PRIMITIVE_TYPE (primitives/mx-double 2)))
+  (is (instance? MX_PRIMITIVES$MX_Double (primitives/mx-double 2)))
+  (is (instance? Double (-> (primitives/mx-double 2)
+                            (primitives/->num))))
+  (is (= 2.0 (-> (primitives/mx-double 2)
+                 (primitives/->num)))))
+
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
index ee7710317e4c..c26f83d5aa49 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj
@@ -20,6 +20,7 @@
             [org.apache.clojure-mxnet.shape :as mx-shape]
             [org.apache.clojure-mxnet.util :as util]
             [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [org.apache.clojure-mxnet.primitives :as primitives]
             [org.apache.clojure-mxnet.symbol :as sym]
             [org.apache.clojure-mxnet.test-util :as test-util]
             [clojure.spec.alpha :as s])
@@ -54,6 +55,16 @@
     (is (instance? Option x))
     (is (= 1 (.get x)))))
 
+(deftest test->int-option
+  (let [x (util/->int-option 4.5)]
+    (is (instance? Option x))
+    (is (= 4 (.get x)))))
+
+(deftest test-empty->int-option
+  (let [x (util/->int-option nil)]
+    (is (instance? Option x))
+    (is (.isEmpty x))))
+
 (deftest test-option->value
   (is (= 2 (-> (util/->option 2)
                (util/option->value)))))
@@ -123,6 +134,9 @@
   (is (= "[F"  (->> (util/coerce-param [1 2] #{"float<>"}) str (take 2) (apply str))))
   (is (= "[L"  (->> (util/coerce-param [1 2] #{"java.lang.String<>"}) str (take 2) (apply str))))
 
+  (is (primitives/primitive? (util/coerce-param 1.0 #{"org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE"})))
+  (is (primitives/primitive? (util/coerce-param (float 1.0) #{"org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE"})))
+
   (is (= 1 (util/coerce-param 1 #{"unknown"}))))
 
 (deftest test-nil-or-coerce-param
@@ -161,6 +175,12 @@
                 (util/convert-tuple [1 2]))))
   (is (= [1 2 3] (util/coerce-return
                   (util/convert-tuple [1 2 3]))))
+
+  (is (instance? Double (util/coerce-return (primitives/mx-double 3))))
+  (is (= 3.0 (util/coerce-return (primitives/mx-double 3))))
+  (is (instance? Float (util/coerce-return (primitives/mx-float 2))))
+  (is (= 2.0 (util/coerce-return (primitives/mx-float 2))))
+
   (is (= "foo" (util/coerce-return "foo"))))
 
 (deftest test-translate-keyword-shape
diff --git a/contrib/clojure-package/test/test-images/Pug-Cookie.jpg b/contrib/clojure-package/test/test-images/Pug-Cookie.jpg
new file mode 100644
index 000000000000..56f5dc16ed7a
Binary files /dev/null and b/contrib/clojure-package/test/test-images/Pug-Cookie.jpg differ
diff --git a/contrib/clojure-package/test/test-images/kitten.jpg b/contrib/clojure-package/test/test-images/kitten.jpg
new file mode 100644
index 000000000000..ffcd2be2c674
Binary files /dev/null and b/contrib/clojure-package/test/test-images/kitten.jpg differ
diff --git a/cpp-package/CMakeLists.txt b/cpp-package/CMakeLists.txt
index f7fbc77e1a5e..5d2977279d74 100644
--- a/cpp-package/CMakeLists.txt
+++ b/cpp-package/CMakeLists.txt
@@ -20,4 +20,6 @@ if(USE_CPP_PACKAGE)
     add_subdirectory(example)
   endif()
 
+  install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
 endif()
diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md
index c7223e94c920..c2329330b6be 100644
--- a/cpp-package/example/README.md
+++ b/cpp-package/example/README.md
@@ -2,7 +2,8 @@
 
 ## Building C++ examples
 
-The examples are built while building the MXNet library and cpp-package from source . However, they can be built manually as follows
+The examples in this folder demonstrate the **training** workflow. The examples related to the **inference workflow** can be found in the [inference](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference>) folder.
+The examples in this folder are built while building the MXNet library and cpp-package from source. However, they can also be built manually as follows:
 
 From cpp-package/examples directory
 
@@ -18,7 +19,7 @@ The examples that are built to be run on GPU may not work on the non-GPU machine
 The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.)
 
 
-## Examples
+## Examples demonstrating training workflow
 
 This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu.
 
diff --git a/cpp-package/example/inference/Makefile b/cpp-package/example/inference/Makefile
new file mode 100644
index 000000000000..5efe6cfb68e5
--- /dev/null
+++ b/cpp-package/example/inference/Makefile
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+CPPEX_SRC = $(wildcard *.cpp)
+CPPEX_EXE = $(patsubst %.cpp, %, $(CPPEX_SRC))
+OPENCV_CFLAGS=`pkg-config --cflags opencv`
+OPENCV_LDFLAGS=`pkg-config --libs opencv`
+
+CXX=g++
+
+
+CFLAGS=$(COMMFLAGS) -I../../../3rdparty/tvm/nnvm/include -I../../../3rdparty/dmlc-core/include -I ../../include -I ../../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas
+CPPEX_EXTRA_LDFLAGS := -L../../../lib -lmxnet $(OPENCV_LDFLAGS)
+
+all: $(CPPEX_EXE)
+
+debug: CPPEX_CFLAGS += -DDEBUG -g
+debug: all
+
+
+$(CPPEX_EXE):% : %.cpp
+	$(CXX) -std=c++0x $(CFLAGS)  $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(CPPEX_EXTRA_LDFLAGS)
+
+clean:
+	rm -f $(CPPEX_EXE)
diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md
new file mode 100644
index 000000000000..79831b40b6bd
--- /dev/null
+++ b/cpp-package/example/inference/README.md
@@ -0,0 +1,41 @@
+# MXNet C++ Package Inference Workflow Examples
+
+## Building C++ Inference examples
+
+The examples in this folder demonstrate the **inference** workflow.
+To build examples use following commands:
+
+-  Release: **make all**
+-  Debug: **make debug all**
+
+
+## Examples demonstrating inference workflow
+
+This directory contains the following examples. To run them, ensure that the path to the MXNet shared library is added to the OS-specific environment variable, viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS.
+
+### [inception_inference.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp>)
+
+This example demonstrates image classification workflow with pre-trained models using MXNet C++ API. The command line parameters the example can accept are as shown below:
+
+```
+./inception_inference --help
+Usage:
+inception_inference --symbol <model symbol file in json format>
+                    --params <model params file>
+                    --image <path to the image used for prediction>
+                    --synset <file containing labels for prediction>
+                    [--input_shape <dimensions of input image e.g "3 224 224">]
+                    [--mean <file containing mean image for normalizing the input image>]
+                    [--gpu] Specify this option if workflow needs to be run in gpu context
+```
+The model JSON file, the params file, and the synset file are required to run this example. A sample command line is as follows:
+
+```
+
+./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg"
+```
+Alternatively, the script [unit_test_inception_inference.sh](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/unit_test_inception_inference.sh>) downloads the pre-trained **Inception** model and a test image. Users can invoke this script as follows:
+
+```
+./unit_test_inception_inference.sh
+```
diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp
new file mode 100644
index 000000000000..7005e745b2f4
--- /dev/null
+++ b/cpp-package/example/inference/inception_inference.cpp
@@ -0,0 +1,446 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * This example demonstrates image classification workflow with pre-trained models using MXNet C++ API.
+ * The example performs following tasks.
+ * 1. Load the pre-trained model.
+ * 2. Load the parameters of pre-trained model.
+ * 3. Load the image to be classified into an NDArray.
+ * 4. Normalize the image using the mean of images that were used for training.
+ * 5. Run the forward pass and predict the input image.
+ */
+
+#include <sys/stat.h>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+#include <opencv2/opencv.hpp>
+
+using namespace mxnet::cpp;
+
+static mx_float DEFAULT_MEAN_R = 123.675;
+static mx_float DEFAULT_MEAN_G = 116.28;
+static mx_float DEFAULT_MEAN_B = 103.53;
+/*
+ * class Predictor
+ *
+ * This class encapsulates the functionality to load the model, process input image and run the forward pass.
+ */
+
+class Predictor {
+ public:
+    Predictor() {}  // loads nothing; executor stays null (see member initializer below)
+    Predictor(const std::string& model_json_file,
+              const std::string& model_params_file,
+              const Shape& input_shape,
+              bool gpu_context_type = false,
+              const std::string& synset_file = "",
+              const std::string& mean_image_file = "");
+    void PredictImage(const std::string& image_file);
+    ~Predictor();
+
+ private:
+    void LoadModel(const std::string& model_json_file);
+    void LoadParameters(const std::string& model_parameters_file);
+    void LoadSynset(const std::string& synset_file);
+    NDArray LoadInputImage(const std::string& image_file);
+    void LoadMeanImageData();
+    void LoadDefaultMeanImageData();
+    void NormalizeInput(const std::string& mean_image_file);
+    inline bool FileExists(const std::string& name) {
+        struct stat buffer;
+        return (stat(name.c_str(), &buffer) == 0);  // stat() returns 0 when the path exists
+    }
+    NDArray mean_img;
+    std::map<std::string, NDArray> args_map;  // "arg:" entries of the params file plus "data"
+    std::map<std::string, NDArray> aux_map;   // "aux:" entries of the params file
+    std::vector<std::string> output_labels;
+    Symbol net;
+    Executor *executor = nullptr;  // stays null unless the full constructor binds one
+    Shape input_shape;
+    NDArray mean_image_data;
+    NDArray std_dev_image_data;
+    Context global_ctx = Context::cpu();  // replaced by Context::gpu() when requested
+    std::string mean_image_file;
+};
+
+
+/*
+ * The constructor takes following parameters as input:
+ * 1. model_json_file:  The model in json formatted file.
+ * 2. model_params_file: File containing model parameters
+ * 3. input_shape: Shape of input data to the model. Since this class will be running one inference at a time,
+ *                 the input shape is required to be in format Shape(1, number_of_channels, height, width)
+ * 4. gpu_context_type: When true the GPU context is used, otherwise the CPU context.
+ * 5. synset_file: File containing the list of image labels
+ * 6. mean_image_file: File containing the mean image; default R, G and B means are used when empty.
+ * The input image will be resized to (height x width) size before running the inference.
+ * The constructor will:
+ *  1. Load the model and parameter files, the synset file and the mean image data.
+ *  2. Invoke the SimpleBind to bind the input argument to the model and create an executor.
+ *  The SimpleBind is expected to be invoked only once.
+ */
+Predictor::Predictor(const std::string& model_json_file,
+                     const std::string& model_params_file,
+                     const Shape& input_shape,
+                     bool gpu_context_type,
+                     const std::string& synset_file,
+                     const std::string& mean_image_file):
+                     input_shape(input_shape),
+                     mean_image_file(mean_image_file) {
+  if (gpu_context_type) {
+    global_ctx = Context::gpu();
+  }
+  // Load the model
+  LoadModel(model_json_file);
+
+  // Load the model parameters.
+  LoadParameters(model_params_file);
+
+  /*
+   * The data will be used to output the exact label that matches highest output of the model.
+   */
+  LoadSynset(synset_file);
+
+  /*
+   * Load the mean image data if specified.
+   */
+  if (!mean_image_file.empty()) {
+    LoadMeanImageData();
+  } else {
+    LG << "Mean image file for normalizing the input is not provide."
+       << " We will use the default mean values for R,G and B channels.";
+    LoadDefaultMeanImageData();
+  }
+
+  // Create an executor after binding the model to input parameters.
+  args_map["data"] = NDArray(input_shape, global_ctx, false);
+  executor = net.SimpleBind(global_ctx, args_map, std::map<std::string, NDArray>(),
+                              std::map<std::string, OpReqType>(), aux_map);
+}
+
+/*
+ * The following function loads the model from json file.
+ */
+void Predictor::LoadModel(const std::string& model_json_file) {
+  if (!FileExists(model_json_file)) {
+    LG << "Model file " << model_json_file << " does not exist";
+    throw std::runtime_error("Model file does not exist");
+  }
+  LG << "Loading the model from " << model_json_file << std::endl;
+  net = Symbol::Load(model_json_file);
+}
+
+
+/*
+ * The following function loads the model parameters.
+ */
+void Predictor::LoadParameters(const std::string& model_parameters_file) {
+  if (!FileExists(model_parameters_file)) {
+    LG << "Parameter file " << model_parameters_file << " does not exist";
+    throw std::runtime_error("Model parameters does not exist");
+  }
+  LG << "Loading the model parameters from " << model_parameters_file << std::endl;
+  std::map<std::string, NDArray> parameters;
+  NDArray::Load(model_parameters_file, 0, &parameters);
+  for (const auto &k : parameters) {
+    if (k.first.substr(0, 4) == "aux:") {  // auxiliary state: stored without the "aux:" prefix
+      auto name = k.first.substr(4, k.first.size() - 4);
+      aux_map[name] = k.second.Copy(global_ctx);
+    }
+    if (k.first.substr(0, 4) == "arg:") {  // argument: stored without the "arg:" prefix
+      auto name = k.first.substr(4, k.first.size() - 4);
+      args_map[name] = k.second.Copy(global_ctx);
+    }
+  }
+  /*WaitAll is needed when we copy data between GPU and the main memory*/
+  NDArray::WaitAll();
+}
+
+
+/*
+ * The following function loads the synset file.
+ * This information will be used later to report the label of input image.
+ */
+void Predictor::LoadSynset(const std::string& synset_file) {
+  if (!FileExists(synset_file)) {
+    LG << "Synset file " << synset_file << " does not exist";
+    throw std::runtime_error("Synset file does not exist");
+  }
+  LG << "Loading the synset file.";
+  std::ifstream fi(synset_file.c_str());
+  if (!fi.is_open()) {
+    std::cerr << "Error opening synset file " << synset_file << std::endl;
+    throw std::runtime_error("Error in opening the synset file.");
+  }
+  std::string synset, lemma;
+  while (fi >> synset) {
+    getline(fi, lemma);
+    output_labels.push_back(lemma);
+  }
+  fi.close();
+}
+
+
+/*
+ * Loads the "mean_img" array from mean_image_file into mean_image_data.
+ * The mean is subtracted from the input image before the forward pass, so
+ * mean_image_data is allocated with input_shape on global_ctx and
+ * input_shape.Size() values are copied into it.
+ */
+void Predictor::LoadMeanImageData() {
+  LG << "Load the mean image data that will be used to normalize "
+     << "the image before running forward pass.";
+  mean_image_data = NDArray(input_shape, global_ctx, false);
+  // Keep the loaded arrays in a named local for the duration of the copy.
+  auto mean_arrays = NDArray::LoadToMap(mean_image_file);
+  mean_image_data.SyncCopyFromCPU(mean_arrays["mean_img"].GetData(), input_shape.Size());
+  NDArray::WaitAll();
+}
+
+
+/*
+ * Fills mean_image_data (same shape as the input) with the default per-channel
+ * means DEFAULT_MEAN_R, DEFAULT_MEAN_G and DEFAULT_MEAN_B, one constant plane per
+ * channel. Throws std::runtime_error when the input has more than 3 channels.
+ */
+void Predictor::LoadDefaultMeanImageData() {
+  LG << "Loading the default mean image data";
+  const int channels = input_shape[1];
+  const int height = input_shape[2];
+  const int width = input_shape[3];
+  const std::vector<mx_float> default_means = {DEFAULT_MEAN_R, DEFAULT_MEAN_G, DEFAULT_MEAN_B};
+  /* Only three defaults exist; a larger channel count would index past the end below. */
+  if (channels > static_cast<int>(default_means.size())) {
+    throw std::runtime_error("Default mean values are available for at most 3 channels");
+  }
+  std::vector<float> array;
+  for (int c = 0; c < channels; ++c) {
+    for (int i = 0; i < height; ++i) {
+      for (int j = 0; j < width; ++j) {
+        array.push_back(default_means[c]);
+      }
+    }
+  }
+  mean_image_data = NDArray(input_shape, global_ctx, false);
+  mean_image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+  NDArray::WaitAll();
+}
+
+
+/* Loads the image, resizes it to the input shape and repacks it channel by channel.
+ * NOTE(review): cv::imread decodes to BGR order while the default means are named
+ * R, G, B; confirm which channel order the model expects. */
+NDArray Predictor::LoadInputImage(const std::string& image_file) {
+  if (!FileExists(image_file)) {
+    LG << "Image file " << image_file << " does not exist";
+    throw std::runtime_error("Image file does not exist");
+  }
+  LG << "Loading the image " << image_file << std::endl;
+  std::vector<float> array;
+  cv::Mat mat = cv::imread(image_file);
+  if (mat.empty()) throw std::runtime_error("Unable to decode the image file");
+  int height = input_shape[2];
+  int width = input_shape[3];
+  int channels = input_shape[1];
+  cv::resize(mat, mat, cv::Size(width, height));  // cv::Size takes (width, height)
+  for (int c = 0; c < channels; ++c) {
+    for (int i = 0; i < height; ++i) {
+      for (int j = 0; j < width; ++j) {  // each row holds `width` pixels of 3 bytes
+        array.push_back(static_cast<float>(mat.data[(i * width + j) * 3 + c]));
+      }
+    }
+  }
+  NDArray image_data = NDArray(input_shape, global_ctx, false);
+  image_data.SyncCopyFromCPU(array.data(), input_shape.Size());
+  NDArray::WaitAll();
+  return image_data;
+}
+
+
+/*
+ * Loads image_file, subtracts the mean image, runs the forward pass on the
+ * executor (created elsewhere in this class) and logs the best scoring output
+ * index together with its label when one is available.
+ */
+void Predictor::PredictImage(const std::string& image_file) {
+  // Load the input image
+  NDArray image_data = LoadInputImage(image_file);
+
+  // Normalize the image
+  image_data.Slice(0, 1) -= mean_image_data;
+
+  LG << "Running the forward pass on model to predict the image";
+  /*
+   * executor->arg_dict() maps argument names to the arrays fed to the model.
+   *
+   * The normalized image is copied into the entry stored under the key "data",
+   * and WaitAll() is called before the forward pass is started so that the
+   * copy has completed.
+   */
+  image_data.CopyTo(&(executor->arg_dict()["data"]));
+  NDArray::WaitAll();
+
+  // Run the forward pass.
+  executor->Forward(false);
+
+  // The output is available in executor->outputs.
+  auto array = executor->outputs[0].Copy(global_ctx);
+  NDArray::WaitAll();
+
+  /*
+   * Find the index of the highest value in the output and the value itself.
+   * This is done by using the argmax operator on NDArray.
+   */
+  auto predicted = array.ArgmaxChannel();
+  NDArray::WaitAll();
+
+  int best_idx = predicted.At(0, 0);
+  float best_accuracy = array.At(0, best_idx);
+
+  // Report the bare index when there is no label for it (this also covers an empty label list).
+  if (best_idx < 0 || static_cast<size_t>(best_idx) >= output_labels.size()) {
+    LG << "The model predicts the highest accuracy of " << best_accuracy << " at index "
+       << best_idx;
+  } else {
+    LG << "The model predicts the input image to be a [" << output_labels[best_idx]
+       << " ] with Accuracy = " << best_accuracy << std::endl;
+  }
+}
+
+
+Predictor::~Predictor() {
+  /* delete on a null pointer is a no-op, so the executor needs no explicit check;
+   * MXNet is notified of the shutdown once the executor has been released. */
+  delete executor;
+  MXNotifyShutdown();
+}
+
+
+/* Parse whitespace separated integers (e.g. "3 224 224") into a vector of dimensions. Stops at
+ * the first position where strtol consumes no digits (end of input or junk); "" yields {}. */
+std::vector<index_t> getShapeDimensions(const std::string& hidden_units_string) {
+    std::vector<index_t> dimensions;
+    const char *p_cur = hidden_units_string.c_str();
+    while (true) {
+        char *p_next = nullptr;
+        const int num_unit = strtol(p_cur, &p_next, 10);
+        if (p_next == p_cur) break;  // no progress: leave instead of looping forever
+        dimensions.push_back(num_unit);
+        p_cur = p_next;
+    }
+    return dimensions;
+}
+
+void printUsage() {
+    static const char* const usage_lines[] = {"Usage:",
+        "inception_inference --symbol <model symbol file in json format>  ",
+        "--params <model params file> ",
+        "--image <path to the image used for prediction> ",
+        "--synset <file containing labels for prediction> ",
+        "[--input_shape <dimensions of input image e.g \"3 224 224\">] ",
+        "[--mean <file containing mean image for normalizing the input image>] ",
+        "[--gpu  <Specify this option if workflow needs to be run in gpu context>]"
+    };
+    for (const char* line : usage_lines) std::cout << line << std::endl;  // one line per entry
+}
+
+int main(int argc, char** argv) {
+  std::string model_file_json;
+  std::string model_file_params;
+  std::string synset_file = "";
+  std::string mean_image = "";
+  std::string input_image = "";
+  bool gpu_context_type = false;
+  std::string input_shape = "3 224 224";
+
+  int index = 1;
+  while (index < argc) {
+    if (strcmp("--symbol", argv[index]) == 0) {
+      index++;
+      model_file_json = (index < argc ? argv[index]:"");
+    } else if (strcmp("--params", argv[index]) == 0) {
+      index++;
+      model_file_params = (index < argc ? argv[index]:"");
+    } else if (strcmp("--synset", argv[index]) == 0) {
+      index++;
+      synset_file = (index < argc ? argv[index]:"");
+    } else if (strcmp("--mean", argv[index]) == 0) {
+      index++;
+      mean_image = (index < argc ? argv[index]:"");
+    } else if (strcmp("--image", argv[index]) == 0) {
+      index++;
+      input_image = (index < argc ? argv[index]:"");
+    } else if (strcmp("--input_shape", argv[index]) == 0) {
+      index++;
+      input_shape = (index < argc ? argv[index]:input_shape);
+    } else if (strcmp("--gpu", argv[index]) == 0) {
+      gpu_context_type = true;
+    } else if (strcmp("--help", argv[index]) == 0) {
+      printUsage();
+      return 0;
+    }
+    index++;
+  }
+
+  if (model_file_json.empty() || model_file_params.empty() || synset_file.empty()) {
+    LG << "ERROR: Model details such as symbol, param and/or synset files are not specified";
+    printUsage();
+    return 1;
+  }
+
+  if (input_image.empty()) {
+    LG << "ERROR: Path to the input image is not specified.";
+    printUsage();
+    return 1;
+  }
+
+  std::vector<index_t> input_dimensions = getShapeDimensions(input_shape);
+  if (input_dimensions.size() != 3) {
+    LG << "ERROR: --input_shape needs exactly 3 dimensions: <channels> <height> <width>";
+    return 1;
+  }
+
+  /* A single image is predicted, so prepend a batch size of 1 to obtain
+   * {no. of images, channels, height, width}. */
+  input_dimensions.insert(input_dimensions.begin(), 1);
+  Shape input_data_shape(input_dimensions);
+
+  try {
+    // Initialize the predictor object
+    Predictor predict(model_file_json, model_file_params, input_data_shape, gpu_context_type,
+                      synset_file, mean_image);
+
+    // Run the forward pass to predict the image.
+    predict.PredictImage(input_image);
+  } catch (std::runtime_error &error) {
+    LG << "Execution failed with ERROR: " << error.what();
+    return 1;  // report the failure to the caller instead of exiting with success
+  } catch (...) {
+    /* If underlying MXNet code has thrown an exception, the error message is
+     * accessible through the MXGetLastError() function. */
+    LG << "Execution failed with following MXNet error";
+    LG << MXGetLastError();
+    return 1;
+  }
+  return 0;
+}
diff --git a/cpp-package/example/inference/unit_test_inception_inference.sh b/cpp-package/example/inference/unit_test_inception_inference.sh
new file mode 100755
index 000000000000..4f40b496bbd3
--- /dev/null
+++ b/cpp-package/example/inference/unit_test_inception_inference.sh
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Downloading the data and model
+mkdir -p model
+wget -nc http://data.dmlc.ml/mxnet/models/imagenet/inception-bn.tar.gz
+wget -nc -O model/dog.jpg /~https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/dog.jpg?raw=true
+wget -nc -O model/mean_224.nd /~https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd
+tar -xvzf inception-bn.tar.gz -C model
+
+# Building
+make all
+
+
+# Running the example with dog image.
+if [ "$(uname)" == "Darwin" ]; then
+    DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg" 2&> inception_inference.log
+else
+    LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./inception_inference --symbol "./model/Inception-BN-symbol.json" --params "./model/Inception-BN-0126.params" --synset "./model/synset.txt" --mean "./model/mean_224.nd" --image "./model/dog.jpg" 2&> inception_inference.log
+fi
+result=`grep -c "pug-dog" inception_inference.log`
+if [ $result == 1 ];
+then
+    echo "PASS: inception_inference correctly identified the image."
+    exit 0
+else
+    echo "FAIL: inception_inference FAILED to identify the image."
+    exit 1
+fi
diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py
index c26c3709cb07..ca430ec99e6e 100644
--- a/cpp-package/scripts/OpWrapperGenerator.py
+++ b/cpp-package/scripts/OpWrapperGenerator.py
@@ -223,14 +223,14 @@ def GetOpDefinitionString(self, use_name, indent=0):
             if arg.isEnum and use_name:
                 # comments
                 ret = ret + self.GenDescription(arg.description, \
-                                        '/*! \\breif ', \
+                                        '/*! \\brief ', \
                                         ' *        ')
                 ret = ret + " */\n"
                 # definition
                 ret = ret + arg.enum.GetDefinitionString(indent) + '\n'
         # create function comments
         ret = ret + self.GenDescription(self.description, \
-                                        '/*!\n * \\breif ', \
+                                        '/*!\n * \\brief ', \
                                         ' *        ')
         for arg in self.args:
             if arg.name != 'symbol_name' or use_name:
diff --git a/dev_menu.py b/dev_menu.py
index 0fd78cb222e3..7329b446ed34 100755
--- a/dev_menu.py
+++ b/dev_menu.py
@@ -20,6 +20,7 @@
 # -*- coding: utf-8 -*-
 """Tool to ease working with the build system and reproducing test results"""
 
+import argparse
 import os
 import sys
 from subprocess import check_call
@@ -29,6 +30,11 @@
 from collections import OrderedDict
 import logging
 import yaml
+import shutil
+
+DEFAULT_PYENV=os.environ.get('DEFAULT_PYENV','py3_venv')  # virtualenv directory, overridable via the environment
+DEFAULT_PYTHON=os.environ.get('DEFAULT_PYTHON','python3')  # interpreter used for the virtualenv, overridable too
+DEFAULT_CMAKE_OPTIONS=os.environ.get('DEFAULT_CMAKE_OPTIONS','cmake_options.yml')  # local CMake options file
 
 class Confirm(object):
     def __init__(self, cmds):
@@ -46,7 +52,7 @@ def __call__(self):
                 resp = input("Please answer yes or no: ")
 
 class CMake(object):
-    def __init__(self, cmake_options_yaml='cmake_options.yml', cmake_options_yaml_default='cmake/cmake_options.yml'):
+    def __init__(self, cmake_options_yaml=DEFAULT_CMAKE_OPTIONS, cmake_options_yaml_default='cmake/cmake_options.yml'):
         if os.path.exists(cmake_options_yaml):
             self.cmake_options_yaml = cmake_options_yaml
         else:
@@ -87,10 +93,34 @@ def __call__(self, build_dir='build', generator='Ninja', build_cmd='ninja'):
             logging.info('Now building')
             check_call(shlex.split(build_cmd))
 
-
+def create_virtualenv(venv_exe, pyexe, venv) -> None:
+    logging.info("Creating virtualenv in %s with python %s", venv, pyexe)
+    if not (venv_exe and pyexe and venv):
+        logging.warn("Skipping creation of virtualenv")
+        return
+    check_call([venv_exe, '-p', pyexe, venv])
+    activate_this_py = os.path.join(venv, 'bin', 'activate_this.py')
+    # Activate virtualenv in this interpreter
+    exec(open(activate_this_py).read(), dict(__file__=activate_this_py))
+    check_call(['pip', 'install', '--upgrade','--force-reinstall', '-e', 'python'])
+    check_call(['pip', 'install', '-r', 'tests/requirements.txt'])
+
+def create_virtualenv_default():
+    create_virtualenv('virtualenv', DEFAULT_PYTHON, DEFAULT_PYENV)
+    logging.info("You can use the virtualenv by executing 'source %s/bin/activate'", DEFAULT_PYENV)
 
 COMMANDS = OrderedDict([
-    ('[Docker] sanity_check',
+    ('[Local] BUILD CMake/Ninja (using cmake_options.yaml (cp cmake/cmake_options.yml .) and edit) ({} virtualenv in "{}")'.format(DEFAULT_PYTHON, DEFAULT_PYENV),
+    [
+        CMake(),
+        create_virtualenv_default,
+    ]),
+    ('[Local] Python Unit tests',
+        "./py3_venv/bin/nosetests -v tests/python/unittest/"
+    ),
+    ('[Website and docs build] Will build to docs/_build/html/',
+        "ci/docker/runtime_functions.sh deploy_docs"),
+    ('[Docker] sanity_check. Check for linting and code formatting.',
         "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh sanity_check"),
     ('[Docker] Python3 CPU unittests',
     [
@@ -117,8 +147,6 @@ def __call__(self, build_dir='build', generator='Ninja', build_cmd='ninja'):
         "ci/build.py -p armv7",
         "ci/build.py -p test.arm_qemu ./runtime_functions.py run_ut_py3_qemu"
     ]),
-    ('[Local] CMake build (using cmake/cmake_options.yaml)',
-        CMake()),
     ('Clean (RESET HARD) repository (Warning! erases local changes / DATA LOSS)',
        Confirm("ci/docker/runtime_functions.sh clean_repo"))
 ])
@@ -128,6 +156,7 @@ def clip(x, mini, maxi):
 
 @retry((ValueError, RuntimeError), 3, delay_s = 0)
 def show_menu(items: List[str], header=None) -> int:
+    print('\n-- MXNet dev menu --\n')
     def hr():
         print(''.join(['-']*30))
     if header:
@@ -156,13 +185,55 @@ def handle_command(cmd):
     else:
         raise RuntimeError("handle_commands(cmds): argument should be str or List[str] but is %s", type(cmds))
 
-def main():
-    logging.getLogger().setLevel(logging.INFO)
+def use_menu_ui(args) -> None:
     command_list = list(COMMANDS.keys())
-    choice = show_menu(command_list, 'Available actions')
+    if hasattr(args, 'choice') and args.choice and args.choice[0].isdigit():
+        choice = int(args.choice[0]) - 1
+    else:
+        choice = show_menu(command_list, 'Available actions')
     handle_commands(COMMANDS[command_list[choice]])
+
+def build(args) -> None:
+    """Build using CMake"""
+    venv_exe = shutil.which('virtualenv')
+    pyexe = shutil.which(args.pyexe)
+    if not venv_exe:
+        logging.warn("virtualenv wasn't found in path, it's recommended to install virtualenv to manage python environments")
+    if not pyexe:
+        logging.warn("Python executable %s not found in path", args.pyexe)
+    if args.cmake_options:
+        cmake = CMake(args.cmake_options)
+    else:
+        cmake = CMake()
+    cmake()
+    create_virtualenv(venv_exe, pyexe, args.venv)
+
+def main():
+    logging.getLogger().setLevel(logging.INFO)
+    parser = argparse.ArgumentParser(description="""Utility for compiling and testing MXNet easily""")
+    parser.set_defaults(command='use_menu_ui')
+
+    subparsers = parser.add_subparsers(help='sub-command help')
+    build_parser = subparsers.add_parser('build', help='build with the specified flags from file')
+    build_parser.add_argument('cmake_options', nargs='?',
+        help='File containing CMake options in YAML')
+    build_parser.add_argument('-v', '--venv',
+        type=str,
+        default=DEFAULT_PYENV,
+        help='virtualenv dir')
+    build_parser.add_argument('-p', '--pyexe',
+        type=str,
+        default=DEFAULT_PYTHON,
+        help='python executable')
+    build_parser.set_defaults(command='build')
+
+    menu_parser = subparsers.add_parser('menu', help='jump to menu option #')
+    menu_parser.set_defaults(command='use_menu_ui')
+    menu_parser.add_argument('choice', nargs=1)
+
+    args = parser.parse_args()
+    globals()[args.command](args)
     return 0
 
 if __name__ == '__main__':
     sys.exit(main())
-
diff --git a/docs/Doxyfile b/docs/Doxyfile
index cee3942bcc7c..bf6344d4f74a 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -770,7 +770,7 @@ WARN_LOGFILE           =
 # spaces.
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = include src/common
+INPUT                  = include src/common cpp-package/include/mxnet-cpp
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -805,7 +805,7 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                =
+EXCLUDE                = 3rdparty
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
diff --git a/docs/Jenkinsfile b/docs/Jenkinsfile
index b65bfad4247b..676204291893 100644
--- a/docs/Jenkinsfile
+++ b/docs/Jenkinsfile
@@ -21,7 +21,7 @@
 // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
 
 // timeout in minutes
-max_time = 120
+max_time = 180
 
 node('restricted-utility') {
   // Loading the utilities requires a node context unfortunately
diff --git a/docs/README.md b/docs/README.md
index c21836edd821..80463cc68d54 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -17,9 +17,13 @@ git clone --recursive /~https://github.com/apache/incubator-mxnet.git mxnet
 cd mxnet/docs/build_version_doc
 ./setup_docs_ubuntu.sh
 cd ../../
-make docs USE_OPENMP=1
+make docs USE_OPENMP=1 SPHINXOPTS=-W
 ```
 
+OpenMP speeds things up and will work on Ubuntu if you used the `setup_docs_ubuntu.sh` script.
+The `-W` Sphinx option enforces "warnings as errors". This will help you debug your builds and get them through CI.
+**CI will not let a PR through if it breaks the website.** Refer to the [MXNet Developer wiki's documentation guide](https://cwiki.apache.org/confluence/display/MXNET/Documentation+Guide) for troubleshooting tips.
+
 For more information on each API's documentation dependencies, how to serve the docs, or how to build the full website with each legacy MXNet version, refer to the following links:
 
 * [Dependencies](/~https://github.com/apache/incubator-mxnet/tree/master/docs/build_version_doc#dependencies) - required before you build the docs
diff --git a/docs/_static/js/auto_module_index.js b/docs/_static/js/auto_module_index.js
index 8df9a20ef8ec..8527a71edd95 100644
--- a/docs/_static/js/auto_module_index.js
+++ b/docs/_static/js/auto_module_index.js
@@ -1,3 +1,23 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Customizations to the Sphinx auto module plugin output */
 function auto_index(module) {
   $(document).ready(function () {
     // find all classes or functions
diff --git a/docs/_static/js/clipboard.js b/docs/_static/js/clipboard.js
new file mode 100644
index 000000000000..75b6af35323d
--- /dev/null
+++ b/docs/_static/js/clipboard.js
@@ -0,0 +1,778 @@
+/*!
+ * clipboard.js v1.6.1
+ * https://zenorocha.github.io/clipboard.js
+ *
+ * Licensed MIT © Zeno Rocha
+ */
+(function(f){if(typeof exports==="object"&&typeof module!=="undefined"){module.exports=f()}else if(typeof define==="function"&&define.amd){define([],f)}else{var g;if(typeof window!=="undefined"){g=window}else if(typeof global!=="undefined"){g=global}else if(typeof self!=="undefined"){g=self}else{g=this}g.Clipboard = f()}})(function(){var define,module,exports;return (function e(t,n,r){function s(o,u){if(!n[o]){if(!t[o]){var a=typeof require=="function"&&require;if(!u&&a)return a(o,!0);if(i)return i(o,!0);var f=new Error("Cannot find module '"+o+"'");throw f.code="MODULE_NOT_FOUND",f}var l=n[o]={exports:{}};t[o][0].call(l.exports,function(e){var n=t[o][1][e];return s(n?n:e)},l,l.exports,e,t,n,r)}return n[o].exports}var i=typeof require=="function"&&require;for(var o=0;o<r.length;o++)s(r[o]);return s})({1:[function(require,module,exports){
+var DOCUMENT_NODE_TYPE = 9;
+
+/**
+ * Polyfill for Element.matches(): when it is missing, alias a vendor-prefixed variant.
+ */
+if (typeof Element !== 'undefined' && !Element.prototype.matches) {
+    var proto = Element.prototype;
+    // The first prefixed implementation that exists wins.
+    proto.matches = proto.matchesSelector ||
+                    proto.mozMatchesSelector ||
+                    proto.msMatchesSelector ||
+                    proto.oMatchesSelector ||
+                    proto.webkitMatchesSelector;
+}
+
+/**
+ * Walks up from `element` (inclusive) and returns the first node matching `selector`.
+ *
+ * @param {Element} element - node to start from; the walk stops at the document node
+ * @param {String} selector - selector handed to Element.matches()
+ * @return {Element|undefined} the matching node, or undefined when none is found
+ */
+function closest (element, selector) {
+    while (element && element.nodeType !== DOCUMENT_NODE_TYPE) {
+        if (element.matches(selector)) return element;
+        element = element.parentNode;
+    }
+}
+
+module.exports = closest;
+
+},{}],2:[function(require,module,exports){
+var closest = require('./closest');
+
+/**
+ * Delegates event to a selector: one listener on `element` serves all matching descendants.
+ *
+ * @param {Element} element - node the real listener is attached to
+ * @param {String} selector - selector that the event target or one of its parents must match
+ * @param {String} type - event type passed to addEventListener
+ * @param {Function} callback - invoked with `element` as `this` when a match is found
+ * @param {Boolean} useCapture - forwarded to addEventListener/removeEventListener
+ * @return {Object} handle whose destroy() removes the listener again
+ */
+function delegate(element, selector, type, callback, useCapture) {
+    var listenerFn = listener.apply(this, arguments);
+    // Keep the wrapper so that destroy() can remove exactly this listener.
+    element.addEventListener(type, listenerFn, useCapture);
+
+    return {
+        destroy: function() {
+            element.removeEventListener(type, listenerFn, useCapture);
+        }
+    }
+}
+
+/**
+ * Builds the event handler that finds the closest match and invokes callback.
+ *
+ * @param {Element} element - used as `this` for the callback
+ * @param {String} selector - selector looked up from the event target upwards
+ * @param {String} type - unused here; accepted because delegate() forwards its arguments
+ * @param {Function} callback - called with the event when a match exists
+ * @return {Function} handler that sets e.delegateTarget before calling the callback
+ */
+function listener(element, selector, type, callback) {
+    return function(e) {
+        e.delegateTarget = closest(e.target, selector);
+        // Events whose target has no matching ancestor are ignored.
+        if (e.delegateTarget) {
+            callback.call(element, e);
+        }
+    }
+}
+
+module.exports = delegate;
+
+},{"./closest":1}],3:[function(require,module,exports){
+/**
+ * Check if argument is a HTML element.
+ *
+ * @param {Object} value
+ * @return {Boolean} true for an HTMLElement instance whose nodeType is 1
+ */
+exports.node = function(value) {
+    return value !== undefined
+        && value instanceof HTMLElement
+        && value.nodeType === 1;
+};
+
+/**
+ * Check if argument is a list of HTML elements (only the first entry is inspected).
+ *
+ * @param {Object} value
+ * @return {Boolean} true for an empty or element-holding NodeList/HTMLCollection
+ */
+exports.nodeList = function(value) {
+    var type = Object.prototype.toString.call(value);
+
+    return value !== undefined
+        && (type === '[object NodeList]' || type === '[object HTMLCollection]')
+        && ('length' in value)
+        && (value.length === 0 || exports.node(value[0]));
+};
+
+/**
+ * Check if argument is a string.
+ *
+ * @param {Object} value
+ * @return {Boolean} true for string primitives and String objects
+ */
+exports.string = function(value) {
+    return typeof value === 'string'
+        || value instanceof String;
+};
+
+/**
+ * Check if argument is a function.
+ *
+ * @param {Object} value
+ * @return {Boolean} true when Object.prototype.toString reports '[object Function]'
+ */
+exports.fn = function(value) {
+    var type = Object.prototype.toString.call(value);
+
+    return type === '[object Function]';
+};
+
+},{}],4:[function(require,module,exports){
+var is = require('./is');
+var delegate = require('delegate');
+
+/**
+ * Validates all params and calls the right
+ * listener function based on its target type.
+ * NOTE(review): the first guard only throws when all three arguments are falsy.
+ * @param {String|HTMLElement|HTMLCollection|NodeList} target
+ * @param {String} type
+ * @param {Function} callback
+ * @return {Object} handle returned by the chosen listen* helper
+ */
+function listen(target, type, callback) {
+    if (!target && !type && !callback) {
+        throw new Error('Missing required arguments');
+    }
+
+    if (!is.string(type)) {
+        throw new TypeError('Second argument must be a String');
+    }
+
+    if (!is.fn(callback)) {
+        throw new TypeError('Third argument must be a Function');
+    }
+    // Dispatch on the kind of target: single node, node list, or selector string.
+    if (is.node(target)) {
+        return listenNode(target, type, callback);
+    }
+    else if (is.nodeList(target)) {
+        return listenNodeList(target, type, callback);
+    }
+    else if (is.string(target)) {
+        return listenSelector(target, type, callback);
+    }
+    else {
+        throw new TypeError('First argument must be a String, HTMLElement, HTMLCollection, or NodeList');
+    }
+}
+
+/**
+ * Adds an event listener to a HTML element
+ * and returns an object that can remove it again.
+ *
+ * @param {HTMLElement} node
+ * @param {String} type
+ * @param {Function} callback
+ * @return {Object} handle whose destroy() removes the listener from `node`
+ */
+function listenNode(node, type, callback) {
+    node.addEventListener(type, callback);
+
+    return {
+        destroy: function() {
+            node.removeEventListener(type, callback);
+        }
+    }
+}
+
+/**
+ * Add an event listener to a list of HTML elements
+ * and returns an object that can remove them again.
+ *
+ * @param {NodeList|HTMLCollection} nodeList
+ * @param {String} type
+ * @param {Function} callback
+ * @return {Object} handle whose destroy() removes the listener from every node in the list
+ */
+function listenNodeList(nodeList, type, callback) {
+    Array.prototype.forEach.call(nodeList, function(node) {
+        node.addEventListener(type, callback);
+    });
+
+    return {
+        destroy: function() {
+            Array.prototype.forEach.call(nodeList, function(node) {
+                node.removeEventListener(type, callback);
+            });
+        }
+    }
+}
+
+/**
+ * Add an event listener to a selector by delegating from document.body
+ * and returns the handle produced by delegate().
+ *
+ * @param {String} selector
+ * @param {String} type
+ * @param {Function} callback
+ * @return {Object} handle whose destroy() removes the delegated listener
+ */
+function listenSelector(selector, type, callback) {
+    return delegate(document.body, selector, type, callback);
+}
+
+module.exports = listen;
+
+},{"./is":3,"delegate":2}],5:[function(require,module,exports){
+function select(element) {
+    var selectedText;
+    // Select the content of `element` according to its node type and return the selected text.
+    if (element.nodeName === 'SELECT') {
+        element.focus();
+
+        selectedText = element.value;
+    }
+    else if (element.nodeName === 'INPUT' || element.nodeName === 'TEXTAREA') {
+        var isReadOnly = element.hasAttribute('readonly');
+        // Mark the field read-only while selecting; the attribute is removed again below.
+        if (!isReadOnly) {
+            element.setAttribute('readonly', '');
+        }
+
+        element.select();
+        element.setSelectionRange(0, element.value.length);
+
+        if (!isReadOnly) {
+            element.removeAttribute('readonly');
+        }
+
+        selectedText = element.value;
+    }
+    else {
+        if (element.hasAttribute('contenteditable')) {
+            element.focus();
+        }
+        // Any other element: replace the window selection with a range over its contents.
+        var selection = window.getSelection();
+        var range = document.createRange();
+
+        range.selectNodeContents(element);
+        selection.removeAllRanges();
+        selection.addRange(range);
+
+        selectedText = selection.toString();
+    }
+
+    return selectedText;
+}
+
+module.exports = select;
+
+},{}],6:[function(require,module,exports){
+function E () {
+  // Keep this empty so it's easier to inherit from
+  // (via https://github.com/lipsmack from https://github.com/scottcorgan/tiny-emitter/issues/3)
+}
+// Minimal event emitter: listeners live in this.e, an object keyed by event name.
+E.prototype = {
+  on: function (name, callback, ctx) {
+    var e = this.e || (this.e = {});
+
+    (e[name] || (e[name] = [])).push({
+      fn: callback,
+      ctx: ctx
+    });
+
+    return this;
+  },
+  // Like on(), but the wrapper unregisters itself before it calls the callback.
+  once: function (name, callback, ctx) {
+    var self = this;
+    function listener () {
+      self.off(name, listener);
+      callback.apply(ctx, arguments);
+    };
+
+    listener._ = callback
+    return this.on(name, listener, ctx);
+  },
+  // Calls every listener of `name` with the remaining arguments, iterating over a copy of the list.
+  emit: function (name) {
+    var data = [].slice.call(arguments, 1);
+    var evtArr = ((this.e || (this.e = {}))[name] || []).slice();
+    var i = 0;
+    var len = evtArr.length;
+
+    for (i; i < len; i++) {
+      evtArr[i].fn.apply(evtArr[i].ctx, data);
+    }
+
+    return this;
+  },
+  // Removes `callback` from `name`; without a callback every listener of `name` is dropped.
+  off: function (name, callback) {
+    var e = this.e || (this.e = {});
+    var evts = e[name];
+    var liveEvents = [];
+
+    if (evts && callback) {
+      for (var i = 0, len = evts.length; i < len; i++) {
+        if (evts[i].fn !== callback && evts[i].fn._ !== callback)
+          liveEvents.push(evts[i]);
+      }
+    }
+
+    // Remove event from queue to prevent memory leak
+    // Suggested by https://github.com/lazd
+    // Ref: https://github.com/scottcorgan/tiny-emitter/commit/c6ebfaa9bc973b33d110a84a307742b7cf94c953#commitcomment-5024910
+
+    (liveEvents.length)
+      ? e[name] = liveEvents
+      : delete e[name];
+
+    return this;
+  }
+};
+
+module.exports = E;
+
+},{}],7:[function(require,module,exports){
+(function (global, factory) {
+    if (typeof define === "function" && define.amd) {
+        define(['module', 'select'], factory);
+    } else if (typeof exports !== "undefined") {
+        factory(module, require('select'));
+    } else {
+        var mod = {
+            exports: {}
+        };
+        factory(mod, global.select);
+        global.clipboardAction = mod.exports;
+    }
+})(this, function (module, _select) {
+    'use strict';
+
+    var _select2 = _interopRequireDefault(_select);
+
+    function _interopRequireDefault(obj) {
+        return obj && obj.__esModule ? obj : {
+            default: obj
+        };
+    }
+
+    var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) {
+        return typeof obj;
+    } : function (obj) {
+        return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj;
+    };
+
+    function _classCallCheck(instance, Constructor) {
+        if (!(instance instanceof Constructor)) {
+            throw new TypeError("Cannot call a class as a function");
+        }
+    }
+
+    var _createClass = function () {
+        function defineProperties(target, props) {
+            for (var i = 0; i < props.length; i++) {
+                var descriptor = props[i];
+                descriptor.enumerable = descriptor.enumerable || false;
+                descriptor.configurable = true;
+                if ("value" in descriptor) descriptor.writable = true;
+                Object.defineProperty(target, descriptor.key, descriptor);
+            }
+        }
+
+        return function (Constructor, protoProps, staticProps) {
+            if (protoProps) defineProperties(Constructor.prototype, protoProps);
+            if (staticProps) defineProperties(Constructor, staticProps);
+            return Constructor;
+        };
+    }();
+
+    var ClipboardAction = function () {
+        /**
+         * @param {Object} options
+         */
+        function ClipboardAction(options) {
+            _classCallCheck(this, ClipboardAction);
+
+            this.resolveOptions(options);
+            this.initSelection();
+        }
+
+        /**
+         * Defines base properties passed from constructor.
+         * @param {Object} options
+         */
+
+
+        _createClass(ClipboardAction, [{
+            key: 'resolveOptions',
+            value: function resolveOptions() {
+                var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};
+
+                this.action = options.action;
+                this.emitter = options.emitter;
+                this.target = options.target;
+                this.text = options.text;
+                this.trigger = options.trigger;
+
+                this.selectedText = '';
+            }
+        }, {
+            key: 'initSelection',
+            value: function initSelection() {
+                if (this.text) {
+                    this.selectFake();
+                } else if (this.target) {
+                    this.selectTarget();
+                }
+            }
+        }, {
+            key: 'selectFake',
+            value: function selectFake() {
+                var _this = this;
+
+                var isRTL = document.documentElement.getAttribute('dir') == 'rtl';
+
+                this.removeFake();
+
+                this.fakeHandlerCallback = function () {
+                    return _this.removeFake();
+                };
+                this.fakeHandler = document.body.addEventListener('click', this.fakeHandlerCallback) || true;
+
+                this.fakeElem = document.createElement('textarea');
+                // Prevent zooming on iOS
+                this.fakeElem.style.fontSize = '12pt';
+                // Reset box model
+                this.fakeElem.style.border = '0';
+                this.fakeElem.style.padding = '0';
+                this.fakeElem.style.margin = '0';
+                // Move element out of screen horizontally
+                this.fakeElem.style.position = 'absolute';
+                this.fakeElem.style[isRTL ? 'right' : 'left'] = '-9999px';
+                // Move element to the same position vertically
+                var yPosition = window.pageYOffset || document.documentElement.scrollTop;
+                this.fakeElem.style.top = yPosition + 'px';
+
+                this.fakeElem.setAttribute('readonly', '');
+                this.fakeElem.value = this.text;
+
+                document.body.appendChild(this.fakeElem);
+
+                this.selectedText = (0, _select2.default)(this.fakeElem);
+                this.copyText();
+            }
+        }, {
+            key: 'removeFake',
+            value: function removeFake() {
+                if (this.fakeHandler) {
+                    document.body.removeEventListener('click', this.fakeHandlerCallback);
+                    this.fakeHandler = null;
+                    this.fakeHandlerCallback = null;
+                }
+
+                if (this.fakeElem) {
+                    document.body.removeChild(this.fakeElem);
+                    this.fakeElem = null;
+                }
+            }
+        }, {
+            key: 'selectTarget',
+            value: function selectTarget() {
+                this.selectedText = (0, _select2.default)(this.target);
+                this.copyText();
+            }
+        }, {
+            key: 'copyText',
+            value: function copyText() {
+                var succeeded = void 0;
+
+                try {
+                    succeeded = document.execCommand(this.action);
+                } catch (err) {
+                    succeeded = false;
+                }
+
+                this.handleResult(succeeded);
+            }
+        }, {
+            key: 'handleResult',
+            value: function handleResult(succeeded) {
+                this.emitter.emit(succeeded ? 'success' : 'error', {
+                    action: this.action,
+                    text: this.selectedText,
+                    trigger: this.trigger,
+                    clearSelection: this.clearSelection.bind(this)
+                });
+            }
+        }, {
+            key: 'clearSelection',
+            value: function clearSelection() {
+                if (this.target) {
+                    this.target.blur();
+                }
+
+                window.getSelection().removeAllRanges();
+            }
+        }, {
+            key: 'destroy',
+            value: function destroy() {
+                this.removeFake();
+            }
+        }, {
+            key: 'action',
+            set: function set() {
+                var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 'copy';
+
+                this._action = action;
+
+                if (this._action !== 'copy' && this._action !== 'cut') {
+                    throw new Error('Invalid "action" value, use either "copy" or "cut"');
+                }
+            },
+            get: function get() {
+                return this._action;
+            }
+        }, {
+            key: 'target',
+            set: function set(target) {
+                if (target !== undefined) {
+                    if (target && (typeof target === 'undefined' ? 'undefined' : _typeof(target)) === 'object' && target.nodeType === 1) {
+                        if (this.action === 'copy' && target.hasAttribute('disabled')) {
+                            throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');
+                        }
+
+                        if (this.action === 'cut' && (target.hasAttribute('readonly') || target.hasAttribute('disabled'))) {
+                            throw new Error('Invalid "target" attribute. You can\'t cut text from elements with "readonly" or "disabled" attributes');
+                        }
+
+                        this._target = target;
+                    } else {
+                        throw new Error('Invalid "target" value, use a valid Element');
+                    }
+                }
+            },
+            get: function get() {
+                return this._target;
+            }
+        }]);
+
+        return ClipboardAction;
+    }();
+
+    module.exports = ClipboardAction;
+});
+
+},{"select":5}],8:[function(require,module,exports){
+(function (global, factory) {
+    if (typeof define === "function" && define.amd) {
+        define(['module', './clipboard-action', 'tiny-emitter', 'good-listener'], factory);
+    } else if (typeof exports !== "undefined") {
+        factory(module, require('./clipboard-action'), require('tiny-emitter'), require('good-listener'));
+    } else {
+        var mod = {
+            exports: {}
+        };
+        factory(mod, global.clipboardAction, global.tinyEmitter, global.goodListener);
+        global.clipboard = mod.exports;
+    }
+})(this, function (module, _clipboardAction, _tinyEmitter, _goodListener) {
+    'use strict';
+
+    var _clipboardAction2 = _interopRequireDefault(_clipboardAction);
+
+    var _tinyEmitter2 = _interopRequireDefault(_tinyEmitter);
+
+    var _goodListener2 = _interopRequireDefault(_goodListener);
+
+    function _interopRequireDefault(obj) {
+        return obj && obj.__esModule ? obj : {
+            default: obj
+        };
+    }
+
+    function _classCallCheck(instance, Constructor) {
+        if (!(instance instanceof Constructor)) {
+            throw new TypeError("Cannot call a class as a function");
+        }
+    }
+
+    var _createClass = function () {
+        function defineProperties(target, props) {
+            for (var i = 0; i < props.length; i++) {
+                var descriptor = props[i];
+                descriptor.enumerable = descriptor.enumerable || false;
+                descriptor.configurable = true;
+                if ("value" in descriptor) descriptor.writable = true;
+                Object.defineProperty(target, descriptor.key, descriptor);
+            }
+        }
+
+        return function (Constructor, protoProps, staticProps) {
+            if (protoProps) defineProperties(Constructor.prototype, protoProps);
+            if (staticProps) defineProperties(Constructor, staticProps);
+            return Constructor;
+        };
+    }();
+
+    function _possibleConstructorReturn(self, call) {
+        if (!self) {
+            throw new ReferenceError("this hasn't been initialised - super() hasn't been called");
+        }
+
+        return call && (typeof call === "object" || typeof call === "function") ? call : self;
+    }
+
+    function _inherits(subClass, superClass) {
+        if (typeof superClass !== "function" && superClass !== null) {
+            throw new TypeError("Super expression must either be null or a function, not " + typeof superClass);
+        }
+
+        subClass.prototype = Object.create(superClass && superClass.prototype, {
+            constructor: {
+                value: subClass,
+                enumerable: false,
+                writable: true,
+                configurable: true
+            }
+        });
+        if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass;
+    }
+
+    var Clipboard = function (_Emitter) {
+        _inherits(Clipboard, _Emitter);
+
+        /**
+         * @param {String|HTMLElement|HTMLCollection|NodeList} trigger
+         * @param {Object} options
+         */
+        function Clipboard(trigger, options) {
+            _classCallCheck(this, Clipboard);
+
+            var _this = _possibleConstructorReturn(this, (Clipboard.__proto__ || Object.getPrototypeOf(Clipboard)).call(this));
+
+            _this.resolveOptions(options);
+            _this.listenClick(trigger);
+            return _this;
+        }
+
+        /**
+         * Defines if attributes would be resolved using internal setter functions
+         * or custom functions that were passed in the constructor.
+         * @param {Object} options
+         */
+
+
+        _createClass(Clipboard, [{
+            key: 'resolveOptions',
+            value: function resolveOptions() {
+                var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};
+
+                this.action = typeof options.action === 'function' ? options.action : this.defaultAction;
+                this.target = typeof options.target === 'function' ? options.target : this.defaultTarget;
+                this.text = typeof options.text === 'function' ? options.text : this.defaultText;
+            }
+        }, {
+            key: 'listenClick',
+            value: function listenClick(trigger) {
+                var _this2 = this;
+
+                this.listener = (0, _goodListener2.default)(trigger, 'click', function (e) {
+                    return _this2.onClick(e);
+                });
+            }
+        }, {
+            key: 'onClick',
+            value: function onClick(e) {
+                var trigger = e.delegateTarget || e.currentTarget;
+
+                if (this.clipboardAction) {
+                    this.clipboardAction = null;
+                }
+
+                this.clipboardAction = new _clipboardAction2.default({
+                    action: this.action(trigger),
+                    target: this.target(trigger),
+                    text: this.text(trigger),
+                    trigger: trigger,
+                    emitter: this
+                });
+            }
+        }, {
+            key: 'defaultAction',
+            value: function defaultAction(trigger) {
+                return getAttributeValue('action', trigger);
+            }
+        }, {
+            key: 'defaultTarget',
+            value: function defaultTarget(trigger) {
+                var selector = getAttributeValue('target', trigger);
+
+                if (selector) {
+                    return document.querySelector(selector);
+                }
+            }
+        }, {
+            key: 'defaultText',
+            value: function defaultText(trigger) {
+                return getAttributeValue('text', trigger);
+            }
+        }, {
+            key: 'destroy',
+            value: function destroy() {
+                this.listener.destroy();
+
+                if (this.clipboardAction) {
+                    this.clipboardAction.destroy();
+                    this.clipboardAction = null;
+                }
+            }
+        }], [{
+            key: 'isSupported',
+            value: function isSupported() {
+                var action = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ['copy', 'cut'];
+
+                var actions = typeof action === 'string' ? [action] : action;
+                var support = !!document.queryCommandSupported;
+
+                actions.forEach(function (action) {
+                    support = support && !!document.queryCommandSupported(action);
+                });
+
+                return support;
+            }
+        }]);
+
+        return Clipboard;
+    }(_tinyEmitter2.default);
+
+    /**
+     * Helper function to retrieve attribute value.
+     * @param {String} suffix
+     * @param {Element} element
+     */
+    function getAttributeValue(suffix, element) {
+        var attribute = 'data-clipboard-' + suffix;
+
+        if (!element.hasAttribute(attribute)) {
+            return;
+        }
+
+        return element.getAttribute(attribute);
+    }
+
+    module.exports = Clipboard;
+});
+
+},{"./clipboard-action":7,"good-listener":4,"tiny-emitter":6}]},{},[8])(8)
+});
\ No newline at end of file
diff --git a/docs/_static/js/copycode.js b/docs/_static/js/copycode.js
index f9ebd64abb2b..d42e99277ff8 100644
--- a/docs/_static/js/copycode.js
+++ b/docs/_static/js/copycode.js
@@ -1,4 +1,23 @@
-/*Copy code to clipboard*/
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Copy code to clipboard */
 LANG_GP = {'default':'>>> ', 'python':'>>> ' , 'scala':'scala>', 'julia':'julia> ', 'r':'> ', 'perl':'pdl>' , 'cpp':'', 'bash':'$ '};
 
 function addBtn() {
diff --git a/docs/_static/js/docversion.js b/docs/_static/js/docversion.js
index f87c4587b4a0..1119f4ec1fff 100644
--- a/docs/_static/js/docversion.js
+++ b/docs/_static/js/docversion.js
@@ -1,3 +1,23 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Set the version of the website */
 function setVersion(){
         let doc = window.location.pathname.match(/^\/(api\/.*)$/) || window.location.pathname.match(/^\/versions\/[^*]+\/(api\/.*)$/);
         if (doc) {
@@ -11,7 +31,7 @@ function setVersion(){
                                     $( el ).attr('href', versionedDoc);
                             }
                     });
-            }        
+            }
         }
 }
 
diff --git a/docs/_static/js/navbar.js b/docs/_static/js/navbar.js
index 0384194fa2de..5dde7d8ff2e2 100644
--- a/docs/_static/js/navbar.js
+++ b/docs/_static/js/navbar.js
@@ -1,3 +1,23 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Custom navigation bar formatting */
 var searchBox = $("#search-input-wrap");
 var TITLE = ['/install/', '/gluon/', '/api/', '/docs/', '/community/' ];
 var DOC_TITLE = ['/faq/', '/tutorials/', '/architecture/', '/model_zoo/'];
@@ -25,7 +45,7 @@ function navbar() {
             $(this).hide;
         }
         else rightPos = $(this).offset().left + $(this).width();
-        
+
         if(isCovered) {
             plusMenuList.push($(this).clone());
             $(this).hide();
@@ -38,7 +58,7 @@ function navbar() {
         }
         else $(this).show();
     });
-    
+
     if(plusMenuList.length == 0) {
         $(".plusIcon").first().hide();
         return;
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
index 4fbfbed00a51..f4fde4e1f2ef 100644
--- a/docs/_static/js/options.js
+++ b/docs/_static/js/options.js
@@ -1,3 +1,23 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Installation page display functions for install selector */
 var versionSelect   = defaultVersion = 'v1.3.1';
 var platformSelect    = 'Linux';
 var languageSelect  = 'Python';
diff --git a/docs/_static/js/page.js b/docs/_static/js/page.js
index caba7dd1b2d4..425998d6d706 100644
--- a/docs/_static/js/page.js
+++ b/docs/_static/js/page.js
@@ -1,3 +1,22 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 /* Generate url tracking for each page */
 var protocol = location.protocol.concat("//");
 var host = protocol.concat(window.location.host);
diff --git a/docs/_static/js/search.js b/docs/_static/js/search.js
index 9df9702225a2..6a70b4ef5cbb 100644
--- a/docs/_static/js/search.js
+++ b/docs/_static/js/search.js
@@ -1,9 +1,29 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Display functionality for the search feature */
 $(document).ready(function () {
     var searchForm = $("#search-input-wrap").children("form").first();
     searchForm.append('<div class="form-group searchBtn"><input type="submit" class="form-control" value="Go"></div>');
     searchForm.children("div").first().addClass("searchBox");
     $(".searchBox").addClass("searchBoxNorm");
-    
+
     $('#searchIcon').click(function () {
         if($('#search-input-wrap').is(':hidden')) {
             $('#search-input-wrap').show();
@@ -16,4 +36,4 @@ $(document).ready(function () {
             $('#searchIcon span').addClass('glyphicon-search');
         }
     });
-});
\ No newline at end of file
+});
diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js
index c5b8b519ed9c..65899d5ddd65 100644
--- a/docs/_static/js/sidebar.js
+++ b/docs/_static/js/sidebar.js
@@ -1,4 +1,23 @@
-/*Preprocess*/
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/* Customizations to the sphinx theme */
 var LANG = ['python', 'c++', 'clojure', 'julia', 'perl', 'r', 'scala', 'java'];
 var TITLE_WITH_LANG = ['/get_started/', '/tutorials/', '/faq/', '/architecture/', '/community/'];
 for(var i = 0; i < LANG.length; ++i) {
diff --git a/docs/_static/selectlang.js b/docs/_static/selectlang.js
index 25337abcb22b..c0075f956bd0 100644
--- a/docs/_static/selectlang.js
+++ b/docs/_static/selectlang.js
@@ -1,3 +1,22 @@
+/*!
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 function changeLanguage(langSelect, langSelectLabel, rootpath){
 	langSelect.change(function() {
 		var lang = langSelect.val();
@@ -22,4 +41,4 @@ $(document).ready(function () {
 	langSelectLabel.text($("option:selected").text());
 
 	changeLanguage(langSelect, langSelectLabel, getRootPath());
-})
\ No newline at end of file
+})
diff --git a/docs/api/c++/index.md b/docs/api/c++/index.md
index 9a68cdafda6e..6bb7a410b341 100644
--- a/docs/api/c++/index.md
+++ b/docs/api/c++/index.md
@@ -1,8 +1,55 @@
 # MXNet - C++ API
 
+The MXNet C++ Package provides C++ API bindings to the users of MXNet.  Currently, these bindings are not available as a standalone package.
+Users of these bindings are required to build the package as described below.
+
+## Building C++ Package
+
+The cpp-package directory contains the implementation of C++ API. As mentioned above, users are required to build this directory or package before using it.
+**The cpp-package is built while building the MXNet shared library, *libmxnet.so*.**
+
+### Steps to build the C++ package:
+1.  Building the MXNet C++ package requires building MXNet from source.
+2.  Clone the MXNet GitHub repository **recursively** to ensure the code in submodules is available for building MXNet.
+	```
+	git clone --recursive https://github.com/apache/incubator-mxnet mxnet
+	```
+
+3.  Install the [prerequisites](<https://mxnet.incubator.apache.org/install/build_from_source#prerequisites>), desired [BLAS libraries](<https://mxnet.incubator.apache.org/install/build_from_source#blas-library>) and optional [OpenCV, CUDA, and cuDNN](<https://mxnet.incubator.apache.org/install/build_from_source#optional>) for building MXNet from source.
+4.  There is a configuration file for make, [make/config.mk](<https://github.com/apache/incubator-mxnet/blob/master/make/config.mk>), that contains all the compilation options. You can edit this file and set the appropriate options prior to running the **make** command.
+5.  Please refer to the [platform specific build instructions](<https://mxnet.incubator.apache.org/install/build_from_source#build-instructions-by-operating-system>) and the available [build configurations](https://mxnet.incubator.apache.org/install/build_from_source#build-configurations) for more details.
+6.  To enable the build of the C++ Package, set **USE\_CPP\_PACKAGE = 1** in [make/config.mk](<https://github.com/apache/incubator-mxnet/blob/master/make/config.mk>). Optionally, the compilation flag can also be specified on the **make** command line as follows.
+	```
+	make -j USE_CPP_PACKAGE=1
+	```
+
+## Usage
+
+In order to consume the C++ API please follow the steps below.
+
+1. Ensure that the MXNet shared library is built from source with the **USE\_CPP\_PACKAGE = 1**.
+2. Include [MxNetCpp.h](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/include/mxnet-cpp/MxNetCpp.h>) in the program that is going to consume the MXNet C++ API.
+	```
+	#include <mxnet-cpp/MxNetCpp.h>
+	```
+3. While building the program, ensure that the correct paths to the directories containing the header files and the MXNet shared library are specified.
+4. The program links the MXNet shared library dynamically. Hence the library needs to be accessible to the program during runtime. This can be achieved by including the path to the shared library in the environment variable **LD\_LIBRARY\_PATH** for Linux (including Ubuntu) and Mac, and **PATH** for Windows.
+
+
+## Tutorial
+
+A basic tutorial can be found at <https://mxnet.incubator.apache.org/tutorials/c++/basics.html>.
+
+## Examples
+
+The example directory contains examples for you to get started.
 For namespaces, classes, and code files for the MXNet C++ package, see the  following:
 
-* [Namespaces](http://mxnet.io/doxygen/namespaces.html)
-* [Classes](http://mxnet.io/doxygen/annotated.html)
-* [Code Files](http://mxnet.io/doxygen/files.html)
-* [MXNet CPP Package](/~https://github.com/dmlc/mxnet/tree/master/cpp-package)
+## Links to the documentation
+
+The classes and functions in the MXNet C++ API are available under the **mxnet::cpp** namespace. The links to the documentation are as follows:
+
+1. [Namespaces](../../doxygen/namespaces.html)
+2. [Classes in mxnet::cpp namespace](../../doxygen/namespacemxnet_1_1cpp.html)
+3. [Code Files](../../doxygen/files.html)
+4. [MXNet CPP Package](https://github.com/dmlc/mxnet/tree/master/cpp-package)
diff --git a/docs/build_version_doc/artifacts/.htaccess b/docs/build_version_doc/artifacts/.htaccess
index d553ce5a8dc6..6bf3a659ce55 100644
--- a/docs/build_version_doc/artifacts/.htaccess
+++ b/docs/build_version_doc/artifacts/.htaccess
@@ -1,28 +1,29 @@
 RewriteEngine on
-RewriteRule ^get_started/why_mxnet.html$ /faq/why_mxnet.html [R=301,L]
-RewriteRule ^get_started.*$ /install/ [R=301,L]
-RewriteRule ^how_to.*$ /faq/ [R=301,L]
-RewriteRule ^api/python/symbol.html$ /api/python/symbol/symbol.html [R=301,L]
-RewriteRule ^community/index.html$ /community/contribute.html [R=301,L]
+RewriteRule .* - [E=default_version:/versions/master]
+RewriteRule ^get_started/why_mxnet.html$ %{ENV:default_version}/faq/why_mxnet.html [R=301,L]
+RewriteRule ^get_started.*$ %{ENV:default_version}/install/ [R=301,L]
+RewriteRule ^how_to.*$ %{ENV:default_version}/faq/ [R=301,L]
+RewriteRule ^api/python/symbol.html$ %{ENV:default_version}/api/python/symbol/symbol.html [R=301,L]
+RewriteRule ^community/index.html$ %{ENV:default_version}/community/contribute.html [R=301,L]
 
 # Navigation bar redirects to latest info
-RewriteRule ^versions/[^\/]+/architecture/.*$ /architecture/ [R=301,L]
-RewriteRule ^versions/[^\/]+/community/.*$ /community/ [R=301,L]
-RewriteRule ^versions/[^\/]+/faq/.*$ /faq/ [R=301,L]
-RewriteRule ^versions/[^\/]+/gluon/.*$ /gluon/ [R=301,L]
-RewriteRule ^versions/[^\/]+/install/.*$ /install/ [R=301,L]
-RewriteRule ^versions/[^\/]+/tutorials/(.*)$ /tutorials/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/architecture/(.*)$ %{ENV:default_version}/architecture/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/community/(.*)$ %{ENV:default_version}/community/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/faq/(.*)$ %{ENV:default_version}/faq/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/gluon/(.*)$ %{ENV:default_version}/gluon/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/install/(.*)$ %{ENV:default_version}/install/$1 [R=301,L]
+RewriteRule ^versions\/[0-9.]+\/tutorials/(.*)$ %{ENV:default_version}/tutorials/$1 [R=301,L]
 
 # Redirect navbar APIs that did not exist
-RewriteRule ^versions/0.11.0/api/python/contrib/onnx.html /error/api.html [R=301,L]
-RewriteRule ^versions/0.12.1/api/python/contrib/onnx.html /error/api.html [R=301,L]
-RewriteRule ^versions/1.0.0/api/python/contrib/onnx.html /error/api.html [R=301,L]
-RewriteRule ^versions/1.1.0/api/python/contrib/onnx.html /error/api.html [R=301,L]
+RewriteRule ^versions/0.11.0/api/python/contrib/onnx.html %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/0.12.1/api/python/contrib/onnx.html %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/1.0.0/api/python/contrib/onnx.html %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/1.1.0/api/python/contrib/onnx.html %{ENV:default_version}/error/api.html [R=301,L]
 
-RewriteRule ^versions/0.11.0/api/clojure/.*$ /error/api.html [R=301,L]
-RewriteRule ^versions/0.12.1/api/clojure/.*$ /error/api.html [R=301,L]
-RewriteRule ^versions/1.0.0/api/clojure/.*$ /error/api.html [R=301,L]
-RewriteRule ^versions/1.1.0/api/clojure/.*$ /error/api.html [R=301,L]
-RewriteRule ^versions/1.2.1/api/clojure/.*$ /error/api.html [R=301,L]
+RewriteRule ^versions/0.11.0/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/0.12.1/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/1.0.0/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/1.1.0/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L]
+RewriteRule ^versions/1.2.1/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L]
 
 ErrorDocument 404 https://mxnet.incubator.apache.org/error/404.html
diff --git a/docs/build_version_doc/build_all_version.sh b/docs/build_version_doc/build_all_version.sh
index 5f857996f19d..3c432bbedfc9 100755
--- a/docs/build_version_doc/build_all_version.sh
+++ b/docs/build_version_doc/build_all_version.sh
@@ -43,6 +43,9 @@
 set -e
 set -x
 
+# Set OPTS to any Sphinx build options, like -W for "warnings as errors"
+OPTS=
+
 # $1 is the list of branches/tags to build
 if [ -z "$1" ]
   then
@@ -117,6 +120,10 @@ function checkout () {
   git checkout "$repo_folder" || git branch $repo_folder "upstream/$repo_folder" && git checkout "$repo_folder" || exit 1
   if [ $tag == 'master' ]; then
     git pull
+    # master gets warnings as errors for Sphinx builds
+    OPTS="-W"
+  else
+    OPTS=
   fi
   git submodule update --init --recursive
   cd ..
@@ -160,7 +167,7 @@ for key in ${!build_arr[@]}; do
 
     echo "Building $tag..."
     cd $tag/docs
-    make html USE_OPENMP=1 BUILD_VER=$tag || exit 1
+    make html USE_OPENMP=1 BUILD_VER=$tag SPHINXOPTS=$OPTS || exit 1
     # Navigate back to build_version_doc folder
     cd ../../../
     # Use the display tag name for the folder name
diff --git a/docs/community/contribute.md b/docs/community/contribute.md
index 0f40ba86a267..1b83638edcb5 100644
--- a/docs/community/contribute.md
+++ b/docs/community/contribute.md
@@ -62,13 +62,24 @@ To join the MXNet slack channel send request to the contributor mailing list.
 
 ### Social Media
 
-Keep connected with the latest MXNet news and updates on [Twitter](https://twitter.com/apachemxnet) and [Reddit](https://reddit.com/r/mxnet). Also, subscribe to the [MXNet YouTube channel](https://www.youtube.com/channel/UCQua2ZAkbr_Shsgfk1LCy6A).
-
-<div class="g-ytsubscribe" data-channelid="UCQua2ZAkbr_Shsgfk1LCy6A" data-layout="full" data-count="hidden"></div>
-<br/><br/>
-<a href="https://twitter.com/apachemxnet?ref_src=twsrc%5Etfw" class="twitter-follow-button" data-show-count="false">Follow @apachemxnet</a><script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
-<br/><br/>
-<a href="https://reddit.com/r/mxnet"><img src="https://www.redditstatic.com/spreddit5.gif" alt="reddit" border="0"/> r/mxnet</a>
+Keep connected with the latest MXNet news and updates.
+
+<p>
+<a id="medium"></a>
+<a href="https://medium.com/apache-mxnet"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/social/medium_black.svg?sanitize=true" height="30px"/> Contributor and guest blogs about MXNet</a>
+</p>
+<p>
+<a id="reddit"></a>
+<a href="https://reddit.com/r/mxnet"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/social/reddit_blue.svg?sanitize=true" height="30px" alt="reddit"/> Discuss MXNet on r/mxnet</a>
+</p>
+<p>
+<a id="twitter"></a>
+<a href="https://twitter.com/apachemxnet"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/social/twitter.svg?sanitize=true" height="30px"/> Apache MXNet on Twitter</a>
+</p>
+<p>
+<a id="youtube"></a>
+<a href="https://www.youtube.com/apachemxnet"><img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/social/youtube_red.svg?sanitize=true" height="30px"/> MXNet YouTube channel</a>
+</p>
 
 
 ## JIRA
diff --git a/docs/community/mxnet_channels.md b/docs/community/mxnet_channels.md
index 98cce941e236..8bc8471dffd1 100644
--- a/docs/community/mxnet_channels.md
+++ b/docs/community/mxnet_channels.md
@@ -3,8 +3,8 @@
 Converse with the MXNet community via the following channels:
 
 - [Forum](https://discuss.mxnet.io/): [discuss.mxnet.io](https://discuss.mxnet.io/)
-- [MXNet Apache developer mailing list](https://lists.apache.org/list.html?dev@mxnet.apache.org) (dev@mxnet.apache.org): To subscribe, send an email to <a href="mailto:user-subscribe@mxnet.apache.org">user-subscribe@mxnet.apache.org</a>
-- [MXNet Apache user mailing list](https://lists.apache.org/list.html?user@mxnet.apache.org) (user@mxnet.apache.org): To subscribe, send an email to <a href="mailto:dev-subscribe@mxnet.apache.org">dev-subscribe@mxnet.apache.org</a>
-- [MXNet Slack channel](https://the-asf.slack.com/) (Channel: #mxnet): To request an invitation to the channel please subscribe to the mailing list above and then email: <a href="mailto:dev@mxnet.apache.org">dev@mxnet.apache.org</a>
+- [MXNet Apache developer mailing list](https://lists.apache.org/list.html?dev@mxnet.apache.org) (dev@mxnet.apache.org): To subscribe, send an email to <a href="mailto:dev-subscribe@mxnet.apache.org">dev-subscribe@mxnet.apache.org</a>
+- [MXNet Apache user mailing list](https://lists.apache.org/list.html?user@mxnet.apache.org) (user@mxnet.apache.org): To subscribe, send an email to <a href="mailto:user-subscribe@mxnet.apache.org">user-subscribe@mxnet.apache.org</a>
+- [MXNet Slack channel](https://the-asf.slack.com/) (Channel: #mxnet): To request an invitation to the channel please subscribe to the dev mailing list above and then email: <a href="mailto:dev@mxnet.apache.org">dev@mxnet.apache.org</a>
 
 Note: if you have an email address with apache.org, you do not need an approval to join the MXNet Slack channel.
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index 8d08e320721a..98057d0d76d6 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -37,6 +37,12 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_CPU_NNPACK_NTHREADS
   - Values: Int ```(default=4)```
   - The number of threads used for NNPACK. NNPACK package aims to provide high-performance implementations of some layers for multi-core CPUs. Checkout [NNPACK](http://mxnet.io/faq/nnpack.html) to know more about it.
+* MXNET_MP_WORKER_NTHREADS
+  - Values: Int ```(default=1)```
+  - The number of scheduling threads on CPU given to multiprocess workers. Enlarging this number allows more operators to run in parallel in individual workers, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
+* MXNET_MP_OPENCV_NUM_THREADS
+  - Values: Int ```(default=0)```
+  - The number of OpenCV execution threads given to multiprocess workers. OpenCV multithreading is disabled if `MXNET_MP_OPENCV_NUM_THREADS` < 1 (default). Enlarging this number may boost the performance of individual workers when executing underlying OpenCV functions, but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
 
 ## Memory Options
 
@@ -99,10 +105,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_KVSTORE_REDUCTION_NTHREADS
   - Values: Int ```(default=4)```
   - The number of CPU threads used for summing up big arrays on a single machine
-  - This will also be used for `dist_sync` kvstore to sum up arrays from different contexts on a single machine. 
-  - This does not affect summing up of arrays from different machines on servers. 
+  - This will also be used for `dist_sync` kvstore to sum up arrays from different contexts on a single machine.
+  - This does not affect summing up of arrays from different machines on servers.
   - Summing up of arrays for `dist_sync_device` kvstore is also unaffected as that happens on GPUs.
-  
+
 * MXNET_KVSTORE_BIGARRAY_BOUND
   - Values: Int ```(default=1000000)```
   - The minimum size of a "big array".
@@ -166,7 +172,7 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
 
 * MXNET_CUDNN_AUTOTUNE_DEFAULT
   - Values: 0, 1, or 2 ```(default=1)```
-  - The default value of cudnn auto tuning for convolution layers. 
+  - The default value of cudnn auto tuning for convolution layers.
   - Value of 0 means there is no auto tuning to pick the convolution algo
   - Performance tests are run to pick the convolution algo when value is 1 or 2
   - Value of 1 chooses the best algo in a limited workspace
@@ -190,12 +196,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
 * MXNET_HOME
   - Data directory in the filesystem for storage, for example when downloading gluon models.
   - Default in *nix is .mxnet APPDATA/mxnet in windows.
-  
+
 * MXNET_MKLDNN_ENABLED
   - Values: 0, 1 ```(default=1)```
   - Flag to enable or disable MKLDNN accelerator. On by default.
   - Only applies to mxnet that has been compiled with MKLDNN (```pip install mxnet-mkl``` or built from source with ```USE_MKLDNN=1```)
-  
+
 * MXNET_MKLDNN_CACHE_NUM
   - Values: Int ```(default=-1)```
   - Flag to set num of elements that MKLDNN cache can hold. Default is -1 which means cache size is unbounded. Should only be set if your model has variable input shapes, as cache size may grow unbounded. The number represents the number of items in the cache and is proportional to the number of layers that use MKLDNN and different input shape.
@@ -222,3 +228,17 @@ Settings for More GPU Parallelism
 - Set ```MXNET_GPU_WORKER_NTHREADS``` to a larger number (e.g., 2)
   - To reduce memory usage, consider setting ```MXNET_EXEC_NUM_TEMP```.
   - This might not speed things up, especially for image applications, because GPU is usually fully utilized even with serialized jobs.
+
+Settings for controlling OMP tuning
+---------------------------------
+- Set ```MXNET_USE_OPERATOR_TUNING=0``` to disable Operator tuning code which decides whether to use OMP or not for operator
+   - Values: String representation of MXNET_USE_OPERATOR_TUNING environment variable
+   -            0=disable all
+   -            1=enable all
+   -            float32, float16, float32=list of types to enable, and disable those not listed
+   - refer : /~https://github.com/apache/incubator-mxnet/blob/master/src/operator/operator_tune-inl.h#L444
+
+- Set ```MXNET_USE_NUM_CORES_OPERATOR_TUNING``` to define num_cores to be used by operator tuning code.
+  - This reduces operator tuning overhead when there are multiple instances of mxnet running in the system and we know that
+    each mxnet will take only partial num_cores available with system. 
+  - refer: /~https://github.com/apache/incubator-mxnet/pull/13602
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 1b4a95d3f331..fe91f7ca43b7 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -15,72 +15,72 @@ and full working examples, visit the [tutorials section](../tutorials/index.md).
 
 ## API
 
-* [What's the difference between the Module and Gluon APIs for Python?](http://mxnet.io/api/python/index.html)
+* [What's the difference between the Module and Gluon APIs for Python?](../api/python/index.html)
 
 ## Modeling
-* [How do I fine-tune pre-trained models to a new dataset?](http://mxnet.io/faq/finetune.html)
+* [How do I fine-tune pre-trained models to a new dataset?](finetune.html)
 
-* [How do I work with variable-length input in MXNet (bucketing)?](http://mxnet.io/faq/bucketing.html)
+* [How do I work with variable-length input in MXNet (bucketing)?](bucketing.html)
 
-* [How do I visualize neural networks as computation graphs?](http://mxnet.io/faq/visualize_graph.html)
+* [How do I visualize neural networks as computation graphs?](visualize_graph.html)
 
 
 ## Scale
-* [How can I train with multiple CPU/GPUs on a single machine with data parallelism?](http://mxnet.io/faq/multi_devices.html)
+* [How can I train with multiple CPU/GPUs on a single machine with data parallelism?](multi_devices.html)
 
-* [How can I train using multiple machines with data parallelism?](http://mxnet.io/faq/distributed_training.html)
+* [How can I train using multiple machines with data parallelism?](distributed_training.html)
 
-* [How can I train using multiple GPUs with model parallelism?](http://mxnet.io/faq/model_parallel_lstm.html)
+* [How can I train using multiple GPUs with model parallelism?](model_parallel_lstm.html)
 
 
 ## Speed
-* [How do I use gradient compression with distributed training?](http://mxnet.io/faq/gradient_compression.html)
+* [How do I use gradient compression with distributed training?](gradient_compression.html)
 
-* [Can I use nnpack to improve the CPU performance of MXNet?](http://mxnet.io/faq/nnpack.html)
+* [Can I use nnpack to improve the CPU performance of MXNet?](nnpack.html)
 
-* [What are the best setup and data-handling tips and tricks for improving speed?](http://mxnet.io/faq/perf.html)
+* [What are the best setup and data-handling tips and tricks for improving speed?](perf.html)
 
-* [How do I use mixed precision with MXNet or Gluon?](http://mxnet.io/faq/float16.html)
+* [How do I use mixed precision with MXNet or Gluon?](float16.html)
 
 ## Deployment Environments
-* [Can I run MXNet on smart or mobile devices?](http://mxnet.io/faq/smart_device.html)
+* [Can I run MXNet on smart or mobile devices?](smart_device.html)
 
 * [How to use data from S3 for training?](s3_integration.md)
 
-* [How to setup MXNet on AWS?](http://docs.aws.amazon.com/mxnet/latest/dg/mxnet-on-ec2-instance.html)
+* [How to run MXNet on AWS?](https://docs.aws.amazon.com/mxnet/latest/dg/whatis.html)
 
 * [How to do distributed training using MXNet on AWS?](http://docs.aws.amazon.com/mxnet/latest/dg/mxnet-on-ec2-cluster.html)
 
-* [How do I run MXNet on a Raspberry Pi for computer vision?](http://mxnet.io/tutorials/embedded/wine_detector.html)
+* [How do I run MXNet on a Raspberry Pi for computer vision?](../tutorials/embedded/wine_detector.html)
 
 * [How do I run Keras 2 with MXNet backend?](/~https://github.com/awslabs/keras-apache-mxnet/blob/master/docs/mxnet_backend/installation.md)
 
 * [How to convert MXNet models to Apple CoreML format?](/~https://github.com/apache/incubator-mxnet/tree/master/tools/coreml)
 
 ## Security
-* [How to run MXNet securely?](http://mxnet.io/faq/security.html)
+* [How to run MXNet securely?](security.html)
 
 ## Extend and Contribute to MXNet
 
-* [How do I join the MXNet development discussion?](http://mxnet.io/community/mxnet_channels.html)
+* [How do I join the MXNet development discussion?](../community/mxnet_channels.html)
 
-* [How do I contribute a patch to MXNet?](http://mxnet.io/community/contribute.html)
+* [How do I contribute a patch to MXNet?](../community/contribute.html)
 
-* [How do I implement operators in MXNet backend?](http://mxnet.io/faq/add_op_in_backend.html)
+* [How do I implement operators in MXNet backend?](add_op_in_backend.html)
 
-* [How do I create new operators in MXNet?](http://mxnet.io/faq/new_op.html)
+* [How do I create new operators in MXNet?](new_op.html)
 
 * [How do I implement sparse operators in MXNet backend?](https://cwiki.apache.org/confluence/display/MXNET/A+Guide+to+Implementing+Sparse+Operators+in+MXNet+Backend)
 
 * [How do I contribute an example or tutorial?](/~https://github.com/apache/incubator-mxnet/tree/master/example#contributing)
 
-* [How do I set MXNet's environmental variables?](http://mxnet.io/faq/env_var.html)
+* [How do I set MXNet's environmental variables?](env_var.html)
 
 ## Questions about Using MXNet
 If you need help with using MXNet, have questions about applying it to a particular kind of problem, or have a discussion topic, please use our [forum](https://discuss.mxnet.io).
 
 ## Issue Tracker
-We track bugs and new feature requests in the MXNet Github repo in the issues folder: [mxnet/issues](/~https://github.com/dmlc/mxnet/issues).
+We track bugs and new feature requests in the MXNet Github repo in the issues folder: [mxnet/issues](/~https://github.com/apache/incubator-mxnet/issues).
 
 ## Roadmap
-MXNet is evolving fast. To see what's next and what we are working on internally, go to the [MXNet Roadmap](/~https://github.com/dmlc/mxnet/labels/Roadmap).
+MXNet is evolving fast. To see what's next and what we are working on internally, go to the [MXNet Roadmap](https://cwiki.apache.org/confluence/display/MXNET/MXNet+Roadmap).
diff --git a/docs/install/index.md b/docs/install/index.md
index 6491d46be5c4..319e72a32c39 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -697,13 +697,13 @@ To run MXNet you also should have OpenCV and OpenBLAS installed. You may install
 
 ```bash
 brew install opencv
-brew install openblas@0.3.1
+brew install openblas
 ```
 
-Add a soft link to the OpenBLAS installation. This example links the 0.3.1 version:
+To ensure MXNet R package runs with the version of OpenBLAS installed, create a symbolic link as follows:
 
 ```bash
-ln -sf /usr/local/opt/openblas/lib/libopenblasp-r0.3.* /usr/local/opt/openblas/lib/libopenblasp-r0.3.1.dylib
+ln -sf /usr/local/opt/openblas/lib/libopenblas.dylib /usr/local/opt/openblas/lib/libopenblasp-r0.3.1.dylib
 ```
 
 Install the latest version (3.5.1+) of R from [CRAN](https://cran.r-project.org/bin/macosx/).
diff --git a/docs/install/osx_setup.md b/docs/install/osx_setup.md
index a2b59fe03618..7d90d3d456f6 100644
--- a/docs/install/osx_setup.md
+++ b/docs/install/osx_setup.md
@@ -83,7 +83,7 @@ After you have installed the dependencies, pull the MXNet source code from Git a
 The file called ```osx.mk``` has the configuration required for building MXNet on OS X. First copy ```make/osx.mk``` into ```config.mk```, which is used by the ```make``` command:
 
 ```bash
-    git clone --recursive /~https://github.com/dmlc/mxnet ~/mxnet
+    git clone --recursive /~https://github.com/apache/incubator-mxnet ~/mxnet
     cd ~/mxnet
     cp make/osx.mk ./config.mk
     echo "USE_BLAS = openblas" >> ./config.mk
@@ -96,7 +96,7 @@ The file called ```osx.mk``` has the configuration required for building MXNet o
 To build with MKLDNN
 
 ```bash
-echo "CC=$(brew --prefix llvm)/bin/clang++" >> ./config.mk
+echo "CC=$(brew --prefix llvm)/bin/clang" >> ./config.mk
 echo "CXX=$(brew --prefix llvm)/bin/clang++" >> ./config.mk
 echo "USE_OPENCV=1" >> ./config.mk
 echo "USE_OPENMP=1" >> ./config.mk
diff --git a/docs/mxdoc.py b/docs/mxdoc.py
index 7be4b90e799b..52f050e0ad2f 100644
--- a/docs/mxdoc.py
+++ b/docs/mxdoc.py
@@ -87,10 +87,10 @@ def generate_doxygen(app):
 def build_mxnet(app):
     """Build mxnet .so lib"""
     if not os.path.exists(os.path.join(app.builder.srcdir, '..', 'config.mk')):
-        _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) DEBUG=1" %
+        _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) DEBUG=1 USE_MKLDNN=0" %
                 app.builder.srcdir)
     else:
-        _run_cmd("cd %s/.. && make -j$(nproc) DEBUG=1" %
+        _run_cmd("cd %s/.. && make -j$(nproc) DEBUG=1 USE_MKLDNN=0" %
                 app.builder.srcdir)
 
 def build_r_docs(app):
@@ -113,12 +113,12 @@ def build_scala_docs(app):
     scala_doc_sources = 'find . -type f -name "*.scala" | egrep \"\.\/core|\.\/infer\" | egrep -v \"\/javaapi\"  | egrep -v \"Suite\"'
     scala_doc_classpath = ':'.join([
         '`find native -name "*.jar" | grep "target/lib/" | tr "\\n" ":" `',
-        '`find macros -name "*-SNAPSHOT.jar" | tr "\\n" ":" `',
-        '`find core -name "*-SNAPSHOT.jar" | tr "\\n" ":" `',
-        '`find infer -name "*-SNAPSHOT.jar" | tr "\\n" ":" `'
+        '`find macros -name "*.jar" | tr "\\n" ":" `',
+        '`find core -name "*.jar" | tr "\\n" ":" `',
+        '`find infer -name "*.jar" | tr "\\n" ":" `'
     ])
     # There are unresolvable errors on mxnet 1.2.x. We are ignoring those errors while aborting the ci on newer versions
-    scala_ignore_errors = '; exit 0' if '1.2.' in _BUILD_VER else ''
+    scala_ignore_errors = '; exit 0' if '1.2.' or '1.3.' in _BUILD_VER else ''
     _run_cmd('cd {}; scaladoc `{}` -classpath {} -feature -deprecation {}'
              .format(scala_path, scala_doc_sources, scala_doc_classpath, scala_ignore_errors))
     dest_path = app.builder.outdir + '/api/scala/docs'
@@ -135,9 +135,9 @@ def build_java_docs(app):
     java_doc_sources = 'find . -type f -name "*.scala" | egrep \"\.\/core|\.\/infer\" | egrep \"\/javaapi\" | egrep -v \"Suite\"'
     java_doc_classpath = ':'.join([
         '`find native -name "*.jar" | grep "target/lib/" | tr "\\n" ":" `',
-        '`find macros -name "*-SNAPSHOT.jar" | tr "\\n" ":" `',
-        '`find core -name "*-SNAPSHOT.jar" | tr "\\n" ":" `',
-        '`find infer -name "*-SNAPSHOT.jar" | tr "\\n" ":" `'
+        '`find macros -name "*.jar" | tr "\\n" ":" `',
+        '`find core -name "*.jar" | tr "\\n" ":" `',
+        '`find infer -name "*.jar" | tr "\\n" ":" `'
     ])
     _run_cmd('cd {}; scaladoc `{}` -classpath {} -feature -deprecation'
              .format(java_path, java_doc_sources, java_doc_classpath))
diff --git a/docs/tutorials/basic/reshape_transpose.md b/docs/tutorials/basic/reshape_transpose.md
new file mode 100644
index 000000000000..999b22ca2f7e
--- /dev/null
+++ b/docs/tutorials/basic/reshape_transpose.md
@@ -0,0 +1,197 @@
+## Difference between reshape and transpose operators
+
+What does it mean if MXNet gives you an error like this?
+```
+Check failed: shape_.Size() == shape.Size() (127872 vs. 25088) NDArray.Reshape: target shape must have the same size as current shape when recording with autograd.
+```
+This error message tells you that the data being passed to your model or between layers in the model is not in the correct format. Modifying the shape of tensors is a very common operation in Deep Learning.
+For instance, when using pretrained neural networks it is often necessary to adjust the input data dimensions to correspond to what the network has been trained on, e.g. tensors of shape `[batch_size, channels, width, height]`.  This notebook discusses briefly the difference between the operators [Reshape](http://mxnet.incubator.apache.org/test/api/python/ndarray.html#mxnet.ndarray.NDArray.reshape) and [Transpose](http://mxnet.incubator.apache.org/test/api/python/ndarray.html#mxnet.ndarray.transpose). Both allow you to change the shape, however they are not the same and are commonly mistaken.
+
+```python
+import matplotlib.pyplot as plt
+import matplotlib.image as mpimg
+import mxnet as mx
+from mxnet import gluon
+import numpy as np
+```
+
+
+```python
+img_array = mpimg.imread('https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/cat.png')
+plt.imshow(img_array)
+plt.axis("off")
+print (img_array.shape)
+```
+
+(157, 210, 3) <!--notebook-skip-line-->
+
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/cat.png) <!--notebook-skip-line-->
+
+
+The color image has the following properties:
+* width: 210 pixels
+* height: 157 pixels
+* colors: 3 (RGB)
+
+Now let's reshape the image in order to exchange width and height dimensions.
+
+
+```python
+reshaped = img_array.reshape((210,157,3))
+print (reshaped.shape)
+plt.imshow(reshaped)
+plt.axis("off")
+```
+(210,157,3)<!--notebook-skip-line-->
+
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/reshaped_image.png) <!--notebook-skip-line-->
+
+
+As we can see, the first and second dimensions have changed. However, the image can't be identified as a cat any longer. In order to understand what happened, let's have a look at the image below.
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/reshape.png" style="width:700px;height:300px;">
+
+While the number of rows and columns changed, the layout of the underlying data did not. The pixel values that have been in one row are still in one row. This means for instance that pixel 10 in the upper right corner ends up in the middle of the image instead of the lower left corner. Consequently contextual information gets lost, because the relative position of pixel values is not the same anymore. As one can imagine, a neural network would not be able to classify such an image as a cat.
+
+`Transpose` instead changes the layout of the underlying data.
+
+
+```python
+transposed = img_array.transpose((1,0,2))
+plt.imshow(transposed)
+plt.axis("off")
+```
+
+![png](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/transposed_image.png) <!--notebook-skip-line-->
+
+
+As we can see width and height changed, by rotating pixel values by 90 degrees. Transpose does the following:
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/basic/transpose_reshape/transpose.png" style="width:700px;height:300px;">
+
+As shown in the diagram, the axes have been flipped: pixel values that were in the first row are now in the first column.
+## When to transpose/reshape with MXNet
+In this chapter we discuss when transpose and reshape are used in MXNet.
+#### Channel first for images
+Images are usually stored in the format height, width, channel. When working with [convolutional](https://mxnet.incubator.apache.org/api/python/gluon/nn.html#mxnet.gluon.nn.Conv1D) layers, MXNet expects the layout to be `NCHW` (batch, channel, height, width). This is in contrast to Tensorflow, where image tensors are in the form `NHWC`. MXNet uses `NCHW` layout because of performance reasons on the GPU. When preprocessing the input images, you may have a function like the following:
+```python
+def transform(data, label): 
+     return data.astype(np.float32).transpose((2,0,1))/255.0, np.float32(label)
+```
+Images may also be stored as 1 dimensional vector for example in byte packed datasets. For instance, instead of `[28,28,1]` you may have `[784,1]`. In this situation you need to perform a reshape e.g. `ndarray.reshape((1,28,28))`
+
+
+#### TNC layout for RNN
+When working with [LSTM](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.LSTM) or [GRU](https://mxnet.incubator.apache.org/api/python/gluon/rnn.html#mxnet.gluon.rnn.GRU) layers, the default layout for input and output tensors has to be `TNC` (sequence length, batch size, and feature dimensions). For instance in the following network the input goes into a 1 dimensional convolution layer whose output goes into a GRU cell. Here the tensors would mismatch, because `Conv1D` takes data as `NCT`, but GRU  expects it to be `NTC`. To ensure that the forward pass does not crash, we need to do a tensor transpose. We can do this by defining a ```HybridLambda```.
+```python
+network = gluon.nn.HybridSequential()
+with network.name_scope():
+       network.add(gluon.nn.Conv1D(196, kernel_size=2, strides=1))
+       network.add(gluon.nn.HybridLambda(lambda F, x: F.transpose(x, (0, 2, 1))))
+       network.add(gluon.rnn.GRU(128))
+
+network.hybridize()
+network.initialize(mx.init.Xavier(), ctx=mx.cpu())
+a = mx.random.uniform(shape=(1,100,1000))
+network(a)
+output = network(a)
+print (output.shape)
+```
+(1, 999, 128) <!--notebook-skip-line-->
+#### Advanced reshaping with MXNet ndarrays
+It is sometimes useful to automatically infer the shape of tensors. Especially when you deal with very deep neural networks, it may not always be clear what the shape of a tensor is after a specific layer. For instance you may want the tensor to be two-dimensional where one dimension is the known batch_size. With ```input_data.reshape(batch_size, -1)``` the second dimension will be automatically inferred. Here is a simplified example:
+```python
+batch_size = 100
+input_data = mx.random.uniform(shape=(batch_size, 20,100))
+reshaped = input_data.reshape(batch_size, -1)
+print (input_data.shape, reshaped.shape) 
+```
+(100L, 20L, 100L), (100L, 2000L) <!--notebook-skip-line-->
+
+The reshape function of [MXNet's NDArray API](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html?highlight=reshape#mxnet.ndarray.NDArray.reshape) allows even more advanced transformations. For instance, 0 copies the dimension from the input to the output shape, and -2 copies all/remainder of the input dimensions to the output shape. With -3 reshape uses the product of two consecutive dimensions of the input shape as the output dim.  With -4 reshape splits one dimension of the input into two dimensions passed subsequent to -4. Here is an example:
+```python
+x = mx.nd.random.uniform(shape=(1, 3, 4, 64, 64))
+
+```
+Assume ```x```  with the shape ```[batch_size, channel, upscale, width, height]``` is the output of a model for image superresolution. Now we want to apply the upscale on width and height, to increase the 64x64 to a 128x128 image.
+To do so, we can use advanced reshaping, where we have to split the third dimension (upscale) and multiply it with width and height. We can do 
+```python
+x = x.reshape(1, 3, -4, 2, 2, 0, 0)
+print (x.shape)
+```
+
+(1L, 3L, 2L, 2L, 64L, 64L) <!--notebook-skip-line-->
+
+This splits up the third dimension into ```[2,2]```, so (1L, 3L, **4L** , 64L, 64L) becomes (1L, 3L, **2L** , **2L** , 64L, 64L). The other dimensions remain unchanged. In order to multiply the new dimensions with width and height, we can do a transpose and then use reshape with -3.
+```python
+x = x.transpose((0, 1, 4, 2, 5, 3))
+print (x.shape)
+x = x.reshape(0, 0, -3, -3)
+print (x.shape)
+```
+
+(1L, 3L, 64L, 2L, 64L, 2L) <!--notebook-skip-line-->
+
+(1L, 3L, 128L, 128L) <!--notebook-skip-line-->
+
+Reshape -3 will calculate the product of the current and the subsequent dimension. So (1L, 3L, **64L** , **2L** , ***64L, 2L*** ) becomes (1L, 3L, **128L** , ***128L*** )
+
+#### Most Common Pitfalls 
+In this section we want to show some of the most common pitfalls that happen when your input data is not correctly shaped.
+
+##### Forward Pass
+
+You execute the forward pass and get an error message followed by a very long stacktrace, for instance:
+
+```
+*** Error in `python': free(): invalid pointer: 0x00007fde5405a918 ***
+======= Backtrace: =========
+/lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7fdf475927e5]
+/lib/x86_64-linux-gnu/libc.so.6(+0x8037a)[0x7fdf4759b37a]
+/lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7fdf4759f53c]
+/home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorReshape+0x1852)[0x7fdecef2c6e2]
+/home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c)[0x7fdf46332ec0]
+/home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d)[0x7fdf4633287d]
+/home/ubuntu/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce)[0x7fdf46547e2e]
+/home/ubuntu/anaconda3/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12865)[0x7fdf46548865]
+python(_PyObject_FastCallDict+0x8b)[0x56457eba2d7b]
+python(+0x19e7ce)[0x56457ec327ce]
+python(_PyEval_EvalFrameDefault+0x2fa)[0x56457ec54cba]
+python(+0x197dae)[0x56457ec2bdae]
+[...]
+
+```
+This happens when your data does not have the shape ```[batch_size, channel, width, height]```, e.g. your data may be a one-dimensional vector or the color channel may be the last dimension instead of the second one.
+
+##### Backward Pass
+In other cases the forward pass may not fail, but the shape of the network output is not as expected. For instance in our previous RNN example, nothing prevents us from skipping the transpose.
+
+```python
+network = gluon.nn.HybridSequential()
+with network.name_scope():
+       network.add(gluon.nn.Conv1D(196, kernel_size=2, strides=1))
+       #network.add(gluon.nn.HybridLambda(lambda F, x: F.transpose(x, (0, 2, 1))))
+       network.add(gluon.rnn.GRU(128))
+
+network.hybridize()
+network.initialize(mx.init.Xavier(), ctx=mx.cpu())
+a = mx.random.uniform(shape=(1,100,1000))
+output = network(a)
+print (output.shape)
+```
+(1, 196, 128)  <!--notebook-skip-line-->
+
+Instead of ```(1, 999, 128)``` the shape is now ```(1, 196, 128)```. But during the training loop, calculating the loss would crash because of shape mismatch between labels and output. You may get an error like the following:
+```
+mxnet.base.MXNetError: [10:56:29] src/ndarray/ndarray.cc:229: Check failed: shape_.Size() == shape.Size() (127872 vs. 25088) NDArray.Reshape: target shape must have the same size as current shape when recording with autograd.
+
+Stack trace returned 6 entries:
+[bt] (0) 0   libmxnet.so                         0x00000001126c0b90 libmxnet.so + 15248
+[bt] (1) 1   libmxnet.so                         0x00000001126c093f libmxnet.so + 14655
+[bt] (2) 2   libmxnet.so                         0x0000000113cd236d MXNDListFree + 1407789
+[bt] (3) 3   libmxnet.so                         0x0000000113b345ca MXNDArrayReshape64 + 970
+[bt] (4) 4   libffi.6.dylib                      0x000000010b399884 ffi_call_unix64 + 76
+[bt] (5) 5   ???                                 0x00007fff54cadf50 0x0 + 140734615969616
+
+```
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/docs/tutorials/gluon/hybrid.md b/docs/tutorials/gluon/hybrid.md
index 6d64acdce275..f11622bd6fd1 100644
--- a/docs/tutorials/gluon/hybrid.md
+++ b/docs/tutorials/gluon/hybrid.md
@@ -1,6 +1,9 @@
 # Hybrid - Faster training and easy deployment
 
-*Note: a newer version is available [here](http://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html).*
+*Related Content:*
+* [Fast, portable neural networks with Gluon HybridBlocks](https://gluon.mxnet.io/chapter07_distributed-learning/hybridize.html)
+* [A Hybrid of Imperative and Symbolic Programming
+](http://en.diveintodeeplearning.org/chapter_computational-performance/hybridize.html)
 
 Deep learning frameworks can be roughly divided into two categories: declarative
 and imperative. With declarative frameworks (including Tensorflow, Theano, etc)
@@ -137,4 +140,106 @@ to gluon with `SymbolBlock`:
 net2 = gluon.SymbolBlock.imports('model-symbol.json', ['data'], 'model-0001.params')
 ```
 
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
+## Operators that do not work with hybridize
+
+If you want to hybridize your model, you must use `F.some_operator` in your `hybrid_forward` function.
+`F` will be `mxnet.nd` before you hybridize and `mxnet.sym` after you hybridize. While most APIs are the same in NDArray and Symbol, there are some differences, so writing `F.some_operator` and calling `hybridize` may not always work.
+Here we list some frequently used NDArray APIs that can't be hybridized and provide workarounds for them.
+
+### Element-wise Operators
+
+In the NDArray API, the following arithmetic and comparison operators automatically broadcast their inputs if the input NDArrays have different shapes.
+However, that's not the case in the Symbol API: inputs are not broadcast automatically, and you have to explicitly use a separate set of broadcast operators for Symbols that are expected to have different shapes.
+
+
+| NDArray APIs  | Description  |
+|---|---|
+| [*NDArray.\__add\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__add__) | x.\__add\__(y) <=> x+y <=> mx.nd.add(x, y)  |
+| [*NDArray.\__sub\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__sub__) | x.\__sub\__(y) <=> x-y <=> mx.nd.subtract(x, y)  |
+| [*NDArray.\__mul\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mul__) | x.\__mul\__(y) <=> x*y <=> mx.nd.multiply(x, y)  |
+| [*NDArray.\__div\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__div__) | x.\__div\__(y) <=> x/y <=> mx.nd.divide(x, y)  |
+| [*NDArray.\__mod\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mod__) | x.\__mod\__(y) <=> x%y <=> mx.nd.modulo(x, y)  |
+| [*NDArray.\__lt\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__lt__) |  x.\__lt\__(y) <=> x<y <=> mx.nd.lesser(x, y) |
+| [*NDArray.\__le\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__le__) |  x.\__le\__(y) <=> x<=y <=> mx.nd.lesser_equal(x, y) |
+| [*NDArray.\__gt\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__gt__) |  x.\__gt\__(y) <=> x>y <=> mx.nd.greater(x, y) |
+| [*NDArray.\__ge\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ge__) |  x.\__ge\__(y) <=> x>=y <=> mx.nd.greater_equal(x, y)|
+| [*NDArray.\__eq\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__eq__) |  x.\__eq\__(y) <=> x==y <=> mx.nd.equal(x, y) |
+| [*NDArray.\__ne\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ne__) |  x.\__ne\__(y) <=> x!=y <=> mx.nd.not_equal(x, y) |
+
+The current workaround is to use corresponding broadcast operators for arithmetic and comparison to avoid potential hybridization failure when input shapes are different.
+
+| Symbol APIs  | Description  |
+|---|---|
+|[*broadcast_add*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_add) | Returns element-wise sum of the input arrays with broadcasting. |
+|[*broadcast_sub*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_sub) | Returns element-wise difference of the input arrays with broadcasting. |
+|[*broadcast_mul*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mul) | Returns element-wise product of the input arrays with broadcasting. |
+|[*broadcast_div*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_div) | Returns element-wise division of the input arrays with broadcasting. |
+|[*broadcast_mod*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mod) | Returns element-wise modulo of the input arrays with broadcasting. |
+|[*broadcast_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_equal) | Returns the result of element-wise *equal to* (==) comparison operation with broadcasting. |
+|[*broadcast_not_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_not_equal) | Returns the result of element-wise *not equal to* (!=) comparison operation with broadcasting. |
+|[*broadcast_greater*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater) | Returns the result of element-wise *greater than* (>) comparison operation with broadcasting. |
+|[*broadcast_greater_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater_equal) | Returns the result of element-wise *greater than or equal to* (>=) comparison operation with broadcasting. |
+|[*broadcast_lesser*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser) |	Returns the result of element-wise *lesser than* (<) comparison operation with broadcasting. |
+|[*broadcast_lesser_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser_equal) | Returns the result of element-wise *lesser than or equal to* (<=) comparison operation with broadcasting. |
+
+For example, if you want to add an NDArray to your input x, use `broadcast_add` instead of `+`:
+
+```python
+def hybrid_forward(self, F, x):
+    # avoid writing: return x + F.ones((1, 1))
+    return F.broadcast_add(x, F.ones((1, 1)))
+```
+
+If you used `+`, it would still work before hybridization, but it would throw a shape mismatch error after hybridization.
+
+### Shape
+
+Gluon's imperative interface is very flexible and allows you to print the shape of the NDArray. However, Symbol does not have shape attributes. As a result, you need to avoid printing shapes in `hybrid_forward`.
+Otherwise, you will get the following error:
+```bash
+AttributeError: 'Symbol' object has no attribute 'shape'
+```
+
+### Slice
+`[]` in NDArray is used to get a slice from the array. However, `[]` in Symbol is used to get an output from a grouped symbol.
+For example, you will get different results for the following method before and after hybridization.
+
+```python
+def hybrid_forward(self, F, x):
+    return x[0]
+```
+
+The current workaround is to explicitly call [`slice`](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.slice) or [`slice_axis`](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.slice_axis) operators in `hybrid_forward`.
+
+
+### Not implemented operators
+
+Some of the often used operators in NDArray are not implemented in Symbol, and will cause hybridization failure.
+
+#### NDArray.asnumpy
+Symbol does not support the `asnumpy` function. You need to avoid calling `asnumpy` in `hybrid_forward`.
+
+#### Array creation APIs
+
+`mx.nd.array()` is used a lot, but Symbol does not have the `array` API. The current workaround is to use `F.ones`, `F.zeros`, or `F.full`, which exist in both the NDArray and Symbol APIs.
+
+#### In-Place Arithmetic Operators
+
+In-place arithmetic operators may be used in Gluon imperative mode, however if you expect to hybridize, you should write these operations explicitly instead.
+For example, avoid writing `x += y` and use `x = x + y` instead; otherwise you will get a `NotImplementedError`. This applies to all the following operators:
+
+| NDArray in-place arithmetic operators | Description |
+|---|---|
+|[*NDArray.\__iadd\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__iadd__) | x.\__iadd\__(y) <=> x+=y |
+|[*NDArray.\__isub\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__isub__) | x.\__isub\__(y) <=> x-=y |
+|[*NDArray.\__imul\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imul__) | x.\__imul\__(y) <=> x*=y |
+|[*NDArray.\__idiv\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__idiv__) | x.\__idiv\__(y) <=> x/=y |
+|[*NDArray.\__imod\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imod__) | x.\__imod\__(y) <=> x%=y |
+
+
+
+## Summary
+
+The recommended practice is to take advantage of the flexibility of the imperative NDArray API during experimentation. Once you have finalized your model, make the necessary changes mentioned above so that you can call the `hybridize` function to improve performance.
+
+<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md
index 7d102bb88f89..9457a409a7e9 100644
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
@@ -78,6 +78,7 @@ Select API:&nbsp;
         * NDArray
             * [NDArray API](/tutorials/gluon/ndarray.html) ([Alternative](http://gluon.mxnet.io/chapter01_crashcourse/ndarray.html) <img src="https://upload.wikimedia.org/wikipedia/commons/6/6a/External_link_font_awesome.svg" alt="External link" height="15px" style="margin: 0px 0px 3px 3px;"/>)
             * [Advanced NDArray API](/tutorials/basic/ndarray.html)
+            * [Difference between reshape and transpose](/tutorials/basic/reshape_transpose.html)
             * [NDArray Indexing](https://mxnet.incubator.apache.org/tutorials/basic/ndarray_indexing.html)
             * Sparse NDArray
                 * [Sparse Gradient Updates (RowSparseNDArray)](/tutorials/sparse/row_sparse.html)
diff --git a/docs/tutorials/python/profiler.md b/docs/tutorials/python/profiler.md
index d99bb19ee029..7dcda10f11b8 100644
--- a/docs/tutorials/python/profiler.md
+++ b/docs/tutorials/python/profiler.md
@@ -94,10 +94,10 @@ Let's define a method that will run one training iteration given data and label.
 
 ```python
 # Use GPU if available
-try:
-    mx.test_utils.list_gpus(); ctx = mx.gpu()
-except:
-    ctx = mx.cpu()
+if mx.test_utils.list_gpus():
+    ctx = mx.gpu()
+else:
+    ctx = mx.cpu()
 
 # Initialize the parameters with random weights
 net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
diff --git a/example/bayesian-methods/README.md b/example/bayesian-methods/README.md
index ec9e8be86927..fc35b94219d7 100644
--- a/example/bayesian-methods/README.md
+++ b/example/bayesian-methods/README.md
@@ -11,3 +11,27 @@ and *Bayesian Dark Knowledge (BDK)* [<cite>(Balan, Rathod, Murphy and Welling, 2
 **bdk.ipynb** shows how to use MXNet to implement the DistilledSGLD algorithm in Bayesian Dark Knowledge.
 
 **bdk_demo.py** contains scripts (more than the notebook) related to Bayesian Dark Knowledge. Use `python bdk_demo.py -d 1 -l 2 -t 50000` to run classification on MNIST. 
+
+Use the following command to view the available parameters.
+
+```shell
+python bdk_demo.py -h
+
+
+usage: bdk_demo.py [-h] [-d DATASET] [-l ALGORITHM] [-t TRAINING] [--gpu GPU]
+
+Examples in the paper [NIPS2015]Bayesian Dark Knowledge and [ICML2011]Bayesian
+Learning via Stochastic Gradient Langevin Dynamics
+
+optional arguments:
+  -h, --help            show this help message and exit
+  -d DATASET, --dataset DATASET
+                        Dataset to use. 0 --> TOY, 1 --> MNIST, 2 -->
+                        Synthetic Data in the SGLD paper
+  -l ALGORITHM, --algorithm ALGORITHM
+                        Type of algorithm to use. 0 --> SGD, 1 --> SGLD,
+                        other-->DistilledSGLD
+  -t TRAINING, --training TRAINING
+                        Number of training samples
+  --gpu GPU             0 to use GPU, not set to use CPU
+```
diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py
index 145dac10e2a6..cd39bfd2a7c9 100644
--- a/example/bayesian-methods/bdk_demo.py
+++ b/example/bayesian-methods/bdk_demo.py
@@ -156,34 +156,34 @@ def get_toy_sym(teacher=True, teacher_noise_precision=None):
     return net
 
 
-def dev():
-    return mx.gpu()
+def dev(gpu_id=None):
+    return mx.gpu(gpu_id) if gpu_id is not None else mx.cpu()  # compare against None: GPU id 0 is valid but falsy
 
 
-def run_mnist_SGD(training_num=50000):
+def run_mnist_SGD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     net = get_mnist_sym()
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
-    exe, exe_params, _ = SGD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
+    exe, exe_params, _ = SGD(sym=net, dev=dev(gpu_id), data_inputs=data_inputs, X=X, Y=Y,
                              X_test=X_test, Y_test=Y_test,
                              total_iter_num=1000000,
                              initializer=initializer,
                              lr=5E-6, prior_precision=1.0, minibatch_size=100)
 
 
-def run_mnist_SGLD(training_num=50000):
+def run_mnist_SGLD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     net = get_mnist_sym()
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
     initializer = mx.init.Xavier(factor_type="in", magnitude=2.34)
-    exe, sample_pool = SGLD(sym=net, dev=dev(), data_inputs=data_inputs, X=X, Y=Y,
+    exe, sample_pool = SGLD(sym=net, dev=dev(gpu_id), data_inputs=data_inputs, X=X, Y=Y,
                             X_test=X_test, Y_test=Y_test,
                             total_iter_num=1000000,
                             initializer=initializer,
@@ -191,7 +191,7 @@ def run_mnist_SGLD(training_num=50000):
                             thin_interval=100, burn_in_iter_num=1000)
 
 
-def run_mnist_DistilledSGLD(training_num=50000):
+def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None):
     X, Y, X_test, Y_test = load_mnist(training_num)
     minibatch_size = 100
     if training_num >= 10000:
@@ -214,10 +214,10 @@ def run_mnist_DistilledSGLD(training_num=50000):
     logsoftmax = LogSoftmax()
     student_net = get_mnist_sym(output_op=logsoftmax, num_hidden=num_hidden)
     data_shape = (minibatch_size,) + X.shape[1::]
-    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'softmax_label': nd.zeros((minibatch_size,), ctx=dev())}
-    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
+    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'softmax_label': nd.zeros((minibatch_size,), ctx=dev(gpu_id))}
+    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev(gpu_id))}
     teacher_initializer = BiasXavier(factor_type="in", magnitude=1)
     student_initializer = BiasXavier(factor_type="in", magnitude=1)
     student_exe, student_params, _ = \
@@ -231,17 +231,17 @@ def run_mnist_DistilledSGLD(training_num=50000):
                       teacher_learning_rate=teacher_learning_rate,
                       student_learning_rate=student_learning_rate,
                       teacher_prior_precision=teacher_prior, student_prior_precision=student_prior,
-                      perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev())
+                      perturb_deviation=perturb_deviation, minibatch_size=100, dev=dev(gpu_id))
 
 
-def run_toy_SGLD():
+def run_toy_SGLD(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = 1
     teacher_noise_precision = 1.0 / 9.0
     net = get_toy_sym(True, teacher_noise_precision)
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
     initializer = mx.init.Uniform(0.07)
     exe, params, _ = \
         SGLD(sym=net, data_inputs=data_inputs,
@@ -253,20 +253,20 @@ def run_toy_SGLD():
              burn_in_iter_num=1000,
              thin_interval=10,
              task='regression',
-             minibatch_size=minibatch_size, dev=dev())
+             minibatch_size=minibatch_size, dev=dev(gpu_id))
 
 
-def run_toy_DistilledSGLD():
+def run_toy_DistilledSGLD(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = 1
     teacher_noise_precision = 1.0
     teacher_net = get_toy_sym(True, teacher_noise_precision)
     student_net = get_toy_sym(False)
     data_shape = (minibatch_size,) + X.shape[1::]
-    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
-    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev())}
-    #                   'softmax_label': nd.zeros((minibatch_size, 10), ctx=dev())}
+    teacher_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                           'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
+    student_data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id))}
+
     teacher_initializer = mx.init.Uniform(0.07)
     student_initializer = mx.init.Uniform(0.07)
     student_grad_f = lambda student_outputs, teacher_pred: \
@@ -284,21 +284,21 @@ def run_toy_DistilledSGLD():
                       student_grad_f=student_grad_f,
                       teacher_prior_precision=0.1, student_prior_precision=0.001,
                       perturb_deviation=0.1, minibatch_size=minibatch_size, task='regression',
-                      dev=dev())
+                      dev=dev(gpu_id))
 
 
-def run_toy_HMC():
+def run_toy_HMC(gpu_id=None):
     X, Y, X_test, Y_test = load_toy()
     minibatch_size = Y.shape[0]
     noise_precision = 1 / 9.0
     net = get_toy_sym(True, noise_precision)
     data_shape = (minibatch_size,) + X.shape[1::]
-    data_inputs = {'data': nd.zeros(data_shape, ctx=dev()),
-                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev())}
+    data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)),
+                   'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))}
     initializer = mx.init.Uniform(0.07)
     sample_pool = HMC(net, data_inputs=data_inputs, X=X, Y=Y, X_test=X_test, Y_test=Y_test,
                       sample_num=300000, initializer=initializer, prior_precision=1.0,
-                      learning_rate=1E-3, L=10, dev=dev())
+                      learning_rate=1E-3, L=10, dev=dev(gpu_id))
 
 
 def run_synthetic_SGLD():
@@ -350,21 +350,22 @@ def run_synthetic_SGLD():
                         help="Type of algorithm to use. 0 --> SGD, 1 --> SGLD, other-->DistilledSGLD")
     parser.add_argument("-t", "--training", type=int, default=50000,
                         help="Number of training samples")
+    parser.add_argument("--gpu", type=int, help="0 to use GPU, not set to use CPU")
     args = parser.parse_args()
     training_num = args.training
     if args.dataset == 1:
         if 0 == args.algorithm:
-            run_mnist_SGD(training_num)
+            run_mnist_SGD(training_num, gpu_id=args.gpu)
         elif 1 == args.algorithm:
-            run_mnist_SGLD(training_num)
+            run_mnist_SGLD(training_num, gpu_id=args.gpu)
         else:
-            run_mnist_DistilledSGLD(training_num)
+            run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu)
     elif args.dataset == 0:
         if 1 == args.algorithm:
-            run_toy_SGLD()
+            run_toy_SGLD(gpu_id=args.gpu)
         elif 2 == args.algorithm:
-            run_toy_DistilledSGLD()
+            run_toy_DistilledSGLD(gpu_id=args.gpu)
         elif 3 == args.algorithm:
-            run_toy_HMC()
+            run_toy_HMC(gpu_id=args.gpu)
     else:
         run_synthetic_SGLD()
diff --git a/example/fcn-xs/README.md b/example/fcn-xs/README.md
index 145aa31cb700..49c57fc08eaf 100644
--- a/example/fcn-xs/README.md
+++ b/example/fcn-xs/README.md
@@ -40,14 +40,33 @@ this is the fully convolution style of the origin
 Once you completed all these steps, your working directory should contain a ```.\VOC2012``` directory, which contains the following: ```JPEGImages folder```, ```SegmentationClass folder```, ```train.lst```, ```val.lst```
 
 #### Step 3: Train the fcn-xs model
-* Based on your hardware, configure GPU or CPU for training in `fcn_xs.py`. It is recommended to use GPU due to the computational complexity and data load.
-```python
-# ctx = mx.cpu(0)
-ctx = mx.gpu(0)
+* Based on your hardware, select CPU or GPU for training with the ```--gpu``` parameter. It is recommended to use GPU due to the computational complexity and data load.
+Use the following command to view the available parameters.
+```shell
+python fcn_xs.py -h
+
+
+usage: fcn_xs.py [-h] [--model MODEL] [--prefix PREFIX] [--epoch EPOCH]
+                 [--init-type INIT_TYPE] [--retrain] [--gpu GPU]
+
+Convert vgg16 model to vgg16fc model.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --model MODEL         The type of fcn-xs model, e.g. fcnxs, fcn16s, fcn8s.
+  --prefix PREFIX       The prefix(include path) of vgg16 model with mxnet
+                        format.
+  --epoch EPOCH         The epoch number of vgg16 model.
+  --init-type INIT_TYPE
+                        the init type of fcn-xs model, e.g. vgg16, fcnxs
+  --retrain             true means continue training.
+  --gpu GPU             0 to use GPU, not set to use CPU
 ```
+
 * It is recommended to train fcn-32s and fcn-16s before training the fcn-8s model
 
 To train the fcn-32s model, run the following:
+
 ```shell
 python -u fcn_xs.py --model=fcn32s --prefix=VGG_FC_ILSVRC_16_layers --epoch=74 --init-type=vgg16
 ```
diff --git a/example/fcn-xs/fcn_xs.py b/example/fcn-xs/fcn_xs.py
index 53244a1759c3..5b799f32e46e 100644
--- a/example/fcn-xs/fcn_xs.py
+++ b/example/fcn-xs/fcn_xs.py
@@ -28,9 +28,10 @@
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
-ctx = mx.gpu(0)
+
 
 def main():
+    ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)  # `is None`, not truthiness: --gpu 0 must select GPU 0
     fcnxs = symbol_fcnxs.get_fcn32s_symbol(numclass=21, workspace_default=1536)
     fcnxs_model_prefix = "model_pascal/FCN32s_VGG16"
     if args.model == "fcn16s":
@@ -85,6 +86,7 @@ def main():
         help='the init type of fcn-xs model, e.g. vgg16, fcnxs')
     parser.add_argument('--retrain', action='store_true', default=False,
         help='true means continue training.')
+    parser.add_argument("--gpu", type=int, help="0 to use GPU, not set to use CPU")
     args = parser.parse_args()
     logging.info(args)
     main()
diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
index c38019fbe7b9..938890bb75df 100644
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ b/example/quantization/imagenet_gen_qsym_mkldnn.py
@@ -273,7 +273,9 @@ def save_params(fname, arg_params, aux_params, logger=None):
     logger.info('rgb_std = %s' % rgb_std)
     rgb_std = [float(i) for i in rgb_std.split(',')]
     std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}    
-
+    combine_mean_std = {}
+    combine_mean_std.update(mean_args)
+    combine_mean_std.update(std_args)
     if calib_mode == 'none':
         logger.info('Quantizing FP32 model %s' % args.model)
         qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
@@ -294,8 +296,7 @@ def save_params(fname, arg_params, aux_params, logger=None):
                                      shuffle=args.shuffle_dataset,
                                      shuffle_chunk_seed=args.shuffle_chunk_seed,
                                      seed=args.shuffle_seed,
-                                     **mean_args,
-                                     **std_args)
+                                     **combine_mean_std)
 
         qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
                                                         ctx=ctx, excluded_sym_names=excluded_sym_names,
diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py
index 3fdb52f40cb2..0725165b0ca5 100644
--- a/example/quantization/imagenet_inference.py
+++ b/example/quantization/imagenet_inference.py
@@ -181,6 +181,9 @@ def benchmark_score(symbol_file, ctx, batch_size, num_batches, logger=None):
     logger.info('rgb_std = %s' % rgb_std)
     rgb_std = [float(i) for i in rgb_std.split(',')]
     std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
+    combine_mean_std = {}
+    combine_mean_std.update(mean_args)
+    combine_mean_std.update(std_args)
 
     label_name = args.label_name
     logger.info('label_name = %s' % label_name)
@@ -206,8 +209,7 @@ def benchmark_score(symbol_file, ctx, batch_size, num_batches, logger=None):
                                     shuffle=True,
                                     shuffle_chunk_seed=3982304,
                                     seed=48564309,
-                                    **mean_args,
-                                    **std_args)
+                                    **combine_mean_std)
 
         # loading model
         sym, arg_params, aux_params = load_model(symbol_file, param_file, logger)
diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index b5284183d160..5e6127ccb08d 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -9,7 +9,7 @@ For a gluon imperative version, checkout /~https://github.com/dmlc/gluon-cv.
 
 ### Out-of-box inference models
 Download any of the following models to the current directory and run `python3 demo.py --dataset $Dataset$ --network $Network$ --params $MODEL_FILE$ --image $YOUR_IMAGE$` to get single image inference.
-For example `python3 demo.py --dataset voc --network vgg16 --params vgg16_voc0712.params --image myimage.jpg`, add `--gpu 0` to use GPU optionally.
+For example `python3 demo.py --dataset voc --network vgg16 --params vgg16_voc0712.params --image myimage.jpg`; add `--gpu 0` to run on GPU, or omit `--gpu` to run on CPU.
 Different network has different configuration. Different dataset has different object class names. You must pass them explicitly as command line arguments.
 
 | Network | Dataset | Imageset | Reference | Result | Link  |
diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py
index 2315bb8af366..b0a4ddbeab49 100644
--- a/example/rcnn/demo.py
+++ b/example/rcnn/demo.py
@@ -92,7 +92,7 @@ def parse_args():
     parser.add_argument('--params', type=str, default='', help='path to trained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--image', type=str, default='', help='path to test image')
-    parser.add_argument('--gpu', type=str, default='', help='gpu device eg. 0')
+    parser.add_argument('--gpu', type=str, default='', help='GPU devices, eg."0,1,2,3" , not set to use CPU.')
     parser.add_argument('--vis', action='store_true', help='display results')
     parser.add_argument('--vis-thresh', type=float, default=0.7, help='threshold display boxes')
     # faster rcnn params
diff --git a/example/rcnn/test.py b/example/rcnn/test.py
index 3c047d222016..e964c9080667 100644
--- a/example/rcnn/test.py
+++ b/example/rcnn/test.py
@@ -35,7 +35,7 @@ def test_net(sym, imdb, args):
     logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))
 
     # setup context
-    ctx = mx.gpu(args.gpu)
+    ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)  # `is None`, not truthiness: --gpu 0 must select GPU 0
 
     # load testing data
     test_data = TestLoader(imdb.roidb, batch_size=1, short=args.img_short_side, max_size=args.img_long_side,
@@ -94,7 +94,7 @@ def parse_args():
     parser.add_argument('--params', type=str, default='', help='path to trained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--imageset', type=str, default='', help='imageset splits')
-    parser.add_argument('--gpu', type=int, default=0, help='gpu device eg. 0')
+    parser.add_argument('--gpu', type=int, default=None, help='0 to use GPU, not set to use CPU')
     # faster rcnn params
     parser.add_argument('--img-short-side', type=int, default=600)
     parser.add_argument('--img-long-side', type=int, default=1000)
diff --git a/example/rcnn/train.py b/example/rcnn/train.py
index 0739069afb4a..7b1f2f7f31a5 100644
--- a/example/rcnn/train.py
+++ b/example/rcnn/train.py
@@ -33,7 +33,7 @@ def train_net(sym, roidb, args):
     logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))
 
     # setup multi-gpu
-    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
+    ctx = [mx.cpu()] if not args.gpus else [mx.gpu(int(i)) for i in args.gpus.split(',')]
     batch_size = args.rcnn_batch_size * len(ctx)
 
     # load training data
@@ -127,7 +127,7 @@ def parse_args():
     parser.add_argument('--pretrained', type=str, default='', help='path to pretrained model')
     parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
     parser.add_argument('--imageset', type=str, default='', help='imageset splits')
-    parser.add_argument('--gpus', type=str, default='0', help='gpu devices eg. 0,1')
+    parser.add_argument('--gpus', type=str, help='GPU devices, eg: "0,1,2,3" , not set to use CPU')
     parser.add_argument('--epochs', type=int, default=10, help='training epochs')
     parser.add_argument('--lr', type=float, default=0.001, help='base learning rate')
     parser.add_argument('--lr-decay-epoch', type=str, default='7', help='epoch to decay lr')
diff --git a/include/mxnet/c_api_error.h b/include/mxnet/c_api_error.h
new file mode 100644
index 000000000000..0c6ea03fa459
--- /dev/null
+++ b/include/mxnet/c_api_error.h
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file c_api_error.h
+ * \brief Error handling for C API.
+ */
+#ifndef MXNET_C_API_ERROR_H_
+#define MXNET_C_API_ERROR_H_
+
+#include <dmlc/logging.h>
+
+/*!
+ * \brief Macros to guard the beginning and end section of all C API functions.
+ * Every function starts with MX_API_BEGIN()
+ * and finishes with MX_API_END() or MX_API_END_HANDLE_ERROR().
+ * The Finalize argument contains the procedure to clean up state when an error happens.
+ */
+#define MX_API_BEGIN() try { on_enter_api(__FUNCTION__);
+#define MX_API_END() } catch(dmlc::Error &_except_) { on_exit_api(); return MXAPIHandleException(_except_); } on_exit_api(); return 0;  // NOLINT(*)
+#define MX_API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; on_exit_api(); return MXAPIHandleException(_except_); } on_exit_api(); return 0; // NOLINT(*)
+/*!
+ * \brief Set the last error message needed by C API
+ * \param msg The error message to set.
+ */
+void MXAPISetLastError(const char* msg);
+/*!
+ * \brief Handle an exception thrown inside a C API call.
+ * \param e the exception
+ * \return the return value of API after exception is handled
+ */
+inline int MXAPIHandleException(const dmlc::Error &e) {
+  MXAPISetLastError(e.what());
+  return -1;
+}
+
+namespace mxnet {
+extern void on_enter_api(const char *function);
+extern void on_exit_api();
+}
+#endif  // MXNET_C_API_ERROR_H_
diff --git a/julia/README.md b/julia/README.md
index a4299575f95e..2ff7553063f3 100644
--- a/julia/README.md
+++ b/julia/README.md
@@ -3,7 +3,7 @@
 [![MXNet](http://pkg.julialang.org/badges/MXNet_0.6.svg)](http://pkg.julialang.org/?pkg=MXNet)
 
 
-MXNet.jl is the [dmlc/mxnet](/~https://github.com/apache/incubator-mxnet) [Julia](http://julialang.org/) package. MXNet.jl brings flexible and efficient GPU computing and state-of-art deep learning to Julia. Some highlight of its features include:
+MXNet.jl is the [Apache MXNet](/~https://github.com/apache/incubator-mxnet) [Julia](http://julialang.org/) package. MXNet.jl brings flexible and efficient GPU computing and state-of-the-art deep learning to Julia. Some highlights of its features include:
 
 * Efficient tensor/matrix computation across multiple devices, including multiple CPUs, GPUs and distributed server nodes.
 * Flexible symbolic manipulation to composite and construction of state-of-the-art deep learning models.
diff --git a/make/config.mk b/make/config.mk
index 03f438db89d5..8a1aa2c165c4 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -98,8 +98,10 @@ USE_LIBJPEG_TURBO_PATH = NONE
 # use openmp for parallelization
 USE_OPENMP = 1
 
-# whether use MKL-DNN library
-USE_MKLDNN = 0
+# whether use MKL-DNN library: 0 = disabled, 1 = enabled
+# if USE_MKLDNN is not defined, MKL-DNN will be enabled by default on x86 Linux.
+# you can disable it explicitly with USE_MKLDNN = 0
+USE_MKLDNN =
 
 # whether use NNPACK library
 USE_NNPACK = 0
diff --git a/make/config/libmxnet.sym b/make/config/libmxnet.sym
new file mode 100644
index 000000000000..0ddf63fca433
--- /dev/null
+++ b/make/config/libmxnet.sym
@@ -0,0 +1,15 @@
+MX*
+NN*
+_MX*
+_NN*
+mx*
+nn*
+_mx*
+_nn*
+Java_org_apache_mxnet*
+*NDArray*
+*Engine*Get*
+*Storage*Get*
+*on_enter_api*
+*on_exit_api*
+*MXAPISetLastError*
diff --git a/make/config/libmxnet.ver b/make/config/libmxnet.ver
new file mode 100644
index 000000000000..560549c29e58
--- /dev/null
+++ b/make/config/libmxnet.ver
@@ -0,0 +1,19 @@
+{
+    global:
+        NN*;
+        MX*;
+        _NN*;
+        _MX*;
+        nn*;
+        mx*;
+        _nn*;
+        _mx*;
+        Java_org_apache_mxnet*;
+        *NDArray*;
+        *Engine*Get*;
+        *Storage*Get*;
+        *on_enter_api*;
+        *on_exit_api*;
+        *MXAPISetLastError*;
+    local: *;
+};
diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk
index a1468f4496d3..171f846d20dd 100644
--- a/make/crosscompile.jetson.mk
+++ b/make/crosscompile.jetson.mk
@@ -57,10 +57,10 @@ DEBUG = 0
 USE_SIGNAL_HANDLER = 1
 
 # the additional link flags you want to add
-ADD_LDFLAGS = -L${CROSS_ROOT}/lib
+ADD_LDFLAGS = -L${CROSS_ROOT}/lib -L/usr/lib/aarch64-linux-gnu/
 
 # the additional compile flags you want to add
-ADD_CFLAGS = -I${CROSS_ROOT}/include
+ADD_CFLAGS = -I${CROSS_ROOT}/include -I/usr/include/aarch64-linux-gnu/
 
 #---------------------------------------------
 # matrix computation libraries for CPU/GPU
diff --git a/make/maven/maven_darwin_cpu.mk b/make/maven/maven_darwin_cpu.mk
new file mode 100644
index 000000000000..b8bd97fbc8ed
--- /dev/null
+++ b/make/maven/maven_darwin_cpu.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 0
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -lz -framework CoreFoundation -framework Security  -Wl,-exported_symbols_list,$(CURDIR)/make/config/libmxnet.sym,-rpath,'$${ORIGIN}',-dead_strip
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=apple
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 0
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/maven/maven_linux_cpu.mk b/make/maven/maven_linux_cpu.mk
new file mode 100644
index 000000000000..07177fbe3064
--- /dev/null
+++ b/make/maven/maven_linux_cpu.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 0
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -lgfortran -ldl -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/maven/maven_linux_cu90.mk b/make/maven/maven_linux_cu90.mk
new file mode 100644
index 000000000000..097d252f0e4e
--- /dev/null
+++ b/make/maven/maven_linux_cu90.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 0
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_darwin_cpu.mk b/make/pip/pip_darwin_cpu.mk
new file mode 100644
index 000000000000..bf240ad128cd
--- /dev/null
+++ b/make/pip/pip_darwin_cpu.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -lz -framework CoreFoundation -framework Security  -Wl,-exported_symbols_list,$(CURDIR)/make/config/libmxnet.sym,-rpath,'$${ORIGIN}',-dead_strip
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=apple
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 0
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_darwin_mkl.mk b/make/pip/pip_darwin_mkl.mk
new file mode 100644
index 000000000000..30b328c6420e
--- /dev/null
+++ b/make/pip/pip_darwin_mkl.mk
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -lz -framework CoreFoundation -framework Security  -Wl,-exported_symbols_list,$(CURDIR)/make/config/libmxnet.sym,-rpath,'$${ORIGIN}',-dead_strip
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=apple
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 0
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cpu.mk b/make/pip/pip_linux_cpu.mk
new file mode 100644
index 000000000000..22bb32b4d1b8
--- /dev/null
+++ b/make/pip/pip_linux_cpu.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -lgfortran -ldl -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu100.mk b/make/pip/pip_linux_cu100.mk
new file mode 100644
index 000000000000..514603b94ab6
--- /dev/null
+++ b/make/pip/pip_linux_cu100.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-10.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu100mkl.mk b/make/pip/pip_linux_cu100mkl.mk
new file mode 100644
index 000000000000..da4127f0dd07
--- /dev/null
+++ b/make/pip/pip_linux_cu100mkl.mk
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-10.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu75.mk b/make/pip/pip_linux_cu75.mk
new file mode 100644
index 000000000000..02468aa9613a
--- /dev/null
+++ b/make/pip/pip_linux_cu75.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-7.5
+
+# whether use CuDNN R3 library
+USE_CUDNN = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu75mkl.mk b/make/pip/pip_linux_cu75mkl.mk
new file mode 100644
index 000000000000..6895383d4fec
--- /dev/null
+++ b/make/pip/pip_linux_cu75mkl.mk
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-7.5
+
+# whether use CuDNN R3 library
+USE_CUDNN = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu80.mk b/make/pip/pip_linux_cu80.mk
new file mode 100644
index 000000000000..2a2b88de73a1
--- /dev/null
+++ b/make/pip/pip_linux_cu80.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-8.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Disable SSE on Power and ARM hosts (machine type is taken from `uname -m`)
+#----------------------------
+ARCH := $(shell uname -m)
+ifneq (,$(filter armv6l armv7l powerpc64le ppc64le aarch64,$(ARCH)))
+USE_SSE=0
+else
+USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu80mkl.mk b/make/pip/pip_linux_cu80mkl.mk
new file mode 100644
index 000000000000..40cef4ed2110
--- /dev/null
+++ b/make/pip/pip_linux_cu80mkl.mk
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-8.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu90.mk b/make/pip/pip_linux_cu90.mk
new file mode 100644
index 000000000000..88e55180b738
--- /dev/null
+++ b/make/pip/pip_linux_cu90.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu90mkl.mk b/make/pip/pip_linux_cu90mkl.mk
new file mode 100644
index 000000000000..9c4adf07bb6a
--- /dev/null
+++ b/make/pip/pip_linux_cu90mkl.mk
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu91.mk b/make/pip/pip_linux_cu91.mk
new file mode 100644
index 000000000000..ea62510396ef
--- /dev/null
+++ b/make/pip/pip_linux_cu91.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.1
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu91mkl.mk b/make/pip/pip_linux_cu91mkl.mk
new file mode 100644
index 000000000000..232ec47979cc
--- /dev/null
+++ b/make/pip/pip_linux_cu91mkl.mk
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.1
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu92.mk b/make/pip/pip_linux_cu92.mk
new file mode 100644
index 000000000000..01d02cfefba9
--- /dev/null
+++ b/make/pip/pip_linux_cu92.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler support
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# (the stock default is atlas on linux and apple on osx; this pip build uses openblas)
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it; however, you will not be able to use the
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already add them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.2
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 0
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, need to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# When USE_BLAS is mkl, choose static linking automatically to allow the python wrapper
+ifneq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = NONE
+else
+USE_STATIC_MKL = 1
+endif
+
+#----------------------------
+# Turn SSE off when a word of `uname -a` names an ARM or POWER arch, on otherwise
+#----------------------------
+ARCH := $(shell uname -a)
+ifeq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=1
+else
+	USE_SSE=0
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine supporting
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_cu92mkl.mk b/make/pip/pip_linux_cu92mkl.mk
new file mode 100644
index 000000000000..1abde3d71d7f
--- /dev/null
+++ b/make/pip/pip_linux_cu92mkl.mk
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already added them to environment variables, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.2
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/pip/pip_linux_mkl.mk b/make/pip/pip_linux_mkl.mk
new file mode 100644
index 000000000000..96b88850c21a
--- /dev/null
+++ b/make/pip/pip_linux_mkl.mk
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making python wheel
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether to compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -lgfortran -ldl  -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# in default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already added them to environment variables, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If use MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+	USE_SSE=0
+else
+	USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not allow to read and write HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not allow to read and write AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required, it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing projects specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires build sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/mkldnn.mk b/mkldnn.mk
index 5af3e9b1d741..d79bbe7d2a0e 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -19,20 +19,14 @@ ifeq ($(USE_MKLDNN), 1)
 	MKLDNN_SUBMODDIR = $(ROOTDIR)/3rdparty/mkldnn
 	MKLDNN_BUILDDIR = $(MKLDNN_SUBMODDIR)/build
 	MXNET_LIBDIR = $(ROOTDIR)/lib
-	MKLDNN_LIBRARY_TYPE=STATIC
 ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
-else ifeq ($(UNAME_S), Windows)
-	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
-	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so
-	MKLDNN_LIBRARY_TYPE=SHARED
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
-	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.a
+	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
 endif
 endif
 
@@ -43,7 +37,7 @@ mkldnn_build: $(MKLDNN_LIBFILE)
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
 	cd $(MKLDNN_SUBMODDIR) && rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. && cp -a external/*/* $(MKLDNNROOT)/.
-	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF -DMKLDNN_LIBRARY_TYPE=$(MKLDNN_LIBRARY_TYPE)
+	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
 	mkdir -p $(MXNET_LIBDIR)
diff --git a/perl-package/AI-MXNet-Gluon-Contrib/META.yml b/perl-package/AI-MXNet-Gluon-Contrib/META.yml
index d175c2bd1413..f56b10c939b1 100644
--- a/perl-package/AI-MXNet-Gluon-Contrib/META.yml
+++ b/perl-package/AI-MXNet-Gluon-Contrib/META.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ---
 abstract: 'Perl interface to MXNet Gluon Contrib'
 author:
diff --git a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml b/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml
index d6d9652a6dd5..35c93845d367 100644
--- a/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml
+++ b/perl-package/AI-MXNet-Gluon-ModelZoo/META.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ---
 abstract: 'Perl interface to MXNet Gluon ModelZoo'
 author:
diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml
index e604b7cd0da8..26e37b572600 100644
--- a/perl-package/AI-MXNet/META.yml
+++ b/perl-package/AI-MXNet/META.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ---
 abstract: 'Perl interface to MXNet machine learning library'
 author:
diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml
index eb5d9aae8018..d870f05fbe52 100644
--- a/perl-package/AI-MXNetCAPI/META.yml
+++ b/perl-package/AI-MXNetCAPI/META.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ---
 abstract: 'Swig interface to mxnet c api'
 author:
diff --git a/perl-package/AI-NNVMCAPI/META.yml b/perl-package/AI-NNVMCAPI/META.yml
index e462637eabc1..40c49e431e0b 100644
--- a/perl-package/AI-NNVMCAPI/META.yml
+++ b/perl-package/AI-NNVMCAPI/META.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 ---
 abstract: 'Swig interface to nnvm c api'
 author:
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index e960829e691b..374a3b50bbb5 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -54,7 +54,6 @@
 from . import lr_scheduler
 # use mx.kv as short for kvstore
 from . import kvstore as kv
-from . import kvstore_server
 # Runtime compile module
 from . import rtc
 # Attribute scope to add attributes to symbolic graphs
@@ -82,3 +81,11 @@
 from . import gluon
 
 __version__ = base.__version__
+
+# Dist kvstore module which launches a separate process when role is set to "server".
+# This should be done after other modules are initialized.
+# Otherwise this may result in errors when unpickling custom LR scheduler/optimizers.
+# For example, the LRScheduler in gluoncv depends on a specific version of MXNet, and
+# checks the __version__ attr of MXNet, which is not set on kvstore server due to the
+# fact that kvstore-server module is imported before the __version__ attr is set.
+from . import kvstore_server
diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
index e1c1714445df..bfec794f6220 100644
--- a/python/mxnet/callback.py
+++ b/python/mxnet/callback.py
@@ -113,7 +113,7 @@ def _callback(param):
                 logging.info('Iter[%d] Batch[%d] Train-%s=%f',
                              param.epoch, param.nbatch, name, value)
             if auto_reset:
-                param.eval_metric.reset()
+                param.eval_metric.reset_local()
     return _callback
 
 
@@ -164,7 +164,7 @@ def __call__(self, param):
                 if param.eval_metric is not None:
                     name_value = param.eval_metric.get_name_value()
                     if self.auto_reset:
-                        param.eval_metric.reset()
+                        param.eval_metric.reset_local()
                         msg = 'Epoch[%d] Batch [%d-%d]\tSpeed: %.2f samples/sec'
                         msg += '\t%s=%f'*len(name_value)
                         logging.info(msg, param.epoch, count-self.frequent, count, speed, *sum(name_value, ()))
diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index 0d20c76240bd..51deb4fcaa6e 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -232,6 +232,17 @@ def convert_fully_connected(node, **kwargs):
 
     fcnode = []
 
+    op_name = "flatten_" + str(kwargs["idx"])
+    flatten_node = onnx.helper.make_node(
+        'Flatten',
+        inputs=[input_nodes[0]],
+        outputs=[op_name],
+        name=op_name
+    )
+
+    input_nodes[0] = op_name
+    fcnode.append(flatten_node)
+
     if no_bias:
         data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('int64')]
         bias_name = "bias" + str(kwargs["idx"])
@@ -575,6 +586,7 @@ def convert_pooling(node, **kwargs):
     pool_type = attrs["pool_type"]
     stride = eval(attrs["stride"]) if attrs.get("stride") else None
     global_pool = get_boolean_attribute_value(attrs, "global_pool")
+    p_value = attrs.get('p_value', 'None')
 
     pooling_convention = attrs.get('pooling_convention', 'valid')
 
@@ -587,26 +599,51 @@ def convert_pooling(node, **kwargs):
 
     pad_dims = list(parse_helper(attrs, "pad", [0, 0]))
     pad_dims = pad_dims + pad_dims
-    pool_types = {"max": "MaxPool", "avg": "AveragePool"}
-    global_pool_types = {"max": "GlobalMaxPool", "avg": "GlobalAveragePool"}
+    pool_types = {"max": "MaxPool", "avg": "AveragePool", "lp": "LpPool"}
+    global_pool_types = {"max": "GlobalMaxPool", "avg": "GlobalAveragePool",
+                         "lp": "GlobalLpPool"}
+
+    if pool_type == 'lp' and p_value == 'None':
+        raise AttributeError('ONNX requires a p value for LpPool and GlobalLpPool')
 
     if global_pool:
-        node = onnx.helper.make_node(
-            global_pool_types[pool_type],
-            input_nodes,  # input
-            [name],
-            name=name
-        )
+        if pool_type == 'lp':
+            node = onnx.helper.make_node(
+                global_pool_types[pool_type],
+                input_nodes,  # input
+                [name],
+                p=int(p_value),
+                name=name
+            )
+        else:
+            node = onnx.helper.make_node(
+                global_pool_types[pool_type],
+                input_nodes,  # input
+                [name],
+                name=name
+            )
     else:
-        node = onnx.helper.make_node(
-            pool_types[pool_type],
-            input_nodes,  # input
-            [name],
-            kernel_shape=kernel,
-            pads=pad_dims,
-            strides=stride,
-            name=name
-        )
+        if pool_type == 'lp':
+            node = onnx.helper.make_node(
+                pool_types[pool_type],
+                input_nodes,  # input
+                [name],
+                p=int(p_value),
+                kernel_shape=kernel,
+                pads=pad_dims,
+                strides=stride,
+                name=name
+            )
+        else:
+            node = onnx.helper.make_node(
+                pool_types[pool_type],
+                input_nodes,  # input
+                [name],
+                kernel_shape=kernel,
+                pads=pad_dims,
+                strides=stride,
+                name=name
+            )
 
     return [node]
 
@@ -619,12 +656,19 @@ def convert_exp(node, **kwargs):
     return create_basic_op_node('Exp', node, kwargs)
 
 @mx_op.register("_copy")
-def convert_identity(node, **kwargs):
+def convert_copy(node, **kwargs):
     """Map MXNet's _copy operator attributes to onnx's Identity operator
     and return the created node.
     """
     return create_basic_op_node('Identity', node, kwargs)
 
+@mx_op.register("identity")
+def convert_identity(node, **kwargs):
+    """Map MXNet's identity operator attributes to onnx's Identity operator
+    and return the created node.
+    """
+    return create_basic_op_node('Identity', node, kwargs)
+
 @mx_op.register("InstanceNorm")
 def convert_instancenorm(node, **kwargs):
     """Map MXNet's InstanceNorm operator attributes to onnx's InstanceNormalization operator
@@ -715,6 +759,31 @@ def convert_softmax_output(node, **kwargs):
 
     return [softmax_node]
 
+@mx_op.register("LogisticRegressionOutput")
+def convert_logistic_regression_output(node, **kwargs):
+    """Map MXNet's LogisticRegressionOutput operator attributes to onnx's Sigmoid operator
+    and return the created node.
+    """
+    name = node["name"]
+    input1_idx = kwargs["index_lookup"][node["inputs"][0][0]]
+    input1 = kwargs["proc_nodes"][input1_idx]
+    sigmoid_node = onnx.helper.make_node(
+        "Sigmoid",
+        [input1.name],
+        [name],
+        name=name
+    )
+    return [sigmoid_node]
+
+@mx_op.register("BlockGrad")
+def convert_blockgrad(node, **kwargs):
+    """BlockGrad is a forward pass-through, so export it as onnx's Identity operator."""
+    return create_basic_op_node('Identity', node, kwargs)
+
+@mx_op.register("MakeLoss")
+def convert_makeloss(node, **kwargs):
+    """Skip MakeLoss at export: pass its input through as onnx's Identity operator."""
+    return create_basic_op_node('Identity', node, kwargs)
 
 @mx_op.register("Concat")
 def convert_concat(node, **kwargs):
@@ -799,7 +868,7 @@ def convert_l2normalization(node, **kwargs):
     mode = attrs.get("mode", "instance")
 
     if mode != "channel":
-        raise AttributeError("ONNX currently supports channel mode only")
+        raise AttributeError("L2Normalization: ONNX currently supports channel mode only")
 
     l2norm_node = onnx.helper.make_node(
         "LpNormalization",
@@ -861,7 +930,7 @@ def convert_clip(node, **kwargs):
 def scalar_op_helper(node, op_name, **kwargs):
     """Helper function for scalar arithmetic operations"""
     name, input_nodes, attrs = get_inputs(node, kwargs)
-
+    from onnx import numpy_helper
     input_type = kwargs["in_type"]
     scalar_value = np.array([attrs.get("scalar", 1)],
                             dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[input_type])
@@ -873,13 +942,21 @@ def scalar_op_helper(node, op_name, **kwargs):
     for i in initializer:
         if i.name == input_nodes[0]:
             if op_name == 'Mul':
-                new_initializer = onnx.numpy_helper.to_array(i) * scalar_value[0]
+                new_initializer = numpy_helper.to_array(i) * scalar_value[0]
             elif op_name == 'Sub':
-                new_initializer = onnx.numpy_helper.to_array(i) - scalar_value[0]
+                if name.startswith("_rminusscalar"):
+                    new_initializer = scalar_value[0] - numpy_helper.to_array(i)
+                else:
+                    new_initializer = numpy_helper.to_array(i) - scalar_value[0]
             elif op_name == 'Add':
-                new_initializer = onnx.numpy_helper.to_array(i) + scalar_value[0]
+                new_initializer = numpy_helper.to_array(i) + scalar_value[0]
             elif op_name == 'Div':
-                new_initializer = onnx.numpy_helper.to_array(i) / scalar_value[0]
+                if name.startswith("_rdivscalar"):
+                    new_initializer = scalar_value[0] / numpy_helper.to_array(i)
+                else:
+                    new_initializer = numpy_helper.to_array(i) / scalar_value[0]
+            elif op_name == 'Pow':
+                new_initializer = numpy_helper.to_array(i) ** scalar_value[0]
             flag = False
             break
 
@@ -945,6 +1022,13 @@ def convert_minus_scalar(node, **kwargs):
     """
     return scalar_op_helper(node, 'Sub', **kwargs)
 
+@mx_op.register("_rminus_scalar")
+def convert_rminus_scalar(node, **kwargs):
+    """Map MXNet's _rminus_scalar operator attributes to onnx's Sub operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Sub', **kwargs)
 
 # Convert scalar value into node and pass it as input to mul_node
 @mx_op.register("_plus_scalar")
@@ -964,6 +1048,21 @@ def convert_div_scalar(node, **kwargs):
     """
     return scalar_op_helper(node, 'Div', **kwargs)
 
+@mx_op.register("_rdiv_scalar")
+def convert_rdiv_scalar(node, **kwargs):
+    """Map MXNet's _rdiv_scalar operator attributes to onnx's Div operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Div', **kwargs)
+
+@mx_op.register("_power_scalar")
+def convert_pow_scalar(node, **kwargs):
+    """Map MXNet's _power_scalar operator attributes to onnx's Pow operator.
+    Creates a new node for the input scalar value, adds it to the initializer
+    and return multiple created nodes.
+    """
+    return scalar_op_helper(node, 'Pow', **kwargs)
 
 # Sorting and Searching
 @mx_op.register("argmax")
@@ -1291,7 +1390,7 @@ def convert_reshape(node, **kwargs):
 
     for val in output_shape_list:
         if val in not_supported_shape:
-            raise AttributeError("Shape value not supported in ONNX", val)
+            raise AttributeError("Reshape: Shape value not supported in ONNX", val)
 
     reshape_node = onnx.helper.make_node(
         "Reshape",
@@ -1417,7 +1516,7 @@ def convert_squeeze(node, **kwargs):
 
     axis = attrs.get("axis", None)
     if not axis:
-        raise AttributeError("Missing axis attribute: ONNX currently requires axis to "
+        raise AttributeError("Squeeze: Missing axis attribute: ONNX currently requires axis to "
                              "be specified for squeeze operator")
     axis = convert_string_to_list(axis)
 
@@ -1655,3 +1754,156 @@ def convert_size(node, **kwargs):
     and return the created node.
     """
     return create_basic_op_node('Size', node, kwargs)
+
+
+@mx_op.register("log_softmax")
+def convert_logsoftmax(node, **kwargs):
+    """Export MXNet's log_softmax symbol as a single ONNX LogSoftmax node.
+
+    The ``axis`` attribute (default -1) is forwarded as an integer.  ONNX has no
+    counterpart for ``temperature``, so any value other than the string 'None'
+    raises AttributeError.  Returns a one-element list holding the new node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    softmax_axis = int(attrs.get("axis", -1))
+    temperature = attrs.get("temperature", 'None')
+    if temperature != 'None':
+        raise AttributeError("LogSoftMax: ONNX supports only temperature=None")
+
+    # Output tensor and node share the MXNet symbol name.
+    logsoftmax_node = onnx.helper.make_node(
+        'LogSoftmax', input_nodes, [name],
+        axis=softmax_axis, name=name
+    )
+    return [logsoftmax_node]
+
+@mx_op.register("norm")
+def convert_norm(node, **kwargs):
+    """Map MXNet's norm operator attributes to onnx's ReduceL1 and ReduceL2 operators
+    and return the created node.
+
+    ``ord`` (default 2) selects the ONNX op: 1 -> ReduceL1, 2 -> ReduceL2; any other
+    order raises AttributeError instead of being silently exported as ReduceL2.
+    ``axis`` is forwarded as ``axes`` only when set; when it is absent the ``axes``
+    attribute is left off the node.  ``keepdims`` is always forwarded.
+
+    Returns a one-element list containing the reduce node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Named `order` rather than `ord` so the Python builtin is not shadowed.
+    order = int(attrs.get("ord", 2))
+    reduce_ops = {1: "ReduceL1", 2: "ReduceL2"}
+    if order not in reduce_ops:
+        raise AttributeError("norm: ONNX supports only ord=1 and ord=2", order)
+
+    onnx_attrs = {"keepdims": get_boolean_attribute_value(attrs, "keepdims")}
+    mx_axis = attrs.get("axis", None)
+    axes = convert_string_to_list(str(mx_axis)) if mx_axis else None
+    if axes:
+        onnx_attrs["axes"] = axes
+
+    reduce_node = onnx.helper.make_node(
+        reduce_ops[order],
+        input_nodes,
+        [name],
+        name=name,
+        **onnx_attrs
+    )
+    return [reduce_node]
+
+@mx_op.register("_sample_multinomial")
+def convert_multinomial(node, **kwargs):
+    """Export MXNet's _sample_multinomial symbol as an ONNX Multinomial node.
+
+    The ``shape`` attribute (default '1') must parse to fewer than two entries; its
+    last entry becomes ``sample_size``.  Longer shapes raise AttributeError.  The
+    ``dtype`` attribute (default 'int32') is mapped to an ONNX tensor type.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+    np_dtype = np.dtype(attrs.get("dtype", 'int32'))
+    onnx_dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np_dtype]
+    shape_entries = convert_string_to_list(attrs.get("shape", '1'))
+    if len(shape_entries) >= 2:
+        raise AttributeError("ONNX currently supports integer sample_size only")
+    multinomial_node = onnx.helper.make_node(
+        "Multinomial", input_nodes, [name],
+        dtype=onnx_dtype,
+        sample_size=shape_entries[-1],
+        name=name,
+    )
+    return [multinomial_node]
+
+
+@mx_op.register("_random_uniform")
+def convert_random_uniform(node, **kwargs):
+    """Export MXNet's _random_uniform symbol as an ONNX RandomUniform node.
+
+    Attribute defaults applied when missing: low=0, high=1.0, shape='[]',
+    dtype='float32'.  Returns a one-element list holding the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Bounds are coerced to Python floats.
+    lower = float(attrs.get("low", 0))
+    upper = float(attrs.get("high", 1.0))
+    out_shape = convert_string_to_list(attrs.get('shape', '[]'))
+    np_dtype = np.dtype(attrs.get('dtype', 'float32'))
+    onnx_dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np_dtype]
+
+    uniform_node = onnx.helper.make_node(
+        'RandomUniform', input_nodes, [name],
+        low=lower, high=upper,
+        dtype=onnx_dtype,
+        shape=out_shape,
+        name=name
+    )
+    return [uniform_node]
+
+
+@mx_op.register("_random_normal")
+def convert_random_normal(node, **kwargs):
+    """Export MXNet's _random_normal symbol as an ONNX RandomNormal node.
+
+    MXNet's ``loc`` is passed as ONNX ``mean``.  Defaults applied when missing:
+    loc=0, scale=1.0, shape='[]', dtype='float32'.  Returns a one-element list.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    # Distribution parameters are coerced to Python floats.
+    loc = float(attrs.get("loc", 0))
+    std = float(attrs.get("scale", 1.0))
+    out_shape = convert_string_to_list(attrs.get('shape', '[]'))
+    np_dtype = np.dtype(attrs.get('dtype', 'float32'))
+    onnx_dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[np_dtype]
+
+    normal_node = onnx.helper.make_node(
+        'RandomNormal', input_nodes, [name],
+        mean=loc, scale=std,
+        dtype=onnx_dtype,
+        shape=out_shape,
+        name=name
+    )
+    return [normal_node]
+
+
+@mx_op.register("ROIPooling")
+def convert_roipooling(node, **kwargs):
+    """Export MXNet's ROIPooling symbol as an ONNX MaxRoiPool node.
+
+    ``pooled_size`` becomes ``pooled_shape`` and ``spatial_scale`` is coerced to a
+    float.  Neither attribute has a fallback here, so both are expected to be set.
+    Returns a one-element list holding the created node.
+    """
+    name, input_nodes, attrs = get_inputs(node, kwargs)
+
+    roi_attrs = {
+        'pooled_shape': convert_string_to_list(attrs.get('pooled_size')),
+        'spatial_scale': float(attrs.get("spatial_scale")),
+    }
+
+    roi_node = onnx.helper.make_node(
+        'MaxRoiPool', input_nodes, [name], name=name, **roi_attrs
+    )
+    return [roi_node]
diff --git a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
index 84db5decd503..d0d4501d89f4 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/export_onnx.py
@@ -146,7 +146,7 @@ def get_outputs(sym, params, in_shape, in_label):
             if name.endswith('_output'):
                 out_names.append(name[:-len('_output')])
             else:
-                logging.warning("output '%s' does not end with '_output'", name)
+                logging.info("output '%s' does not end with '_output'", name)
                 out_names.append(name)
 
         assert len(out_shapes) == len(out_names)
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
index 2ceabaec1dcd..cf95bfef09a3 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py
@@ -18,12 +18,12 @@
 # coding: utf-8_
 # pylint: disable=invalid-name
 """Operator attributes conversion"""
-from ._op_translations import identity, random_uniform, random_normal
+from ._op_translations import identity, random_uniform, random_normal, sample_multinomial
 from ._op_translations import add, subtract, multiply, divide, absolute, negative, add_n
 from ._op_translations import tanh, arccos, arcsin, arctan, _cos, _sin, _tan
 from ._op_translations import softplus, shape, gather, lp_pooling, size
 from ._op_translations import ceil, floor, hardsigmoid, global_lppooling
-from ._op_translations import concat
+from ._op_translations import concat, hardmax
 from ._op_translations import leaky_relu, _elu, _prelu, _selu, softmax, fully_connected
 from ._op_translations import global_avgpooling, global_maxpooling, linalg_gemm
 from ._op_translations import sigmoid, pad, relu, matrix_multiplication, batch_norm
@@ -37,7 +37,7 @@
 from ._op_translations import reduce_sum_square, reduce_l1, reduce_l2, max_roi_pooling
 from ._op_translations import log_softmax, softsign, lesser, greater, equal
 from ._op_translations import logical_and, logical_or, logical_xor, logical_not
-from ._op_translations import mean, depthtospace, spacetodepth
+from ._op_translations import mean, depthtospace, spacetodepth, lpnormalization
 
 # convert_map defines maps of ONNX operator names to converter functor(callable)
 # defined in the op_translations module.
@@ -48,6 +48,7 @@
     'RandomNormal'      : random_normal,
     'RandomUniformLike' : random_uniform,
     'RandomNormalLike'  : random_normal,
+    'Multinomial'       : sample_multinomial,
     # Arithmetic Operators
     'Add'               : add,
     'Sub'               : subtract,
@@ -144,5 +145,7 @@
     'HardSigmoid'       : hardsigmoid,
     'LpPool'            : lp_pooling,
     'DepthToSpace'      : depthtospace,
-    'SpaceToDepth'      : spacetodepth
+    'SpaceToDepth'      : spacetodepth,
+    'Hardmax'           : hardmax,
+    'LpNormalization'   : lpnormalization
 }
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
index 702832529314..dc00feee815b 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
@@ -29,14 +29,39 @@ def identity(attrs, inputs, proto_obj):
 
 def random_uniform(attrs, inputs, proto_obj):
     """Draw random samples from a uniform distribtuion."""
-    new_attr = translation_utils._remove_attributes(attrs, ['seed'])
-    return 'random_uniform', new_attr, inputs
+    try:
+        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          "Instructions to install - /~https://github.com/onnx/onnx")
+    new_attrs = translation_utils._remove_attributes(attrs, ['seed'])
+    new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(new_attrs.get('dtype', 1))]
+    return 'random_uniform', new_attrs, inputs
 
 def random_normal(attrs, inputs, proto_obj):
     """Draw random samples from a Gaussian distribution."""
+    try:
+        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          "Instructions to install - /~https://github.com/onnx/onnx")
     new_attr = translation_utils._remove_attributes(attrs, ['seed'])
-    new_attr = translation_utils._fix_attribute_names(new_attr, {'mean' : 'loc'})
-    return 'random_uniform', new_attr, inputs
+    new_attr = translation_utils._fix_attribute_names(new_attr, {'mean': 'loc'})
+    new_attr['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(new_attr.get('dtype', 1))]
+    return 'random_normal', new_attr, inputs
+
+def sample_multinomial(attrs, inputs, proto_obj):
+    """Map ONNX Multinomial to sample_multinomial: drop seed, sample_size -> shape."""
+    try:
+        from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE
+    except ImportError:
+        raise ImportError("Onnx and protobuf need to be installed. "
+                          + "Instructions to install - /~https://github.com/onnx/onnx")
+    mx_attrs = translation_utils._fix_attribute_names(
+        translation_utils._remove_attributes(attrs, ['seed']), {'sample_size': 'shape'})
+    mx_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(attrs.get('dtype', 6))]
+    return 'sample_multinomial', mx_attrs, inputs
+
 
 # Arithmetic Operations
 def add(attrs, inputs, proto_obj):
@@ -382,6 +407,7 @@ def global_lppooling(attrs, inputs, proto_obj):
                                                                 'kernel': (1, 1),
                                                                 'pool_type': 'lp',
                                                                 'p_value': p_value})
+    new_attrs = translation_utils._remove_attributes(new_attrs, ['p'])
     return 'Pooling', new_attrs, inputs
 
 def linalg_gemm(attrs, inputs, proto_obj):
@@ -671,11 +697,12 @@ def lp_pooling(attrs, inputs, proto_obj):
     new_attrs = translation_utils._fix_attribute_names(attrs,
                                                        {'kernel_shape': 'kernel',
                                                         'strides': 'stride',
-                                                        'pads': 'pad',
-                                                        'p_value': p_value
+                                                        'pads': 'pad'
                                                        })
+    new_attrs = translation_utils._remove_attributes(new_attrs, ['p'])
     new_attrs = translation_utils._add_extra_attributes(new_attrs,
-                                                        {'pooling_convention': 'valid'
+                                                        {'pooling_convention': 'valid',
+                                                         'p_value': p_value
                                                         })
     new_op = translation_utils._fix_pooling('lp', inputs, new_attrs)
     return new_op, new_attrs, inputs
@@ -714,3 +741,37 @@ def spacetodepth(attrs, inputs, proto_obj):
     new_attrs = translation_utils._fix_attribute_names(attrs, {'blocksize':'block_size'})
 
     return "space_to_depth", new_attrs, inputs
+
+def hardmax(attrs, inputs, proto_obj):
+    """Return one-hot vectors marking the argmax of the input from ``axis`` onward.
+
+    NOTE(review): the shape is read from the first 'input_tensor_data' metadata
+    entry, which presumably assumes Hardmax consumes that graph input -- confirm.
+    """
+    input_shape = proto_obj.model_metadata.get('input_tensor_data')[0][1]
+    rank = len(input_shape)
+
+    axis = int(attrs.get('axis', 1))
+    if axis < 0:
+        axis += rank
+
+    if axis == rank - 1:
+        last_axis_max = symbol.argmax(inputs[0], axis=-1)
+        return symbol.one_hot(last_axis_max, depth=input_shape[-1]), attrs, inputs
+
+    # symbol.reshape cannot take a tensor as shape, so the 2-D shape is computed
+    # with np.prod; switch to mx.sym.prod() once reshape supports it
+    # (https://github.com/apache/incubator-mxnet/issues/10789).
+    rows, cols = int(np.prod(input_shape[:axis])), int(np.prod(input_shape[axis:]))
+    flat_max = symbol.argmax(symbol.reshape(inputs[0], (rows, cols)), axis=-1)
+    flat_one_hot = symbol.one_hot(flat_max, depth=cols)
+    return symbol.reshape(flat_one_hot, input_shape), attrs, inputs
+
+def lpnormalization(attrs, inputs, proto_obj):
+    """Translate ONNX LpNormalization to MXNet's norm operator (p -> ord, axis -1 default).
+    ONNX has no eps attribute, so it cannot be mapped to MXNet's L2Normalization;
+    see the discussion in https://github.com/onnx/onnx/pull/1330."""
+    norm_attrs = translation_utils._fix_attribute_names(attrs, {'p': 'ord'})
+    # axis is always set explicitly, falling back to the last axis (-1).
+    norm_attrs['axis'] = int(attrs.get("axis", -1))
+    return 'norm', norm_attrs, inputs
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
index f63c1e9e8e62..6fd52665ca31 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
@@ -94,6 +94,7 @@ def _fix_pooling(pool_type, inputs, new_attr):
     stride = new_attr.get('stride')
     kernel = new_attr.get('kernel')
     padding = new_attr.get('pad')
+    p_value = new_attr.get('p_value')
 
     # Adding default stride.
     if stride is None:
@@ -138,7 +139,10 @@ def _fix_pooling(pool_type, inputs, new_attr):
             new_pad_op = symbol.pad(curr_sym, mode='constant', pad_width=pad_width)
 
     # Apply pooling without pads.
-    new_pooling_op = symbol.Pooling(new_pad_op, pool_type=pool_type, stride=stride, kernel=kernel)
+    if pool_type == 'lp':
+        new_pooling_op = symbol.Pooling(new_pad_op, pool_type=pool_type, stride=stride, kernel=kernel, p_value=p_value)
+    else:
+        new_pooling_op = symbol.Pooling(new_pad_op, pool_type=pool_type, stride=stride, kernel=kernel)
     return new_pooling_op
 
 def _fix_bias(op_name, attrs, num_inputs):
diff --git a/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py b/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py
index 3af196f8b091..d5071af0763d 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/import_onnx.py
@@ -19,6 +19,7 @@
 # pylint: disable=invalid-name,too-many-locals,no-self-use
 """ Support import export formats."""
 from __future__ import absolute_import as _abs
+import numpy as np
 from .... import symbol
 from .... import ndarray as nd
 from ....base import string_types
@@ -87,7 +88,7 @@ def from_onnx(self, graph):
         params : dict
             A dict of name: nd.array pairs, used as pretrained weights
         """
-        #get input, output shapes
+        # get input, output shapes
         self.model_metadata = self.get_graph_metadata(graph)
         # parse network inputs, aka parameters
         for init_tensor in graph.initializer:
@@ -196,7 +197,11 @@ def _parse_array(self, tensor_proto):
         except ImportError:
             raise ImportError("Onnx and protobuf need to be installed. "
                               + "Instructions to install - /~https://github.com/onnx/onnx")
-        np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims))
+        if len(tuple(tensor_proto.dims)) > 0:
+            np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims))
+        else:
+            # If onnx's params are scalar values without dims mentioned.
+            np_array = np.array([to_array(tensor_proto)])
         return nd.array(np_array)
 
     def _parse_attr(self, attr_proto):
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index 586e620470d3..9d762745a407 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -26,6 +26,7 @@
 import multiprocessing
 import multiprocessing.queues
 from multiprocessing.reduction import ForkingPickler
+from multiprocessing.pool import ThreadPool
 import threading
 import numpy as np
 
@@ -384,8 +385,9 @@ def _worker_initializer(dataset):
     global _worker_dataset
     _worker_dataset = dataset
 
-def _worker_fn(samples, batchify_fn):
+def _worker_fn(samples, batchify_fn, dataset=None):
     """Function for processing data in worker process."""
+    # pylint: disable=unused-argument
     # it is required that each worker process has to fork a new MXIndexedRecordIO handle
     # preserving dataset as global variable can save tons of overhead and is safe in new process
     global _worker_dataset
@@ -394,10 +396,14 @@ def _worker_fn(samples, batchify_fn):
     ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(batch)
     return buf.getvalue()
 
+def _thread_worker_fn(samples, batchify_fn, dataset):
+    """Fetch dataset[i] for each sample index in-thread and return the batchified list."""
+    return batchify_fn([dataset[sample_idx] for sample_idx in samples])
+
 class _MultiWorkerIter(object):
     """Internal multi-worker iterator for DataLoader."""
     def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
-                 worker_fn=_worker_fn, prefetch=0):
+                 worker_fn=_worker_fn, prefetch=0, dataset=None):
         self._worker_pool = worker_pool
         self._batchify_fn = batchify_fn
         self._batch_sampler = batch_sampler
@@ -407,6 +413,7 @@ def __init__(self, worker_pool, batchify_fn, batch_sampler, pin_memory=False,
         self._iter = iter(self._batch_sampler)
         self._worker_fn = worker_fn
         self._pin_memory = pin_memory
+        self._dataset = dataset
         # pre-fetch
         for _ in range(prefetch):
             self._push_next()
@@ -419,7 +426,8 @@ def _push_next(self):
         r = next(self._iter, None)
         if r is None:
             return
-        async_ret = self._worker_pool.apply_async(self._worker_fn, (r, self._batchify_fn))
+        async_ret = self._worker_pool.apply_async(
+            self._worker_fn, (r, self._batchify_fn, self._dataset))
         self._data_buffer[self._sent_idx] = async_ret
         self._sent_idx += 1
 
@@ -432,7 +440,7 @@ def __next__(self):
         assert self._rcvd_idx < self._sent_idx, "rcvd_idx must be smaller than sent_idx"
         assert self._rcvd_idx in self._data_buffer, "fatal error with _push_next, rcvd_idx missing"
         ret = self._data_buffer.pop(self._rcvd_idx)
-        batch = pickle.loads(ret.get())
+        batch = pickle.loads(ret.get()) if self._dataset is None else ret.get()
         if self._pin_memory:
             batch = _as_in_context(batch, context.cpu_pinned())
         batch = batch[0] if len(batch) == 1 else batch
@@ -498,12 +506,18 @@ def default_batchify_fn(data):
         but will consume more shared_memory. Using smaller number may forfeit the purpose of using
         multiple worker processes, try reduce `num_workers` in this case.
         By default it defaults to `num_workers * 2`.
+    thread_pool : bool, default False
+        If ``True``, use a thread pool instead of a multiprocessing pool. A thread pool
+        can avoid shared memory usage. If the `DataLoader` is mostly IO bound or the GIL is
+        not a bottleneck, the thread pool may achieve better performance than multiprocessing.
+
     """
     def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
                  last_batch=None, batch_sampler=None, batchify_fn=None,
-                 num_workers=0, pin_memory=False, prefetch=None):
+                 num_workers=0, pin_memory=False, prefetch=None, thread_pool=False):
         self._dataset = dataset
         self._pin_memory = pin_memory
+        self._thread_pool = thread_pool
 
         if batch_sampler is None:
             if batch_size is None:
@@ -529,8 +543,11 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
         self._worker_pool = None
         self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers)
         if self._num_workers > 0:
-            self._worker_pool = multiprocessing.Pool(
-                self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
+            if self._thread_pool:
+                self._worker_pool = ThreadPool(self._num_workers)
+            else:
+                self._worker_pool = multiprocessing.Pool(
+                    self._num_workers, initializer=_worker_initializer, initargs=[self._dataset])
         if batchify_fn is None:
             if num_workers > 0:
                 self._batchify_fn = default_mp_batchify_fn
@@ -551,14 +568,17 @@ def same_process_iter():
 
         # multi-worker
         return _MultiWorkerIter(self._worker_pool, self._batchify_fn, self._batch_sampler,
-                                pin_memory=self._pin_memory, worker_fn=_worker_fn,
-                                prefetch=self._prefetch)
+                                pin_memory=self._pin_memory,
+                                worker_fn=_thread_worker_fn if self._thread_pool else _worker_fn,
+                                prefetch=self._prefetch,
+                                dataset=self._dataset if self._thread_pool else None)
 
     def __len__(self):
         return len(self._batch_sampler)
 
     def __del__(self):
         if self._worker_pool:
-            # manually terminate due to a bug that pool is not automatically terminated on linux
+            # manually terminate due to a bug that pool is not automatically terminated
+            # https://bugs.python.org/issue34172
             assert isinstance(self._worker_pool, multiprocessing.pool.Pool)
             self._worker_pool.terminate()
diff --git a/python/mxnet/gluon/data/dataset.py b/python/mxnet/gluon/data/dataset.py
index c93a4b1cd6b9..28d19c9fe37c 100644
--- a/python/mxnet/gluon/data/dataset.py
+++ b/python/mxnet/gluon/data/dataset.py
@@ -88,11 +88,7 @@ def transform_first(self, fn, lazy=True):
         Dataset
             The transformed dataset.
         """
-        def base_fn(x, *args):
-            if args:
-                return (fn(x),) + args
-            return fn(x)
-        return self.transform(base_fn, lazy)
+        return self.transform(_TransformFirstClosure(fn), lazy)
 
 
 class SimpleDataset(Dataset):
@@ -129,6 +125,16 @@ def __getitem__(self, idx):
         return self._fn(item)
 
 
+class _TransformFirstClosure(object):
+    """Picklable callable (unlike a nested function) applying fn to the first element."""
+    def __init__(self, fn):
+        self._fn = fn
+
+    def __call__(self, x, *args):
+        # A lone item is returned bare; extra items pass through after fn(x).
+        head = self._fn(x)
+        return (head,) + args if args else head
+
 class ArrayDataset(Dataset):
     """A dataset that combines multiple dataset-like objects, e.g.
     Datasets, lists, arrays, etc.
diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py
index 3523be4d054a..175076925332 100644
--- a/python/mxnet/gluon/data/vision/transforms.py
+++ b/python/mxnet/gluon/data/vision/transforms.py
@@ -82,10 +82,10 @@ class Cast(HybridBlock):
 
 
     Inputs:
-        - **data**: input tensor with arbitrary shape.
+        - **data**: input tensor with arbitrary shape and dtype.
 
     Outputs:
-        - **out**: output tensor with the same shape as `data`.
+        - **out**: output tensor with the same shape as `data` and data type as dtype.
     """
     def __init__(self, dtype='float32'):
         super(Cast, self).__init__()
diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py
index c69b980935fc..4d514c28317a 100644
--- a/python/mxnet/gluon/nn/basic_layers.py
+++ b/python/mxnet/gluon/nn/basic_layers.py
@@ -537,7 +537,7 @@ class LayerNorm(HybridBlock):
 
     .. math::
 
-      out = \frac{x - mean[data, axis]}{ \sqrt{Var[data, axis]} + \epsilon} * gamma + beta
+      out = \frac{x - mean[data, axis]}{ \sqrt{Var[data, axis] + \epsilon}} * gamma + beta
 
     Parameters
     ----------
diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py
index c4d49e82c908..f6c0a31b52e2 100644
--- a/python/mxnet/gluon/trainer.py
+++ b/python/mxnet/gluon/trainer.py
@@ -28,6 +28,15 @@ class Trainer(object):
     """Applies an `Optimizer` on a set of Parameters. Trainer should
     be used together with `autograd`.
 
+    .. note::
+
+        For the following cases, updates will always happen on kvstore,
+        i.e., you cannot set update_on_kvstore=False.
+
+        - dist kvstore with sparse weights or sparse gradients
+        - dist async kvstore
+        - `optimizer.lr_scheduler` is not None
+
     Parameters
     ----------
     params : ParameterDict
@@ -115,11 +124,12 @@ def _init_optimizer(self, optimizer, optimizer_params):
                 "optimizer_params must be None if optimizer is an instance of " \
                 "Optimizer instead of str"
             self._optimizer = optimizer
+            # param_dict must not be deep copied, so that if the user mutates the lr_mult
+            # or wd_mult of some parameters, the change takes effect.
             self._optimizer.param_dict = param_dict
         else:
             self._optimizer = opt.create(optimizer, param_dict=param_dict,
                                          **optimizer_params)
-
         self._updaters = [opt.get_updater(self._optimizer) \
                             for _ in self._contexts]
 
@@ -158,59 +168,82 @@ def _reset_kvstore(self):
     def _init_kvstore(self):
         """Create kvstore."""
         config = self._kvstore_params
-        # if weight is sparse, the weight must be updated on KVStore.
-        # training loop contains:
-        #    - row_sparse_pull(sparse_weight)
-        #    - forward()
-        #    - backward()
-        #    - push(sparse_grad), push(dense_grad)
-        #    - pull(dense_weight)
+        # configure kvstore, update_on_kvstore and self._distributed on three cases:
         if self._contains_sparse_weight:
+            # If weight is sparse, kvstore must be present and the weight must be updated on kvstore.
+            # The training loop is the following:
+            #    - row_sparse_pull(sparse_weight)
+            #    - forward()
+            #    - backward()
+            #    - push_and_update(grad)
+            #    - pull(weight)
             kvstore, update_on_kvstore = _create_sparse_kvstore(config['kvstore'])
-            # raise Error if update_on_kvstore is set to False by the user
+            self._distributed = 'dist' in kvstore.type
+            # raise err if user provides unsupported configs
             if config['update_on_kvstore'] is False:
-                raise RuntimeError("Cannot set update_on_kvstore to False when sparse weights "
-                                   "are present.")
-        # if weight is dense and grad is sparse, the weight better not be updated on KVStore.
-        # training loop contains:
-        #    - forward()
-        #    - backward()
-        #    - push(grad)
-        #    - pull(grad)
-        #    - update(grad, weight)
+                raise ValueError("Cannot set update_on_kvstore=False when sparse weights "
+                                 "are present.")
+
         elif self._contains_sparse_grad:
+            # For single node training with dense weight and sparse grad,
+            # we prefer update_on_kvstore=False because this is usually faster.
+            # This means we push and pull sparse gradients, and we do not store weight in kvstore.
+            # The training loop is the following:
+            #    - forward()
+            #    - backward()
+            #    - push(grad)
+            #    - pull(grad)
+            #    - update(grad, weight)
+            #
+            # For multi-node training with dense weight and sparse grad,
+            # only update_on_kvstore=True is supported, due to the fact that
+            # kv.row_sparse_pull(grad) is not implemented.
+            # Therefore, we push sparse gradients and pull dense weights.
+            # The training loop contains:
+            #    - forward()
+            #    - backward()
+            #    - push_and_update(grad)
+            #    - pull(weight)
             arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params}
             kvstore, _ = _create_kvstore(config['kvstore'], len(self._contexts), arg_arrays)
-            update_on_kvstore = False
-        # normal case
+            self._distributed = 'dist' in kvstore.type if kvstore else False
+            update_on_kvstore = self._distributed
+            # raise err if user provides unsupported configs
+            if config['update_on_kvstore'] is not None:
+                if config['update_on_kvstore'] is False and self._distributed:
+                    raise ValueError("Cannot set update_on_kvstore=False on dist kvstore "
+                                     "when sparse gradients are present.")
+                update_on_kvstore = config['update_on_kvstore']
+
         else:
+            # Training with dense weight and dense gradients.
+            # The only unsupported mode is async with update_on_kvstore=False
             arg_arrays = {param.name: param.data(self._contexts[0]) for param in self._params}
             kvstore, update_on_kvstore = _create_kvstore(config['kvstore'], len(self._contexts),
                                                          arg_arrays)
-            if kvstore and 'async' in kvstore.type and config['update_on_kvstore'] is not None\
-                    and not config['update_on_kvstore']:
-                raise ValueError("Please set update_on_kvstore to true "
-                                 "when training in async mode.")
-
+            self._distributed = 'dist' in kvstore.type if kvstore else False
+            if self._distributed and 'async' in kvstore.type:
+                update_on_kvstore = True
+                # raise err if user provides unsupported configs
+                if config['update_on_kvstore'] is False:
+                    raise ValueError("Please set update_on_kvstore=True "
+                                     "when training in async mode.")
             if config['update_on_kvstore'] is not None:
                 update_on_kvstore = config['update_on_kvstore']
 
+        # set grad compression and optimizers
         if kvstore:
             if self._compression_params:
                 kvstore.set_gradient_compression(self._compression_params)
-            self._distributed = 'dist' in kvstore.type
-            if self._distributed:
-                # kv.pull(row_sparse_grad) is not supported for dist kvstore
-                # Captures condition for dist_async, dist_device_sync or based on config for
-                # update_on_kvstore
-                update_on_kvstore = self._contains_sparse_weight or self._contains_sparse_grad \
-                                    or 'device' in kvstore.type or 'async' in kvstore.type \
-                                    or config['update_on_kvstore']
             if update_on_kvstore:
                 # optimizer preferably needs to be set before init for multiprecision
                 kvstore.set_optimizer(self._optimizer)
             self._kvstore = kvstore
             self._update_on_kvstore = update_on_kvstore
+            if self._optimizer.lr_scheduler and not self._update_on_kvstore:
+                raise ValueError("update_on_kvstore=False does not support " \
+                                 "optimizer with LRScheduler. Please " \
+                                 "consider setting learning rate manually.")
         else:
             self._kvstore = None
             self._update_on_kvstore = None
@@ -255,6 +288,16 @@ def _row_sparse_pull(self, parameter, out, row_id, full_idx=False):
         else:
             self._kvstore.row_sparse_pull(idx, out=out, row_ids=row_id, priority=-idx)
 
+    def _check_and_rescale_grad(self, scale):
+        if self._update_on_kvstore and self._distributed and self._kv_initialized:
+            if self._optimizer.rescale_grad != scale:
+                raise UserWarning('Possible change in the `batch_size` from previous '
+                                  '`step` detected. Optimizer gradient normalizing '
+                                  'factor will not change w.r.t new batch_size when '
+                                  'update_on_kvstore=True and when distributed kvstore '
+                                  'is used.')
+        self._optimizer.rescale_grad = scale
+
     def step(self, batch_size, ignore_stale_grad=False):
         """Makes one step of parameter update. Should be called after
         `autograd.backward()` and outside of `record()` scope.
@@ -274,13 +317,7 @@ def step(self, batch_size, ignore_stale_grad=False):
             been updated by `backward` after last step) and skip update.
         """
         rescale_grad = self._scale / batch_size
-        if self._update_on_kvstore and self._distributed and \
-           self._optimizer.rescale_grad != rescale_grad:
-            raise UserWarning('Possible change in the `batch_size` from previous `step` detected.' \
-                            'Optimizer gradient normalizing factor will not change w.r.t new batch_size when ' \
-                            'update_on_kvstore=True and when distributed `kvstore` is used.')
-
-        self._optimizer.rescale_grad = rescale_grad
+        self._check_and_rescale_grad(rescale_grad)
 
         if not self._kv_initialized:
             self._init_kvstore()
@@ -352,7 +389,7 @@ def update(self, batch_size, ignore_stale_grad=False):
                 'is not supported. Try setting `update_on_kvstore` ' \
                 'to False when creating trainer.'
 
-        self._optimizer.rescale_grad = self._scale / batch_size
+        self._check_and_rescale_grad(self._scale / batch_size)
         self._update(ignore_stale_grad)
 
     def _update(self, ignore_stale_grad=False):
@@ -387,10 +424,16 @@ def _update(self, ignore_stale_grad=False):
     def save_states(self, fname):
         """Saves trainer states (e.g. optimizer, momentum) to a file.
 
+
         Parameters
         ----------
         fname : str
             Path to output states file.
+
+        Note
+        ----
+        `optimizer.param_dict`, which contains Parameter information (such as
+        `lr_mult` and `wd_mult`) will not be saved.
         """
         assert self._optimizer is not None
 
@@ -414,6 +457,12 @@ def load_states(self, fname):
         ----------
         fname : str
             Path to input states file.
+
+        Note
+        ----
+        `optimizer.param_dict`, which contains Parameter information (such as
+        `lr_mult` and `wd_mult`) will not be loaded from the file, but rather set
+        based on current Trainer's parameters.
         """
         if not self._kv_initialized:
             self._init_kvstore()
@@ -423,8 +472,6 @@ def load_states(self, fname):
         if self._update_on_kvstore:
             self._kvstore.load_optimizer_states(fname)
             self._optimizer = self._kvstore._updater.optimizer
-            param_dict = {i: param for i, param in enumerate(self._params)}
-            self._optimizer.param_dict = param_dict
         else:
             with open(fname, 'rb') as f:
                 states = f.read()
@@ -432,3 +479,5 @@ def load_states(self, fname):
                 updater.set_states(states)
                 updater.optimizer = self._updaters[0].optimizer
             self._optimizer = self._updaters[0].optimizer
+        param_dict = {i: param for i, param in enumerate(self._params)}
+        self._optimizer.param_dict = param_dict
diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py
index b452aecdb04b..1dd665607597 100644
--- a/python/mxnet/image/image.py
+++ b/python/mxnet/image/image.py
@@ -46,7 +46,7 @@ def imread(filename, *args, **kwargs):
     """Read and decode an image to an NDArray.
 
     .. note:: `imread` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
+       MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
 
     Parameters
     ----------
@@ -87,7 +87,7 @@ def imresize(src, w, h, *args, **kwargs):
     r"""Resize image with OpenCV.
 
     .. note:: `imresize` uses OpenCV (not the CV2 Python library). MXNet must have been built
-    with USE_OPENCV=1 for `imresize` to work.
+       with USE_OPENCV=1 for `imresize` to work.
 
     Parameters
     ----------
@@ -144,7 +144,7 @@ def imdecode(buf, *args, **kwargs):
     """Decode an image to an NDArray.
 
     .. note:: `imdecode` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
+       MXNet must have been built with USE_OPENCV=1 for `imdecode` to work.
 
     Parameters
     ----------
@@ -345,7 +345,7 @@ def resize_short(src, size, interp=2):
     """Resizes shorter edge to size.
 
     .. note:: `resize_short` uses OpenCV (not the CV2 Python library).
-    MXNet must have been built with OpenCV for `resize_short` to work.
+       MXNet must have been built with OpenCV for `resize_short` to work.
 
     Resizes the original image by setting the shorter edge to size
     and setting the longer edge accordingly.
diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py
index 6d9972074b67..ecb8e1c3bc22 100644
--- a/python/mxnet/metric.py
+++ b/python/mxnet/metric.py
@@ -90,6 +90,7 @@ def __init__(self, name, output_names=None,
         self.name = str(name)
         self.output_names = output_names
         self.label_names = label_names
+        self._has_global_stats = kwargs.pop("has_global_stats", False)
         self._kwargs = kwargs
         self.reset()
 
@@ -148,6 +149,14 @@ def reset(self):
         """Resets the internal evaluation result to initial state."""
         self.num_inst = 0
         self.sum_metric = 0.0
+        self.global_num_inst = 0
+        self.global_sum_metric = 0.0
+
+    def reset_local(self):
+        """Resets the local portion of the internal evaluation results
+        to initial state."""
+        self.num_inst = 0
+        self.sum_metric = 0.0
 
     def get(self):
         """Gets the current evaluation result.
@@ -164,6 +173,24 @@ def get(self):
         else:
             return (self.name, self.sum_metric / self.num_inst)
 
+    def get_global(self):
+        """Gets the current global evaluation result.
+
+        Returns
+        -------
+        names : list of str
+           Name of the metrics.
+        values : list of float
+           Value of the evaluations.
+        """
+        if self._has_global_stats:
+            if self.global_num_inst == 0:
+                return (self.name, float('nan'))
+            else:
+                return (self.name, self.global_sum_metric / self.global_num_inst)
+        else:
+            return self.get()
+
     def get_name_value(self):
         """Returns zipped name and value pairs.
 
@@ -179,6 +206,24 @@ def get_name_value(self):
             value = [value]
         return list(zip(name, value))
 
+    def get_global_name_value(self):
+        """Returns zipped name and value pairs for global results.
+
+        Returns
+        -------
+        list of tuples
+            A (name, value) tuple list.
+        """
+        if self._has_global_stats:
+            name, value = self.get_global()
+            if not isinstance(name, list):
+                name = [name]
+            if not isinstance(value, list):
+                value = [value]
+            return list(zip(name, value))
+        else:
+            return self.get_name_value()
+
 # pylint: disable=invalid-name
 register = registry.get_register_func(EvalMetric, 'metric')
 alias = registry.get_alias_func(EvalMetric, 'metric')
@@ -263,7 +308,8 @@ class CompositeEvalMetric(EvalMetric):
     def __init__(self, metrics=None, name='composite',
                  output_names=None, label_names=None):
         super(CompositeEvalMetric, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         if metrics is None:
             metrics = []
         self.metrics = [create(i) for i in metrics]
@@ -325,6 +371,15 @@ def reset(self):
         except AttributeError:
             pass
 
+    def reset_local(self):
+        """Resets the local portion of the internal evaluation results
+        to initial state."""
+        try:
+            for metric in self.metrics:
+                metric.reset_local()
+        except AttributeError:
+            pass
+
     def get(self):
         """Returns the current evaluation result.
 
@@ -347,6 +402,28 @@ def get(self):
             values.extend(value)
         return (names, values)
 
+    def get_global(self):
+        """Returns the current global evaluation result.
+
+        Returns
+        -------
+        names : list of str
+           Name of the metrics.
+        values : list of float
+           Value of the evaluations.
+        """
+        names = []
+        values = []
+        for metric in self.metrics:
+            name, value = metric.get_global()
+            if isinstance(name, string_types):
+                name = [name]
+            if isinstance(value, numeric_types):
+                value = [value]
+            names.extend(name)
+            values.extend(value)
+        return (names, values)
+
     def get_config(self):
         config = super(CompositeEvalMetric, self).get_config()
         config.update({'metrics': [i.get_config() for i in self.metrics]})
@@ -395,7 +472,8 @@ def __init__(self, axis=1, name='accuracy',
                  output_names=None, label_names=None):
         super(Accuracy, self).__init__(
             name, axis=axis,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self.axis = axis
 
     def update(self, labels, preds):
@@ -423,8 +501,11 @@ def update(self, labels, preds):
 
             check_label_shapes(label, pred_label)
 
-            self.sum_metric += (pred_label == label).sum()
+            num_correct = (pred_label == label).sum()
+            self.sum_metric += num_correct
+            self.global_sum_metric += num_correct
             self.num_inst += len(pred_label)
+            self.global_num_inst += len(pred_label)
 
 
 @register
@@ -467,7 +548,8 @@ def __init__(self, top_k=1, name='top_k_accuracy',
                  output_names=None, label_names=None):
         super(TopKAccuracy, self).__init__(
             name, top_k=top_k,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self.top_k = top_k
         assert(self.top_k > 1), 'Please use Accuracy if top_k is no more than 1'
         self.name += '_%d' % self.top_k
@@ -487,7 +569,11 @@ def update(self, labels, preds):
 
         for label, pred_label in zip(labels, preds):
             assert(len(pred_label.shape) <= 2), 'Predictions should be no more than 2 dims'
-            pred_label = numpy.argsort(pred_label.asnumpy().astype('float32'), axis=1)
+            # Using argpartition here instead of argsort is safe because
+            # we do not care about the order of top k elements. It is
+            # much faster, which is important since that computation is
+            # single-threaded due to Python GIL.
+            pred_label = numpy.argpartition(pred_label.asnumpy().astype('float32'), -self.top_k)
             label = label.asnumpy().astype('int32')
             check_label_shapes(label, pred_label)
             num_samples = pred_label.shape[0]
@@ -498,8 +584,11 @@ def update(self, labels, preds):
                 num_classes = pred_label.shape[1]
                 top_k = min(num_classes, self.top_k)
                 for j in range(top_k):
-                    self.sum_metric += (pred_label[:, num_classes - 1 - j].flat == label.flat).sum()
+                    num_correct = (pred_label[:, num_classes - 1 - j].flat == label.flat).sum()
+                    self.sum_metric += num_correct
+                    self.global_sum_metric += num_correct
             self.num_inst += num_samples
+            self.global_num_inst += num_samples
 
 
 class _BinaryClassificationMetrics(object):
@@ -515,6 +604,10 @@ def __init__(self):
         self.false_negatives = 0
         self.false_positives = 0
         self.true_negatives = 0
+        self.global_true_positives = 0
+        self.global_false_negatives = 0
+        self.global_false_positives = 0
+        self.global_true_negatives = 0
 
     def update_binary_stats(self, label, pred):
         """
@@ -542,10 +635,18 @@ def update_binary_stats(self, label, pred):
         label_true = (label == 1)
         label_false = 1 - label_true
 
-        self.true_positives += (pred_true * label_true).sum()
-        self.false_positives += (pred_true * label_false).sum()
-        self.false_negatives += (pred_false * label_true).sum()
-        self.true_negatives += (pred_false * label_false).sum()
+        true_pos = (pred_true * label_true).sum()
+        false_pos = (pred_true * label_false).sum()
+        false_neg = (pred_false * label_true).sum()
+        true_neg = (pred_false * label_false).sum()
+        self.true_positives += true_pos
+        self.global_true_positives += true_pos
+        self.false_positives += false_pos
+        self.global_false_positives += false_pos
+        self.false_negatives += false_neg
+        self.global_false_negatives += false_neg
+        self.true_negatives += true_neg
+        self.global_true_negatives += true_neg
 
     @property
     def precision(self):
@@ -554,6 +655,13 @@ def precision(self):
         else:
             return 0.
 
+    @property
+    def global_precision(self):
+        if self.global_true_positives + self.global_false_positives > 0:
+            return float(self.global_true_positives) / (self.global_true_positives + self.global_false_positives)
+        else:
+            return 0.
+
     @property
     def recall(self):
         if self.true_positives + self.false_negatives > 0:
@@ -561,6 +669,13 @@ def recall(self):
         else:
             return 0.
 
+    @property
+    def global_recall(self):
+        if self.global_true_positives + self.global_false_negatives > 0:
+            return float(self.global_true_positives) / (self.global_true_positives + self.global_false_negatives)
+        else:
+            return 0.
+
     @property
     def fscore(self):
         if self.precision + self.recall > 0:
@@ -569,17 +684,33 @@ def fscore(self):
             return 0.
 
     @property
-    def matthewscc(self):
+    def global_fscore(self):
+        if self.global_precision + self.global_recall > 0:
+            return 2 * self.global_precision * self.global_recall / (self.global_precision + self.global_recall)
+        else:
+            return 0.
+
+    def matthewscc(self, use_global=False):
         """
         Calculate the Matthew's Correlation Coefficent
         """
-        if not self.total_examples:
-            return 0.
+        if use_global:
+            if not self.global_total_examples:
+                return 0.
+
+            true_pos = float(self.global_true_positives)
+            false_pos = float(self.global_false_positives)
+            false_neg = float(self.global_false_negatives)
+            true_neg = float(self.global_true_negatives)
+        else:
+            if not self.total_examples:
+                return 0.
+
+            true_pos = float(self.true_positives)
+            false_pos = float(self.false_positives)
+            false_neg = float(self.false_negatives)
+            true_neg = float(self.true_negatives)
 
-        true_pos = float(self.true_positives)
-        false_pos = float(self.false_positives)
-        false_neg = float(self.false_negatives)
-        true_neg = float(self.true_negatives)
         terms = [(true_pos + false_pos),
                  (true_pos + false_neg),
                  (true_neg + false_pos),
@@ -594,11 +725,26 @@ def total_examples(self):
         return self.false_negatives + self.false_positives + \
                self.true_negatives + self.true_positives
 
+    @property
+    def global_total_examples(self):
+        return self.global_false_negatives + self.global_false_positives + \
+               self.global_true_negatives + self.global_true_positives
+
+    def local_reset_stats(self):
+        self.false_positives = 0
+        self.false_negatives = 0
+        self.true_positives = 0
+        self.true_negatives = 0
+
     def reset_stats(self):
         self.false_positives = 0
         self.false_negatives = 0
         self.true_positives = 0
         self.true_negatives = 0
+        self.global_false_positives = 0
+        self.global_false_negatives = 0
+        self.global_true_positives = 0
+        self.global_true_negatives = 0
 
 
 @register
@@ -649,7 +795,8 @@ def __init__(self, name='f1',
         self.average = average
         self.metrics = _BinaryClassificationMetrics()
         EvalMetric.__init__(self, name=name,
-                            output_names=output_names, label_names=label_names)
+                            output_names=output_names, label_names=label_names,
+                            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -669,18 +816,30 @@ def update(self, labels, preds):
 
         if self.average == "macro":
             self.sum_metric += self.metrics.fscore
+            self.global_sum_metric += self.metrics.global_fscore
             self.num_inst += 1
+            self.global_num_inst += 1
             self.metrics.reset_stats()
         else:
             self.sum_metric = self.metrics.fscore * self.metrics.total_examples
+            self.global_sum_metric = self.metrics.global_fscore * self.metrics.global_total_examples
             self.num_inst = self.metrics.total_examples
+            self.global_num_inst = self.metrics.global_total_examples
 
     def reset(self):
         """Resets the internal evaluation result to initial state."""
         self.sum_metric = 0.
-        self.num_inst = 0.
+        self.num_inst = 0
+        self.global_num_inst = 0
+        self.global_sum_metric = 0.0
         self.metrics.reset_stats()
 
+    def reset_local(self):
+        """Resets the local portion of the internal evaluation results to initial state."""
+        self.sum_metric = 0.
+        self.num_inst = 0
+        self.metrics.local_reset_stats()
+
 
 @register
 class MCC(EvalMetric):
@@ -750,7 +909,8 @@ def __init__(self, name='mcc',
         self._average = average
         self._metrics = _BinaryClassificationMetrics()
         EvalMetric.__init__(self, name=name,
-                            output_names=output_names, label_names=label_names)
+                            output_names=output_names, label_names=label_names,
+                            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -769,19 +929,32 @@ def update(self, labels, preds):
             self._metrics.update_binary_stats(label, pred)
 
         if self._average == "macro":
-            self.sum_metric += self._metrics.matthewscc
+            self.sum_metric += self._metrics.matthewscc()
+            self.global_sum_metric += self._metrics.matthewscc(use_global=True)
             self.num_inst += 1
+            self.global_num_inst += 1
             self._metrics.reset_stats()
         else:
-            self.sum_metric = self._metrics.matthewscc * self._metrics.total_examples
+            self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples
+            self.global_sum_metric = self._metrics.matthewscc(use_global=True) * \
+                                     self._metrics.global_total_examples
             self.num_inst = self._metrics.total_examples
+            self.global_num_inst = self._metrics.global_total_examples
 
     def reset(self):
         """Resets the internal evaluation result to initial state."""
         self.sum_metric = 0.
         self.num_inst = 0.
+        self.global_sum_metric = 0.
+        self.global_num_inst = 0.
         self._metrics.reset_stats()
 
+    def reset_local(self):
+        """Resets the local portion of the internal evaluation results to initial state."""
+        self.sum_metric = 0.
+        self.num_inst = 0.
+        self._metrics.local_reset_stats()
+
 
 @register
 class Perplexity(EvalMetric):
@@ -841,7 +1014,8 @@ def __init__(self, ignore_label, axis=-1, name='perplexity',
                  output_names=None, label_names=None):
         super(Perplexity, self).__init__(
             name, ignore_label=ignore_label,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self.ignore_label = ignore_label
         self.axis = axis
 
@@ -871,7 +1045,9 @@ def update(self, labels, preds):
             loss -= ndarray.sum(ndarray.log(ndarray.maximum(1e-10, pred))).asscalar()
             num += pred.size
         self.sum_metric += loss
+        self.global_sum_metric += loss
         self.num_inst += num
+        self.global_num_inst += num
 
     def get(self):
         """Returns the current evaluation result.
@@ -881,7 +1057,23 @@ def get(self):
         Tuple of (str, float)
             Representing name of the metric and evaluation result.
         """
-        return (self.name, math.exp(self.sum_metric/self.num_inst))
+        if self.num_inst == 0:
+            return (self.name, float('nan'))
+        else:
+            return (self.name, math.exp(self.sum_metric/self.num_inst))
+
+    def get_global(self):
+        """Returns the current global evaluation result.
+
+        Returns
+        -------
+        Tuple of (str, float)
+            Representing name of the metric and evaluation result.
+        """
+        if self.global_num_inst == 0:
+            return (self.name, float('nan'))
+        else:
+            return (self.name, math.exp(self.global_sum_metric/self.global_num_inst))
 
 ####################
 # REGRESSION METRICS
@@ -921,7 +1113,8 @@ class MAE(EvalMetric):
     def __init__(self, name='mae',
                  output_names=None, label_names=None):
         super(MAE, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -945,8 +1138,11 @@ def update(self, labels, preds):
             if len(pred.shape) == 1:
                 pred = pred.reshape(pred.shape[0], 1)
 
-            self.sum_metric += numpy.abs(label - pred).mean()
+            mae = numpy.abs(label - pred).mean()
+            self.sum_metric += mae
+            self.global_sum_metric += mae
             self.num_inst += 1 # numpy.prod(label.shape)
+            self.global_num_inst += 1 # numpy.prod(label.shape)
 
 
 @register
@@ -981,7 +1177,8 @@ class MSE(EvalMetric):
     def __init__(self, name='mse',
                  output_names=None, label_names=None):
         super(MSE, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -1005,8 +1202,11 @@ def update(self, labels, preds):
             if len(pred.shape) == 1:
                 pred = pred.reshape(pred.shape[0], 1)
 
-            self.sum_metric += ((label - pred)**2.0).mean()
+            mse = ((label - pred)**2.0).mean()
+            self.sum_metric += mse
+            self.global_sum_metric += mse
             self.num_inst += 1 # numpy.prod(label.shape)
+            self.global_num_inst += 1 # numpy.prod(label.shape)
 
 
 @register
@@ -1041,7 +1241,8 @@ class RMSE(EvalMetric):
     def __init__(self, name='rmse',
                  output_names=None, label_names=None):
         super(RMSE, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -1065,8 +1266,11 @@ def update(self, labels, preds):
             if len(pred.shape) == 1:
                 pred = pred.reshape(pred.shape[0], 1)
 
-            self.sum_metric += numpy.sqrt(((label - pred)**2.0).mean())
+            rmse = numpy.sqrt(((label - pred)**2.0).mean())
+            self.sum_metric += rmse
+            self.global_sum_metric += rmse
             self.num_inst += 1
+            self.global_num_inst += 1
 
 
 @register
@@ -1110,7 +1314,8 @@ def __init__(self, eps=1e-12, name='cross-entropy',
                  output_names=None, label_names=None):
         super(CrossEntropy, self).__init__(
             name, eps=eps,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self.eps = eps
 
     def update(self, labels, preds):
@@ -1134,8 +1339,11 @@ def update(self, labels, preds):
             assert label.shape[0] == pred.shape[0]
 
             prob = pred[numpy.arange(label.shape[0]), numpy.int64(label)]
-            self.sum_metric += (-numpy.log(prob + self.eps)).sum()
+            cross_entropy = (-numpy.log(prob + self.eps)).sum()
+            self.sum_metric += cross_entropy
+            self.global_sum_metric += cross_entropy
             self.num_inst += label.shape[0]
+            self.global_num_inst += label.shape[0]
 
 @register
 @alias('nll_loss')
@@ -1178,7 +1386,8 @@ def __init__(self, eps=1e-12, name='nll-loss',
                  output_names=None, label_names=None):
         super(NegativeLogLikelihood, self).__init__(
             name, eps=eps,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self.eps = eps
 
     def update(self, labels, preds):
@@ -1202,8 +1411,11 @@ def update(self, labels, preds):
             num_examples = pred.shape[0]
             assert label.shape[0] == num_examples, (label.shape[0], num_examples)
             prob = pred[numpy.arange(num_examples, dtype=numpy.int64), numpy.int64(label)]
-            self.sum_metric += (-numpy.log(prob + self.eps)).sum()
+            nll = (-numpy.log(prob + self.eps)).sum()
+            self.sum_metric += nll
+            self.global_sum_metric += nll
             self.num_inst += num_examples
+            self.global_num_inst += num_examples
 
 @register
 @alias('pearsonr')
@@ -1238,7 +1450,8 @@ class PearsonCorrelation(EvalMetric):
     def __init__(self, name='pearsonr',
                  output_names=None, label_names=None):
         super(PearsonCorrelation, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
 
     def update(self, labels, preds):
         """Updates the internal evaluation result.
@@ -1256,8 +1469,11 @@ def update(self, labels, preds):
             check_label_shapes(label, pred, False, True)
             label = label.asnumpy()
             pred = pred.asnumpy()
-            self.sum_metric += numpy.corrcoef(pred.ravel(), label.ravel())[0, 1]
+            pearson_corr = numpy.corrcoef(pred.ravel(), label.ravel())[0, 1]
+            self.sum_metric += pearson_corr
+            self.global_sum_metric += pearson_corr
             self.num_inst += 1
+            self.global_num_inst += 1
 
 
 @register
@@ -1278,7 +1494,8 @@ class Loss(EvalMetric):
     def __init__(self, name='loss',
                  output_names=None, label_names=None):
         super(Loss, self).__init__(
-            name, output_names=output_names, label_names=label_names)
+            name, output_names=output_names, label_names=label_names,
+            has_global_stats=True)
 
     def update(self, _, preds):
 
@@ -1286,8 +1503,11 @@ def update(self, _, preds):
             preds = [preds]
 
         for pred in preds:
-            self.sum_metric += ndarray.sum(pred).asscalar()
+            loss = ndarray.sum(pred).asscalar()
+            self.sum_metric += loss
+            self.global_sum_metric += loss
             self.num_inst += pred.size
+            self.global_num_inst += pred.size
 
 
 @register
@@ -1353,7 +1573,8 @@ def __init__(self, feval, name=None, allow_extra_outputs=False,
         super(CustomMetric, self).__init__(
             name, feval=feval,
             allow_extra_outputs=allow_extra_outputs,
-            output_names=output_names, label_names=label_names)
+            output_names=output_names, label_names=label_names,
+            has_global_stats=True)
         self._feval = feval
         self._allow_extra_outputs = allow_extra_outputs
 
@@ -1379,10 +1600,14 @@ def update(self, labels, preds):
             if isinstance(reval, tuple):
                 (sum_metric, num_inst) = reval
                 self.sum_metric += sum_metric
+                self.global_sum_metric += sum_metric
                 self.num_inst += num_inst
+                self.global_num_inst += num_inst
             else:
                 self.sum_metric += reval
+                self.global_sum_metric += reval
                 self.num_inst += 1
+                self.global_num_inst += 1
 
     def get_config(self):
         raise NotImplementedError("CustomMetric cannot be serialized")
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 2666f8bbcd4f..38fe739154d5 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -62,6 +62,11 @@ def _create_sparse_kvstore(kvstore):
     ----------
     kvstore : KVStore or str
         The kvstore.
+
+    Returns
+    -------
+    kvstore : KVStore
+    update_on_kvstore : bool. Always True.
     """
     # always update on kvstore
     update_on_kvstore = True
diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index babea53d6e40..ca8463153686 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -22,7 +22,6 @@
 import time
 import logging
 import warnings
-import copy
 import numpy as np
 
 from .. import metric
@@ -508,7 +507,6 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
             validation_metric = eval_metric
         if not isinstance(eval_metric, metric.EvalMetric):
             eval_metric = metric.create(eval_metric)
-        epoch_eval_metric = copy.deepcopy(eval_metric)
 
         ################################################################################
         # training loop
@@ -516,7 +514,6 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
         for epoch in range(begin_epoch, num_epoch):
             tic = time.time()
             eval_metric.reset()
-            epoch_eval_metric.reset()
             nbatch = 0
             data_iter = iter(train_data)
             end_of_batch = False
@@ -532,12 +529,8 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
                     self.update_metric(eval_metric,
                                        [db.label for db in data_batch],
                                        pre_sliced=True)
-                    self.update_metric(epoch_eval_metric,
-                                       [db.label for db in data_batch],
-                                       pre_sliced=True)
                 else:
                     self.update_metric(eval_metric, data_batch.label)
-                    self.update_metric(epoch_eval_metric, data_batch.label)
 
                 try:
                     # pre fetch next batch
@@ -550,7 +543,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
                     monitor.toc_print()
 
                 if end_of_batch:
-                    eval_name_vals = epoch_eval_metric.get_name_value()
+                    eval_name_vals = eval_metric.get_global_name_value()
 
                 if batch_end_callback is not None:
                     batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch,
diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index a085b6fe2ef6..d290a3f2fea2 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -43,33 +43,33 @@ class Optimizer(object):
 
     Parameters
     ----------
-    rescale_grad : float, optional
+    rescale_grad : float, optional, default 1.0
         Multiply the gradient with `rescale_grad` before updating. Often
         choose to be ``1.0/batch_size``.
 
-    param_idx2name : dict from int to string, optional
+    param_idx2name : dict from int to string, optional, default None
         A dictionary that maps int index to string name.
 
-    clip_gradient : float, optional
+    clip_gradient : float, optional, default None
         Clip the gradient by projecting onto the box ``[-clip_gradient, clip_gradient]``.
 
-    learning_rate : float, optional
+    learning_rate : float, optional, default 0.01
         The initial learning rate.
 
-    lr_scheduler : LRScheduler, optional
+    lr_scheduler : LRScheduler, optional, default None
         The learning rate scheduler.
 
-    wd : float, optional
+    wd : float, optional, default 0.0
         The weight decay (or L2 regularization) coefficient. Modifies objective
         by adding a penalty for having large weights.
 
-    sym: Symbol, optional
+    sym: Symbol, optional, default None
         The Symbol this optimizer is applying to.
 
-    begin_num_update : int, optional
+    begin_num_update : int, optional, default 0
         The initial number of updates.
 
-    multi_precision : bool, optional
+    multi_precision : bool, optional, default False
        Flag to control the internal precision of the optimizer.::
 
            False: results in using the same precision as the weights (default),
@@ -77,6 +77,10 @@ class Optimizer(object):
            in 32-bit precision even if actual weights used in the model have lower precision.
            Turning this on can improve convergence and accuracy when training with float16.
 
+    param_dict : dict of int -> gluon.Parameter, default None
+        Dictionary of parameter index to gluon.Parameter, used to lookup parameter attributes
+        such as lr_mult, wd_mult, etc. param_dict shall not be deep copied.
+
     Properties
     ----------
     learning_rate : float
@@ -1026,13 +1030,14 @@ class Adam(Optimizer):
     Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980.
 
     If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \
-    **lazy updates** are applied by::
+    **lazy updates** at step t are applied by::
 
         for row in grad.indices:
             rescaled_grad[row] = clip(grad[row] * rescale_grad + wd * weight[row], clip_gradient)
             m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row]
             v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2)
-            w[row] = w[row] - learning_rate * m[row] / (sqrt(v[row]) + epsilon)
+            lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t)
+            w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon)
 
     The lazy update only updates the mean and var for the weights whose row_sparse
     gradient indices appear in the current batch, rather than updating it for all indices.
@@ -1040,12 +1045,13 @@ class Adam(Optimizer):
     throughput for some applications. However, it provides slightly different semantics than
     the original update, and may lead to different empirical results.
 
-    Otherwise, **standard updates** are applied by::
+    Otherwise, **standard updates** at step t are applied by::
 
         rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
         m = beta1 * m + (1 - beta1) * rescaled_grad
         v = beta2 * v + (1 - beta2) * (rescaled_grad**2)
-        w = w - learning_rate * m / (sqrt(v) + epsilon)
+        lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t)
+        w = w - lr * m / (sqrt(v) + epsilon)
 
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
diff --git a/readthedocs.yml b/readthedocs.yml
index 3787245bd0e5..70e32ae4b6e0 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 formats:
         - none
 requirements_file: docs/requirements.txt
diff --git a/scala-package/.gitignore b/scala-package/.gitignore
index 8bc87f53e802..22b12b31d050 100644
--- a/scala-package/.gitignore
+++ b/scala-package/.gitignore
@@ -1,9 +1,10 @@
 .flattened-pom.xml
 core/src/main/scala/org/apache/mxnet/NDArrayAPIBase.scala
 core/src/main/scala/org/apache/mxnet/NDArrayBase.scala
+core/src/main/scala/org/apache/mxnet/NDArrayRandomAPIBase.scala
 core/src/main/scala/org/apache/mxnet/javaapi/NDArrayBase.scala
 core/src/main/scala/org/apache/mxnet/SymbolAPIBase.scala
 core/src/main/scala/org/apache/mxnet/SymbolBase.scala
+core/src/main/scala/org/apache/mxnet/SymbolRandomAPIBase.scala
 examples/scripts/infer/images/
 examples/scripts/infer/models/
-local-snapshot
\ No newline at end of file
diff --git a/scala-package/README.md b/scala-package/README.md
index 20fbee2469b0..3859e5f32240 100644
--- a/scala-package/README.md
+++ b/scala-package/README.md
@@ -1,193 +1,202 @@
-<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/mxnet2.png width=135/> Deep Learning for Scala/Java
+MXNet Package for Scala/Java
 =====
 
-[![Build Status](http://jenkins.mxnet-ci.amazon-ml.com/job/incubator-mxnet/job/master/badge/icon)](http://jenkins.mxnet-ci.amazon-ml.com/job/incubator-mxnet/job/master/)
-[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
-
-Here you find the MXNet Scala Package!
-It brings flexible and efficient GPU/CPU computing and state-of-art deep learning to JVM.
+The MXNet Scala/Java Package brings flexible and efficient GPU/CPU computing and state-of-art deep learning to JVM.
 
 - It enables you to write seamless tensor/matrix computation with multiple GPUs
   in Scala, Java and other languages built on JVM.
 - It also enables you to construct and customize the state-of-art deep learning models in JVM languages,
   and apply them to tasks such as image classification and data science challenges.
+- The Scala/Java Inference APIs provide an easy, out-of-the-box solution for loading pre-trained MXNet models and running inference on them.
   
-Install
-------------
- 
-Technically, all you need is the `mxnet-full_2.11-{arch}-{xpu}-{version}.jar` in your classpath.
-It will automatically extract the native library to a tempfile and load it.
-You can find the pre-built jar file in [here](https://search.maven.org/search?q=g:org.apache.mxnet)
- and also our nightly build package [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~)
+Pre-Built Maven Packages
+------------------------
+
+### Stable ###
+
+The MXNet Scala/Java packages can be easily included in your Maven managed project.
+The stable jar files for the packages are available on the [MXNet Maven Package Repository](https://search.maven.org/search?q=g:org.apache.mxnet)
+Currently we provide packages for Linux (Ubuntu 16.04) (CPU and GPU) and macOS (CPU only). Stable packages for Windows and CentOS will come soon. For now, if you have a CentOS machine, follow the ```Build From Source``` section below. 
 
-Currently we provide `linux-x86_64-gpu`, `linux-x86_64-cpu` and `osx-x86_64-cpu`. Support for Windows will come soon.
-Use the following dependency in maven, change the artifactId according to your own architecture, e.g., `mxnet-full_2.11-osx-x86_64-cpu` for OSX (and cpu-only).
+To add MXNet Scala/Java package to your project, add the dependency as shown below corresponding to your platform, under the ```dependencies``` tag in your project's ```pom.xml``` :
+
+**Linux GPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
 
 ```HTML
 <dependency>
   <groupId>org.apache.mxnet</groupId>
-  <artifactId>mxnet-full_2.10-linux-x86_64-gpu</artifactId>
-  <version>0.1.1</version>
+  <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+  <version>[1.3.1,)</version>
 </dependency>
 ```
 
-You can also use `mxnet-core_2.10-0.1.1.jar` and put the compiled native library somewhere in your load path.
+**Linux CPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+  <version>[1.3.1,)</version>
+</dependency>
+```
+
+**macOS CPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-macOS cpu-green.svg" alt="maven badge"/></a>
 
 ```HTML
 <dependency>
   <groupId>org.apache.mxnet</groupId>
-  <artifactId>mxnet-core_2.10</artifactId>
-  <version>0.1.1</version>
+  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+  <version>[1.3.1,)</version>
 </dependency>
 ```
 
-If you have some native libraries conflict with the ones in the provided 'full' jar (e.g., you use openblas instead of atlas), this is a recommended way.
-Refer to the next section for how to build it from the very source.
+**Note:** ```<version>[1.3.1,)</version>``` indicates that we will fetch packages with version 1.3.1 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven.  
 
-Build
-------------
+### Nightly ###
 
-Checkout the [Installation Guide](http://mxnet.incubator.apache.org/install/index.html) contains instructions to install mxnet.
-Then you can compile the Scala Package by
+Apart from these, the nightly builds representing the bleeding edge development  on Scala/Java packages are also available on the [MXNet Maven Nexus Package Repository](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~). 
+Currently we provide nightly packages for Linux (CPU and GPU) and MacOS (CPU only). The Linux nightly jar files also work on CentOS. Nightly packages for Windows will come soon.
+
+Add the following ```repository``` to your project's ```pom.xml``` file : 
+
+````html
+<repositories>
+    <repository>
+      <id>Apache Snapshot</id>
+      <url>https://repository.apache.org/content/groups/snapshots</url>
+    </repository>
+</repositories>
+````
+
+Also, add the dependency which corresponds to your platform to the ```dependencies``` tag :
+
+**Linux GPU**
+
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~mxnet-full_2.11-linux-x86_64-gpu~~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux gpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
+  <version>[1.5.0,)</version>
+</dependency>
+```
+
+**Linux CPU**
+
+<a href="https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~mxnet-full_2.11-linux-x86_64-cpu~~~"><img src="https://img.shields.io/badge/org.apache.mxnet-linux cpu-green.svg" alt="maven badge"/></a>
+
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
+  <version>[1.5.0,)</version>
+</dependency>
+```
+
+**macOS CPU**
+
+<a href="https://mvnrepository.com/artifact/org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu"><img src="https://img.shields.io/badge/org.apache.mxnet-macOS cpu-green.svg" alt="maven badge"/></a>
+```HTML
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
+  <version>[1.5.0,)</version>
+</dependency>
+```
+
+**Note:** ```<version>[1.5.0,)</version>``` indicates that we will fetch packages with version 1.5.0 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven Snapshot repository.
+
+Build From Source
+-----------------
+
+Check out the [Installation Guide](http://mxnet.incubator.apache.org/install/index.html), which contains instructions to install the mxnet package and build it from source. The Scala Maven build assumes you already have a ``lib/libmxnet.so`` file.
+If you have built MXNet from source and are looking to set up Scala from that point, you may simply run the following from the MXNet source root. The Scala build will detect your platform (OSX/Linux) and libmxnet.so flavor (CPU/GPU):
 
 ```bash
-make scalapkg
+cd scala-package
+mvn install
 ```
 
-(Optional) run unit/integration tests by
+You can also run the unit tests and integration tests on the Scala Package by :
 
 ```bash
-make scalaunittest
-make scalaintegrationtest
+cd scala-package
+mvn integration-test -DskipTests=false
 ```
 
-Or run a subset of unit tests by, e.g.,
+Or run a subset of unit tests, for example,
 
 ```bash
-make SCALA_TEST_ARGS=-Dsuites=org.apache.mxnet.NDArraySuite scalaunittest
+cd scala-package
+mvn -Dsuites=org.apache.mxnet.NDArraySuite integration-test
 ```
 
 If everything goes well, you will find jars for `assembly`, `core` and `example` modules.
-Also it produces the native library in `native/{your-architecture}/target`, which you can use to cooperate with the `core` module.
+Also it produces the native library in `native/target`, which you can use to cooperate with the `core` module.
+
+Deploy to repository
+--------------------
 
-Once you've downloaded and unpacked MNIST dataset to `./data/`, run the training example by
+By default, `mvn deploy` will deploy artifacts to the local file system; you can find them in the ``scala-package/deploy/target/repo`` folder.
+
+For the nightly build in CI, a snapshot build will be uploaded to the Apache repository with the following command:
 
 ```bash
-java -Xmx4G -cp \
-  scala-package/assembly/{your-architecture}/target/*:scala-package/examples/target/*:scala-package/examples/target/classes/lib/* \
-  org.apache.mxnet.examples.imclassification.TrainMnist \
-  --data-dir=./data/ \
-  --num-epochs=10 \
-  --network=mlp \
-  --cpus=0,1,2,3
+cd scala-package
+mvn deploy -Pnightly
 ```
 
-If you've compiled with `USE_DIST_KVSTORE` enabled, the python tools in `mxnet/tracker` can be used to launch distributed training.
-The following command runs the above example using 2 worker nodes (and 2 server nodes) in local. Refer to [Distributed Training](http://mxnet.incubator.apache.org/how_to/multi_devices.html) for more details.
+Use the following command to deploy a release build (push artifacts to the Apache staging repository):
 
 ```bash
-tracker/dmlc_local.py -n 2 -s 2 \
-  java -Xmx4G -cp \
-  scala-package/assembly/{your-architecture}/target/*:scala-package/examples/target/*:scala-package/examples/target/classes/lib/* \
-  org.apache.mxnet.examples.imclassification.TrainMnist \
-  --data-dir=./data/ \
-  --num-epochs=10 \
-  --network=mlp \
-  --cpus=0 \
-  --kv-store=dist_sync
+cd scala-package
+mvn deploy -Pstaging
 ```
 
-Change the arguments and have fun!
+Examples & Usage
+-------
+- To set up the Scala Project using IntelliJ IDE on macOS follow the instructions [here](https://mxnet.incubator.apache.org/tutorials/scala/mxnet_scala_on_intellij.html).
+- Several examples on using the Scala APIs are provided in the [Scala Examples Folder](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/)
 
-Usage
+Scala Training APIs
 -------
-Here is a Scala example of what training a simple 3-layer multilayer perceptron on MNIST looks like. You can download the MNIST dataset using [get_mnist_data script](/~https://github.com/apache/incubator-mxnet/blob/master/scala-package/core/scripts/get_mnist_data.sh).
-
-```scala
-import org.apache.mxnet._
-import org.apache.mxnet.optimizer.SGD
-
-// model definition
-val data = Symbol.Variable("data")
-val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128))
-val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64))
-val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10))
-val mlp = Symbol.SoftmaxOutput(name = "sm")()(Map("data" -> fc3))
-
-// load MNIST dataset
-val trainDataIter = IO.MNISTIter(Map(
-  "image" -> "data/train-images-idx3-ubyte",
-  "label" -> "data/train-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0",
-  "silent" -> "0",
-  "seed" -> "10"))
-
-val valDataIter = IO.MNISTIter(Map(
-  "image" -> "data/t10k-images-idx3-ubyte",
-  "label" -> "data/t10k-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0", "silent" -> "0"))
-
-// setup model and fit the training data
-val model = FeedForward.newBuilder(mlp)
-      .setContext(Context.cpu())
-      .setNumEpoch(10)
-      .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f))
-      .setTrainData(trainDataIter)
-      .setEvalData(valDataIter)
-      .build()
-```
+- Module API :
+[The Module API](https://mxnet.incubator.apache.org/api/scala/module.html) provides an intermediate and high-level interface for performing computation with neural networks in MXNet. Modules provide high-level APIs for training, predicting, and evaluating.
 
-Predict using the model in the following way:
-
-```scala
-val probArrays = model.predict(valDataIter)
-// in this case, we do not have multiple outputs
-require(probArrays.length == 1)
-val prob = probArrays(0)
-
-// get real labels
-import scala.collection.mutable.ListBuffer
-valDataIter.reset()
-val labels = ListBuffer.empty[NDArray]
-while (valDataIter.hasNext) {
-  val evalData = valDataIter.next()
-  labels += evalData.label(0).copy()
-}
-val y = NDArray.concatenate(labels)
-
-// get predicted labels
-val py = NDArray.argmax_channel(prob)
-require(y.shape == py.shape)
-
-// calculate accuracy
-var numCorrect = 0
-var numInst = 0
-for ((labelElem, predElem) <- y.toArray zip py.toArray) {
-  if (labelElem == predElem) {
-    numCorrect += 1
-  }
-  numInst += 1
-}
-val acc = numCorrect.toFloat / numInst
-println(s"Final accuracy = $acc")
-```
+- KVStore API : 
+To run training over multiple GPUs and multiple hosts, one can use the [KVStore API](https://mxnet.incubator.apache.org/api/scala/kvstore.html).
+
+- IO/Data Loading : 
+MXNet Scala provides APIs for preparing data to feed as an input to models. Check out [Data Loading API](https://mxnet.incubator.apache.org/api/scala/io.html) for more info.
+ 
+Other available Scala APIs for training can be found [here](https://mxnet.incubator.apache.org/api/scala/index.html).  
+ 
+
+Scala Inference APIs
+-------
+The [Scala Inference APIs](https://mxnet.incubator.apache.org/api/scala/infer.html) provide an easy, out of the box solution to load a pre-trained MXNet model and run inference on it. The Inference APIs are present in the [Infer Package](https://github.com/apache/incubator-mxnet/tree/master/scala-package/infer) under the MXNet Scala Package repository, while the documentation for the Infer API is available [here](https://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.infer.package).  
 
-Release
+Java Inference APIs
 -------
-- Version 0.1.1, March 24, 2016.
-  - Bug fix for MAE & MSE metrics.
-- Version 0.1.0, March 22, 2016.
+The [Java Inference APIs](http://mxnet.incubator.apache.org/api/java/index.html) also provide an easy, out of the box solution to load a pre-trained MXNet model and run inference on it. The Inference APIs are present in the [Infer Package](https://github.com/apache/incubator-mxnet/tree/master/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi) under the MXNet Scala Package repository, while the documentation for the Infer API is available [here](https://mxnet.incubator.apache.org/api/java/docs/index.html#org.apache.mxnet.infer.package).
+More APIs will be added to the Java Inference APIs soon.
+
+JVM Memory Management
+-------
+The Scala/Java APIs also provide an automated resource management system, thus making it easy to manage the native memory footprint without any degradation in performance.
+More details about JVM Memory Management are available [here](https://github.com/apache/incubator-mxnet/blob/master/scala-package/memory-management.md).
 
 License
 -------
 MXNet Scala Package is licensed under [Apache-2](/~https://github.com/apache/incubator-mxnet/blob/master/scala-package/LICENSE) license.
+
+MXNet uses some 3rd party software. The following 3rd party license files are bundled inside the Scala jar file:
+* cub/LICENSE.TXT
+* mkldnn/external/mklml_mac_2019.0.1.20180928/license.txt
diff --git a/scala-package/assembly/linux-x86_64-cpu/pom.xml b/scala-package/assembly/linux-x86_64-cpu/pom.xml
deleted file mode 100644
index 1658f36e6bbd..000000000000
--- a/scala-package/assembly/linux-x86_64-cpu/pom.xml
+++ /dev/null
@@ -1,131 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>mxnet-full_2.11-linux-x86_64-cpu</artifactId>
-  <name>MXNet Scala Package - Full Linux-x86_64 CPU-only</name>
-  <packaging>jar</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>so</type>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>flatten-maven-plugin</artifactId>
-        <!--<version>1.1.0-SNAPSHOT</version>-->
-        <configuration>
-          <pomElements>
-            <dependencies>remove</dependencies>
-          </pomElements>
-        </configuration>
-        <executions>
-          <!-- enable flattening -->
-          <execution>
-            <id>flatten</id>
-            <phase>process-resources</phase>
-            <goals>
-              <goal>flatten</goal>
-            </goals>
-          </execution>
-          <!-- ensure proper cleanup -->
-          <execution>
-            <id>flatten.clean</id>
-            <phase>clean</phase>
-            <goals>
-              <goal>clean</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <inherited>false</inherited>
-        <configuration>
-          <skip>false</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-install-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>binary-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>false</appendAssemblyId>
-              <descriptors>
-                <descriptor>src/main/assembly/assembly.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>sources-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>sources</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/source.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>javadoc-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>javadoc</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/javadoc.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/assembly/linux-x86_64-cpu/src/main/assembly/assembly.xml b/scala-package/assembly/linux-x86_64-cpu/src/main/assembly/assembly.xml
deleted file mode 100644
index f4c2017c8241..000000000000
--- a/scala-package/assembly/linux-x86_64-cpu/src/main/assembly/assembly.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<assembly>
-  <id>full</id>
-  <formats>
-    <format>jar</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <dependencySets>
-    <dependencySet>
-      <includes>
-        <include>*:*:jar</include>
-      </includes>
-      <outputDirectory>/</outputDirectory>
-      <useProjectArtifact>true</useProjectArtifact>
-      <unpack>true</unpack>
-      <scope>runtime</scope>
-    </dependencySet>
-    <dependencySet>
-      <outputDirectory>lib/native</outputDirectory>
-      <outputFileNameMapping>libmxnet-scala.so</outputFileNameMapping>
-      <unpack>false</unpack>
-      <useProjectArtifact>false</useProjectArtifact>
-      <useStrictFiltering>false</useStrictFiltering>
-      <includes>
-        <include>org.apache.mxnet:libmxnet-scala-linux-x86_64-cpu:so</include>
-      </includes>
-    </dependencySet>
-  </dependencySets>
-  <files>
-    <file>
-      <source>${MXNET_DIR}/lib/libmxnet.so</source>
-      <outputDirectory>lib/native</outputDirectory>
-    </file>
-  </files>
-</assembly>
diff --git a/scala-package/assembly/linux-x86_64-gpu/pom.xml b/scala-package/assembly/linux-x86_64-gpu/pom.xml
deleted file mode 100644
index c80515e7b107..000000000000
--- a/scala-package/assembly/linux-x86_64-gpu/pom.xml
+++ /dev/null
@@ -1,131 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>mxnet-full_2.11-linux-x86_64-gpu</artifactId>
-  <name>MXNet Scala Package - Full Linux-x86_64 GPU</name>
-  <packaging>jar</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>so</type>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>flatten-maven-plugin</artifactId>
-        <!--<version>1.1.0-SNAPSHOT</version>-->
-        <configuration>
-          <pomElements>
-            <dependencies>remove</dependencies>
-          </pomElements>
-        </configuration>
-        <executions>
-          <!-- enable flattening -->
-          <execution>
-            <id>flatten</id>
-            <phase>process-resources</phase>
-            <goals>
-              <goal>flatten</goal>
-            </goals>
-          </execution>
-          <!-- ensure proper cleanup -->
-          <execution>
-            <id>flatten.clean</id>
-            <phase>clean</phase>
-            <goals>
-              <goal>clean</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <inherited>false</inherited>
-        <configuration>
-          <skip>false</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-install-plugin</artifactId>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>binary-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>false</appendAssemblyId>
-              <descriptors>
-                <descriptor>src/main/assembly/assembly.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>sources-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>sources</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/source.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>javadoc-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>javadoc</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/javadoc.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/assembly/linux-x86_64-gpu/src/main/assembly/assembly.xml b/scala-package/assembly/linux-x86_64-gpu/src/main/assembly/assembly.xml
deleted file mode 100644
index 2aca64bdf1a9..000000000000
--- a/scala-package/assembly/linux-x86_64-gpu/src/main/assembly/assembly.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<assembly>
-  <id>full</id>
-  <formats>
-    <format>jar</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <dependencySets>
-    <dependencySet>
-      <includes>
-        <include>*:*:jar</include>
-      </includes>
-      <outputDirectory>/</outputDirectory>
-      <useProjectArtifact>true</useProjectArtifact>
-      <unpack>true</unpack>
-      <scope>runtime</scope>
-    </dependencySet>
-    <dependencySet>
-      <outputDirectory>lib/native</outputDirectory>
-      <outputFileNameMapping>libmxnet-scala.so</outputFileNameMapping>
-      <unpack>false</unpack>
-      <useProjectArtifact>false</useProjectArtifact>
-      <useStrictFiltering>false</useStrictFiltering>
-      <includes>
-        <include>org.apache.mxnet:libmxnet-scala-linux-x86_64-gpu:so</include>
-      </includes>
-    </dependencySet>
-  </dependencySets>
-  <files>
-    <file>
-      <source>${MXNET_DIR}/lib/libmxnet.so</source>
-      <outputDirectory>lib/native</outputDirectory>
-    </file>
-  </files>
-</assembly>
diff --git a/scala-package/assembly/osx-x86_64-cpu/pom.xml b/scala-package/assembly/osx-x86_64-cpu/pom.xml
deleted file mode 100644
index 62979a140fdc..000000000000
--- a/scala-package/assembly/osx-x86_64-cpu/pom.xml
+++ /dev/null
@@ -1,127 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-full-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId>
-  <name>MXNet Scala Package - Full OSX-x86_64 CPU-only</name>
-  <packaging>jar</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jnilib</type>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>flatten-maven-plugin</artifactId>
-        <!--<version>1.1.0-SNAPSHOT</version>-->
-        <configuration>
-          <pomElements>
-            <dependencies>remove</dependencies>
-          </pomElements>
-        </configuration>
-        <executions>
-          <!-- enable flattening -->
-          <execution>
-            <id>flatten</id>
-            <phase>process-resources</phase>
-            <goals>
-              <goal>flatten</goal>
-            </goals>
-          </execution>
-          <!-- ensure proper cleanup -->
-          <execution>
-            <id>flatten.clean</id>
-            <phase>clean</phase>
-            <goals>
-              <goal>clean</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <inherited>false</inherited>
-        <configuration>
-          <skip>false</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-assembly-plugin</artifactId>
-        <executions>
-          <execution>
-            <id>binary-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>false</appendAssemblyId>
-              <descriptors>
-                <descriptor>src/main/assembly/assembly.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>sources-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>sources</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/source.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-          <execution>
-            <id>javadoc-jar</id>
-            <phase>package</phase>
-            <goals>
-              <goal>single</goal>
-            </goals>
-            <configuration>
-              <appendAssemblyId>true</appendAssemblyId>
-              <classifier>javadoc</classifier>
-              <descriptors>
-                <descriptor>${project.parent.basedir}/src/javadoc.xml</descriptor>
-              </descriptors>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/assembly/osx-x86_64-cpu/src/main/assembly/assembly.xml b/scala-package/assembly/osx-x86_64-cpu/src/main/assembly/assembly.xml
deleted file mode 100644
index e9bc3728fcd0..000000000000
--- a/scala-package/assembly/osx-x86_64-cpu/src/main/assembly/assembly.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<assembly>
-  <id>full</id>
-  <formats>
-    <format>jar</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <dependencySets>
-    <dependencySet>
-      <includes>
-        <include>*:*:jar</include>
-      </includes>
-      <outputDirectory>/</outputDirectory>
-      <useProjectArtifact>true</useProjectArtifact>
-      <unpack>true</unpack>
-      <scope>runtime</scope>
-    </dependencySet>
-    <dependencySet>
-      <outputDirectory>lib/native</outputDirectory>
-      <outputFileNameMapping>libmxnet-scala.jnilib</outputFileNameMapping>
-      <unpack>false</unpack>
-      <useProjectArtifact>false</useProjectArtifact>
-      <useStrictFiltering>false</useStrictFiltering>
-      <includes>
-        <include>org.apache.mxnet:libmxnet-scala-osx-x86_64-cpu:jnilib</include>
-      </includes>
-    </dependencySet>
-  </dependencySets>
-  <files>
-    <file>
-      <source>${MXNET_DIR}/lib/libmxnet.so</source>
-      <outputDirectory>lib/native</outputDirectory>
-    </file>
-  </files>
-</assembly>
diff --git a/scala-package/assembly/pom.xml b/scala-package/assembly/pom.xml
index c1d1a3b8e721..00aa8682f8fa 100644
--- a/scala-package/assembly/pom.xml
+++ b/scala-package/assembly/pom.xml
@@ -5,106 +5,109 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-full-parent_2.11</artifactId>
-  <name>MXNet Scala Package - Full Parent</name>
+  <artifactId>mxnet-full_2.11</artifactId>
+  <name>Assembly Scala Package</name>
   <packaging>pom</packaging>
 
+  <properties>
+    <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.mxnet</groupId>
+      <artifactId>mxnet-core</artifactId>
+      <version>INTERNAL</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.mxnet</groupId>
+      <artifactId>libmxnet-scala</artifactId>
+      <version>INTERNAL</version>
+      <type>${libtype}</type>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.mxnet</groupId>
+      <artifactId>mxnet-infer</artifactId>
+      <version>INTERNAL</version>
+    </dependency>
+  </dependencies>
+
   <profiles>
     <profile>
-      <id>osx-x86_64-cpu</id>
-      <modules>
-        <module>osx-x86_64-cpu</module>
-      </modules>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <modules>
-        <module>linux-x86_64-cpu</module>
-      </modules>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <modules>
-        <module>linux-x86_64-gpu</module>
-      </modules>
-    </profile>
-    <profile>
-      <id>release</id>
+      <id>staging</id>
       <build>
         <plugins>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-deploy-plugin</artifactId>
-            <configuration>
-              <skip>true</skip>
-            </configuration>
-          </plugin>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-source-plugin</artifactId>
-            <executions>
-              <execution>
-                <phase>package</phase>
-                <goals>
-                  <goal>jar-no-fork</goal>
-                </goals>
-                <configuration>
-                  <includePom>true</includePom>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-javadoc-plugin</artifactId>
-            <executions>
-              <execution>
-                <phase>package</phase>
-                <goals>
-                  <goal>jar</goal>
-                </goals>
-                <configuration>
-                  <includeDependencySources>true</includeDependencySources>
-                  <dependencySourceExcludes>
-                    <dependencySourceExclude>commons-codec:*</dependencySourceExclude>
-                    <dependencySourceExclude>org.scala-lang:*</dependencySourceExclude>
-                    <dependencySourceExclude>log4j:*</dependencySourceExclude>
-                    <dependencySourceExclude>org.slf4j:*</dependencySourceExclude>
-                  </dependencySourceExcludes>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
           <plugin>
             <groupId>org.apache.maven.plugins</groupId>
             <artifactId>maven-gpg-plugin</artifactId>
+            <version>1.6</version>
             <executions>
               <execution>
                 <id>sign-artifacts</id>
-                <phase>verify</phase>
+                <phase>deploy</phase>
                 <goals>
                   <goal>sign</goal>
                 </goals>
               </execution>
             </executions>
           </plugin>
-          <plugin>
-            <groupId>org.sonatype.plugins</groupId>
-            <artifactId>nexus-staging-maven-plugin</artifactId>
-            <extensions>true</extensions>
-            <configuration>
-              <serverId>ossrh</serverId>
-              <nexusUrl>https://oss.sonatype.org/</nexusUrl>
-              <autoReleaseAfterClose>true</autoReleaseAfterClose>
-            </configuration>
-          </plugin>
         </plugins>
       </build>
     </profile>
   </profiles>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>binary-jar</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <appendAssemblyId>false</appendAssemblyId>
+              <descriptors>
+                <descriptor>src/main/assembly/assembly.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+          <execution>
+            <id>sources-jar</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <appendAssemblyId>true</appendAssemblyId>
+              <descriptors>
+                <descriptor>src/main/assembly/source.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+          <execution>
+            <id>javadoc-jar</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+            <configuration>
+              <appendAssemblyId>true</appendAssemblyId>
+              <descriptors>
+                <descriptor>src/main/assembly/javadoc.xml</descriptor>
+              </descriptors>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
 </project>
diff --git a/scala-package/assembly/src/javadoc.xml b/scala-package/assembly/src/javadoc.xml
deleted file mode 100644
index 9d0be80e7697..000000000000
--- a/scala-package/assembly/src/javadoc.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<assembly>
-    <id>bundle</id>
-    <formats>
-        <format>jar</format>
-    </formats>
-    <includeBaseDirectory>false</includeBaseDirectory>
-
-    <moduleSets>
-        <moduleSet>
-            <!-- Collect javadoc -->
-            <includes>
-                <!--Please put your module defs here-->
-                <include>org.apache.mxnet:mxnet-core_${scala.binary.version}</include>
-            </includes>
-            <useAllReactorProjects>true</useAllReactorProjects>
-            <binaries>
-                <includeDependencies>false</includeDependencies>
-                <attachmentClassifier>javadoc</attachmentClassifier>
-                <outputDirectory>/</outputDirectory>
-                <unpack>true</unpack>
-            </binaries>
-        </moduleSet>
-    </moduleSets>
-</assembly>
\ No newline at end of file
diff --git a/scala-package/assembly/src/main/assembly/assembly.xml b/scala-package/assembly/src/main/assembly/assembly.xml
new file mode 100644
index 000000000000..7525df883dab
--- /dev/null
+++ b/scala-package/assembly/src/main/assembly/assembly.xml
@@ -0,0 +1,57 @@
+<assembly>
+  <id>full</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <dependencySets>
+    <dependencySet>
+      <includes>
+        <include>*:*:jar</include>
+      </includes>
+      <excludes>
+        <exclude>org.scala-lang:*</exclude>
+        <exclude>org.scala-lang.modules:*</exclude>
+      </excludes>
+      <outputDirectory>/</outputDirectory>
+      <useProjectArtifact>true</useProjectArtifact>
+      <unpack>true</unpack>
+      <scope>runtime</scope>
+    </dependencySet>
+    <dependencySet>
+      <outputDirectory>lib/native</outputDirectory>
+      <outputFileNameMapping>libmxnet-scala.${libtype}</outputFileNameMapping>
+      <unpack>false</unpack>
+      <useProjectArtifact>false</useProjectArtifact>
+      <useStrictFiltering>false</useStrictFiltering>
+      <includes>
+        <include>org.apache.mxnet:libmxnet-scala:${libtype}</include>
+      </includes>
+    </dependencySet>
+  </dependencySets>
+  <fileSets>
+    <fileSet>
+      <directory>${MXNET_DIR}/lib</directory>
+      <includes>
+        <include>libmxnet.so</include>
+        <include>libgfortran.so.3</include>
+        <include>libquadmath.so.0</include>
+        <include>libiomp5.so</include>
+        <include>libiomp5.dylib</include>
+        <include>libmklml_intel.so</include>
+        <include>libmklml.dylib</include>
+        <include>libmkldnn.so.0</include>
+        <include>libmkldnn.0.dylib</include>
+      </includes>
+      <outputDirectory>lib/native</outputDirectory>
+    </fileSet>
+    <fileSet>
+      <directory>${MXNET_DIR}/3rdparty</directory>
+      <includes>
+        <include>cub/LICENSE.TXT</include>
+        <include>mkldnn/external/mklml_mac_2019.0.1.20180928/license.txt</include>
+      </includes>
+      <outputDirectory>/</outputDirectory>
+    </fileSet>
+  </fileSets>
+</assembly>
diff --git a/scala-package/assembly/src/main/assembly/javadoc.xml b/scala-package/assembly/src/main/assembly/javadoc.xml
new file mode 100644
index 000000000000..8f30a261811c
--- /dev/null
+++ b/scala-package/assembly/src/main/assembly/javadoc.xml
@@ -0,0 +1,15 @@
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
+  <id>bundle</id>
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <fileSets>
+    <fileSet>
+      <directory>${rootdir}/core/target/site/scaladocs</directory>
+      <outputDirectory>/</outputDirectory>
+    </fileSet>
+  </fileSets>
+</assembly>
diff --git a/scala-package/assembly/src/main/assembly/source.xml b/scala-package/assembly/src/main/assembly/source.xml
new file mode 100644
index 000000000000..87fcde360c48
--- /dev/null
+++ b/scala-package/assembly/src/main/assembly/source.xml
@@ -0,0 +1,19 @@
+<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
+  <id>src</id>
+  <!--This is the source jar generation config-->
+  <formats>
+    <format>jar</format>
+  </formats>
+  <includeBaseDirectory>false</includeBaseDirectory>
+  <fileSets>
+    <fileSet>
+      <directory>${rootdir}/core/src/main/scala</directory>
+      <includes>
+        <include>**\/*.scala</include>
+      </includes>
+      <outputDirectory>/</outputDirectory>
+    </fileSet>
+  </fileSets>
+</assembly>
diff --git a/scala-package/assembly/src/source.xml b/scala-package/assembly/src/source.xml
deleted file mode 100644
index 0cbdcadf8d83..000000000000
--- a/scala-package/assembly/src/source.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.0.0"
-          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-          xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.0.0 http://maven.apache.org/xsd/assembly-2.0.0.xsd">
-  <id>src</id>
-  <!--This is the source jar generation config-->
-  <formats>
-    <format>jar</format>
-  </formats>
-  <includeBaseDirectory>false</includeBaseDirectory>
-  <moduleSets>
-    <moduleSet>
-      <useAllReactorProjects>true</useAllReactorProjects>
-      <includes>
-        <!--Please put your module defs here-->
-        <include>org.apache.mxnet:mxnet-core_${scala.binary.version}</include>
-      </includes>
-      <sources>
-        <includeModuleDirectory>false</includeModuleDirectory>
-        <fileSets>
-          <fileSet>
-            <!--This helps clean the directory from src/main/scala/org/ -> org/-->
-            <directory>src/main/scala</directory>
-            <includes>
-              <include>**\/*.scala</include>
-            </includes>
-            <outputDirectory>/</outputDirectory>
-          </fileSet>
-        </fileSets>
-      </sources>
-    </moduleSet>
-  </moduleSets>
-</assembly>
\ No newline at end of file
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 976383f2e7d5..7264c39e84a0 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -5,55 +5,74 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <properties>
-    <skipTests>true</skipTests>
-    <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <artifactId>mxnet-core_2.11</artifactId>
+  <artifactId>mxnet-core</artifactId>
   <name>MXNet Scala Package - Core</name>
 
-  <profiles>
-    <profile>
-      <id>unittest</id>
-      <properties>
-        <skipTests>false</skipTests>
-      </properties>
-    </profile>
-    <profile>
-      <id>osx-x86_64-cpu</id>
-      <properties>
-        <platform>osx-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <properties>
-        <platform>linux-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <properties>
-        <platform>linux-x86_64-gpu</platform>
-      </properties>
-    </profile>
-  </profiles>
+  <properties>
+    <skipJavaTests>false</skipJavaTests>
+  </properties>
 
   <build>
     <plugins>
       <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>native-maven-plugin</artifactId>
+        <extensions>true</extensions>
+        <executions>
+          <execution>
+            <id>javah</id>
+            <phase>verify</phase>
+            <configuration>
+              <javahProvider>default</javahProvider>
+              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
+              <workingDirectory>${basedir}</workingDirectory>
+              <javahOutputFileName>org_apache_mxnet_native_c_api.h</javahOutputFileName>
+              <javahClassNames>
+                <javahClassName>org.apache.mxnet.LibInfo</javahClassName>
+              </javahClassNames>
+            </configuration>
+            <goals>
+              <goal>javah</goal>
+            </goals>
+          </execution>
+        </executions>
       </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>1.6.0</version>
+        <executions>
+          <execution>
+            <id>verify-javah</id>
+            <phase>verify</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>diff</executable>
+              <commandlineArgs>${project.build.directory}/custom-javah/org_apache_mxnet_native_c_api.h ${project.parent.basedir}/native/src/main/native/org_apache_mxnet_native_c_api.h</commandlineArgs>
+            </configuration>
+          </execution>
+          <execution>
+            <id>apidoc-generation</id>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>java</executable>
+              <workingDirectory>${project.parent.basedir}</workingDirectory>
+              <commandlineArgs>-classpath %classpath:${rootdir}/init/target/classes:${rootdir}/macros/target/classes -Djava.library.path=${rootdir}/native/target org.apache.mxnet.APIDocGenerator ${rootdir}/core/src/main/scala/org/apache/mxnet/</commandlineArgs>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
@@ -70,32 +89,34 @@
         <artifactId>maven-compiler-plugin</artifactId>
       </plugin>
       <plugin>
-        <groupId>org.scalatest</groupId>
-        <artifactId>scalatest-maven-plugin</artifactId>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-clean-plugin</artifactId>
+        <version>3.1.0</version>
         <configuration>
-          <skipTests>${skipTests}</skipTests>
-          <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
-            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
-          </argLine>
-          <environmentVariables>
-            <LD_LIBRARY_PATH>${MXNET_DIR}/lib</LD_LIBRARY_PATH>
-          </environmentVariables>
+          <filesets>
+            <fileset>
+              <directory>src/main/scala/org/apache/mxnet</directory>
+              <includes>
+                <include>NDArrayAPIBase.scala</include>
+                <include>NDArrayBase.scala</include>
+                <include>NDArrayRandomAPIBase.scala</include>
+                <include>javaapi/NDArrayBase.scala</include>
+                <include>SymbolAPIBase.scala</include>
+                <include>SymbolRandomAPIBase.scala</include>
+              </includes>
+              <followSymlinks>false</followSymlinks>
+            </fileset>
+          </filesets>
         </configuration>
       </plugin>
       <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-surefire-plugin</artifactId>
-        <version>2.22.0</version>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
         <configuration>
           <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target
+            -Djava.library.path=${project.parent.basedir}/native/target \
+            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
           </argLine>
-          <skipTests>${skipTests}</skipTests>
-          <forkMode>always</forkMode>
-          <environmentVariables>
-            <LD_LIBRARY_PATH>${MXNET_DIR}/lib</LD_LIBRARY_PATH>
-          </environmentVariables>
         </configuration>
       </plugin>
       <plugin>
@@ -107,14 +128,14 @@
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-macros</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-macros_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-scala-init</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
index b2a53fd9f2dd..001bd04d2c95 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Base.scala
@@ -18,7 +18,9 @@
 package org.apache.mxnet
 
 import org.apache.mxnet.util.NativeLibraryLoader
-import org.slf4j.{LoggerFactory, Logger}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.Specializable.Group
 
 private[mxnet] object Base {
   private val logger: Logger = LoggerFactory.getLogger("MXNetJVM")
@@ -57,6 +59,9 @@ private[mxnet] object Base {
 
   val MX_REAL_TYPE = DType.Float32
 
+  // The primitives currently supported for NDArray operations
+  val MX_PRIMITIVES = new Group ((Double, Float))
+
   try {
     try {
       tryLoadLibraryOS("mxnet-scala")
@@ -67,6 +72,8 @@ private[mxnet] object Base {
           "Consider installing the library somewhere in the path " +
           "(for Windows: PATH, for Linux: LD_LIBRARY_PATH), " +
           "or specifying by Java cmd option -Djava.library.path=[lib path].")
+        logger.warn("LD_LIBRARY_PATH=" + System.getenv("LD_LIBRARY_PATH"))
+        logger.warn("java.library.path=" + System.getProperty("java.library.path"))
         NativeLibraryLoader.loadLibrary("mxnet-scala")
     }
   } catch {
@@ -87,35 +94,8 @@ private[mxnet] object Base {
 
   @throws(classOf[UnsatisfiedLinkError])
   private def tryLoadLibraryOS(libname: String): Unit = {
-    try {
-      logger.info(s"Try loading $libname from native path.")
-      System.loadLibrary(libname)
-    } catch {
-      case e: UnsatisfiedLinkError =>
-        val os = System.getProperty("os.name")
-        // ref: http://lopica.sourceforge.net/os.html
-        if (os.startsWith("Linux")) {
-          tryLoadLibraryXPU(libname, "linux-x86_64")
-        } else if (os.startsWith("Mac")) {
-          tryLoadLibraryXPU(libname, "osx-x86_64")
-        } else {
-          // TODO(yizhi) support windows later
-          throw new UnsatisfiedLinkError()
-        }
-    }
-  }
-
-  @throws(classOf[UnsatisfiedLinkError])
-  private def tryLoadLibraryXPU(libname: String, arch: String): Unit = {
-    try {
-      // try gpu first
-      logger.info(s"Try loading $libname-$arch-gpu from native path.")
-      System.loadLibrary(s"$libname-$arch-gpu")
-    } catch {
-      case e: UnsatisfiedLinkError =>
-        logger.info(s"Try loading $libname-$arch-cpu from native path.")
-        System.loadLibrary(s"$libname-$arch-cpu")
-    }
+    logger.info(s"Try loading $libname from native path.")
+    System.loadLibrary(libname)
   }
 
   // helper function definitions
@@ -153,3 +133,21 @@ private[mxnet] object Base {
 }
 
 class MXNetError(val err: String) extends Exception(err)
+
+// Some type-classes to ease the work in Symbol.random and NDArray.random modules.
+// SymbolOrScalar[T]: implicit witnesses mark Float and Int as scalar, Symbol as non-scalar.
+class SymbolOrScalar[T](val isScalar: Boolean)
+object SymbolOrScalar {
+  def apply[T](implicit ev: SymbolOrScalar[T]): SymbolOrScalar[T] = ev
+  implicit object FloatWitness extends SymbolOrScalar[Float](true)
+  implicit object IntWitness extends SymbolOrScalar[Int](true)
+  implicit object SymbolWitness extends SymbolOrScalar[Symbol](false)
+}
+
+class NDArrayOrScalar[T](val isScalar: Boolean)  // true for Float/Int, false for NDArray
+object NDArrayOrScalar {
+  def apply[T](implicit ev: NDArrayOrScalar[T]): NDArrayOrScalar[T] = ev  // returns the witness
+  implicit object FloatWitness extends NDArrayOrScalar[Float](true)
+  implicit object IntWitness extends NDArrayOrScalar[Int](true)
+  implicit object NDArrayWitness extends NDArrayOrScalar[NDArray](false)
+}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala
index 0a5683aa7ab3..20b6ed9fc806 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala
@@ -93,6 +93,9 @@ private[mxnet] class LibInfo {
   @native def mxNDArraySyncCopyFromCPU(handle: NDArrayHandle,
                                        source: Array[MXFloat],
                                        size: Int): Int
+  @native def mxFloat64NDArraySyncCopyFromCPU(handle: NDArrayHandle,
+                                       source: Array[Double],
+                                       size: Int): Int
   @native def mxNDArrayLoad(fname: String,
                             outSize: MXUintRef,
                             handles: ArrayBuffer[NDArrayHandle],
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala
new file mode 100644
index 000000000000..cb978856963c
--- /dev/null
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet
+
+object MX_PRIMITIVES {
+
+  /**
+    * This defines the basic primitives we can use in Scala for mathematical
+    * computations in NDArrays. This gives us the flexibility to expand to more
+    * supported primitives in the future. Currently Float and Double are supported.
+    * Functions that accept MX_PRIMITIVE_TYPE also accept plain Float, Double and Int
+    * values because of the implicit conversions from primitives defined below.
+    */
+  trait MX_PRIMITIVE_TYPE extends Ordered[MX_PRIMITIVE_TYPE]{
+    def toString: String
+
+    def unary_- : MX_PRIMITIVE_TYPE
+  }
+
+  /** Ordering over MX_PRIMITIVE_TYPE that delegates to the left operand's compare. */
+  trait MXPrimitiveOrdering extends Ordering[MX_PRIMITIVE_TYPE] {
+    def compare(x: MX_PRIMITIVE_TYPE, y: MX_PRIMITIVE_TYPE): Int = x.compare(y)
+  }
+
+  implicit object MX_PRIMITIVE_TYPE extends MXPrimitiveOrdering
+
+  /**
+    * Wrapper over Float in Scala. Compared with an MX_Double after widening to Double.
+    * @param data the wrapped Float value
+    */
+  class MX_FLOAT(val data: Float) extends MX_PRIMITIVE_TYPE {
+
+    override def toString: String = data.toString
+
+    override def unary_- : MX_PRIMITIVE_TYPE = new MX_FLOAT(data.unary_-)
+
+    override def compare(that: MX_PRIMITIVE_TYPE): Int = that match {
+      case f: MX_FLOAT => this.data.compareTo(f.data)
+      case d: MX_Double => java.lang.Double.compare(this.data.toDouble, d.data)
+      case o => throw new ClassCastException(s"Cannot compare MX_FLOAT with ${o.getClass}")
+    }
+  }
+
+  implicit def FloatToMX_Float(d : Float): MX_FLOAT = new MX_FLOAT(d)
+
+  implicit def MX_FloatToFloat(d: MX_FLOAT) : Float = d.data
+
+  implicit def IntToMX_Float(d: Int): MX_FLOAT = new MX_FLOAT(d.toFloat)
+
+  /**
+    * Wrapper over Double in Scala. An MX_FLOAT operand is widened before comparing.
+    * @param data the wrapped Double value
+    */
+  class MX_Double(val data: Double) extends MX_PRIMITIVE_TYPE {
+
+    override def toString: String = data.toString
+
+    override def unary_- : MX_PRIMITIVE_TYPE = new MX_Double(data.unary_-)
+
+    override def compare(that: MX_PRIMITIVE_TYPE): Int = that match {
+      case d: MX_Double => this.data.compareTo(d.data)
+      case f: MX_FLOAT => java.lang.Double.compare(this.data, f.data.toDouble)
+      case o => throw new ClassCastException(s"Cannot compare MX_Double with ${o.getClass}")
+    }
+  }
+
+  implicit def DoubleToMX_Double(d : Double): MX_Double = new MX_Double(d)
+
+  implicit def MX_DoubleToDouble(d: MX_Double) : Double = d.data
+}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
index 3a0c3c11f16a..163ed2682532 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala
@@ -21,6 +21,7 @@ import java.nio.{ByteBuffer, ByteOrder}
 
 import org.apache.mxnet.Base._
 import org.apache.mxnet.DType.DType
+import org.apache.mxnet.MX_PRIMITIVES.{MX_PRIMITIVE_TYPE}
 import org.slf4j.LoggerFactory
 
 import scala.collection.mutable
@@ -40,6 +41,7 @@ object NDArray extends NDArrayBase {
   private val functions: Map[String, NDArrayFunction] = initNDArrayModule()
 
   val api = NDArrayAPI
+  val random = NDArrayRandomAPI
 
   private def addDependency(froms: Array[NDArray], tos: Array[NDArray]): Unit = {
     froms.foreach { from =>
@@ -261,16 +263,46 @@ object NDArray extends NDArrayBase {
     arr
   }
 
-  // Perform power operator
+  /** Create a new Float64 NDArray of the given shape in ctx and fill it with value. */
+  def full(shape: Shape, value: Double, ctx: Context): NDArray = {
+    val arr = empty(shape, ctx, DType.Float64)
+    arr.set(value)
+    arr
+  }
+
+  /**
+    * Create a new Float64 NDArray filled with given value, with specified shape.
+    * @param shape shape of the NDArray.
+    * @param value value to be filled with
+    */
+  def full(shape: Shape, value: Double): NDArray = {
+    full(shape, value, null)
+  }
+
+  /**
+    * Perform power operation on NDArray. Returns result as NDArray
+    * @param lhs NDArray holding the base values
+    * @param rhs NDArray holding the exponents
+    */
   def power(lhs: NDArray, rhs: NDArray): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_power", Seq(lhs, rhs))
   }
 
-  def power(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Perform scalar power operation on NDArray. Returns result as NDArray
+    * @param lhs NDArray on which to perform the operation.
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def power(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_power_scalar", Seq(lhs, rhs))
   }
 
-  def power(lhs: Float, rhs: NDArray): NDArray = {
+  /**
+    * Perform scalar power operation on NDArray. Returns result as NDArray
+    * @param lhs The scalar input. Can be of type Float/Double
+    * @param rhs NDArray on which to perform the operation.
+    */
+  def power(lhs: MX_PRIMITIVE_TYPE, rhs: NDArray): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_rpower_scalar", Seq(lhs, rhs))
   }
 
@@ -279,11 +311,21 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("_maximum", Seq(lhs, rhs))
   }
 
-  def maximum(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Perform the max operation on NDArray. Returns the result as NDArray.
+    * @param lhs NDArray on which to perform the operation.
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def maximum(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_maximum_scalar", Seq(lhs, rhs))
   }
 
-  def maximum(lhs: Float, rhs: NDArray): NDArray = {
+  /**
+    * Perform the max operation on NDArray. Returns the result as NDArray.
+    * @param lhs The scalar input. Can be of type Float/Double
+    * @param rhs NDArray on which to perform the operation.
+    */
+  def maximum(lhs: MX_PRIMITIVE_TYPE, rhs: NDArray): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_maximum_scalar", Seq(lhs, rhs))
   }
 
@@ -292,11 +334,21 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("_minimum", Seq(lhs, rhs))
   }
 
-  def minimum(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Perform the min operation on NDArray. Returns the result as NDArray.
+    * @param lhs NDArray on which to perform the operation.
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def minimum(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_minimum_scalar", Seq(lhs, rhs))
   }
 
-  def minimum(lhs: Float, rhs: NDArray): NDArray = {
+  /**
+    * Perform the min operation on NDArray. Returns the result as NDArray.
+    * @param lhs The scalar input. Can be of type Float/Double
+    * @param rhs NDArray on which to perform the operation.
+    */
+  def minimum(lhs: MX_PRIMITIVE_TYPE, rhs: NDArray): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_minimum_scalar", Seq(lhs, rhs))
   }
 
@@ -309,7 +361,15 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_equal", Seq(lhs, rhs))
   }
 
-  def equal(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **equal to** (==) comparison operation with broadcasting.
+    * For each element in input arrays, return 1(true) if corresponding elements are same,
+    * otherwise return 0(false).
+    *
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def equal(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_equal_scalar", Seq(lhs, rhs))
   }
 
@@ -323,7 +383,15 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_not_equal", Seq(lhs, rhs))
   }
 
-  def notEqual(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **not equal to** (!=) comparison operation
+    * with broadcasting.
+    * For each element in input arrays, return 1(true) if corresponding elements are different,
+    * otherwise return 0(false).
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def notEqual(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_not_equal_scalar", Seq(lhs, rhs))
   }
 
@@ -337,7 +405,16 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_greater", Seq(lhs, rhs))
   }
 
-  def greater(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **greater than** (>) comparison operation
+    * with broadcasting.
+    * For each element in input arrays, return 1(true) if lhs elements are greater than rhs,
+    * otherwise return 0(false).
+    *
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def greater(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_greater_scalar", Seq(lhs, rhs))
   }
 
@@ -351,7 +428,16 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_greater_equal", Seq(lhs, rhs))
   }
 
-  def greaterEqual(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **greater than or equal to** (>=) comparison
+    * operation with broadcasting.
+    * For each element in input arrays, return 1(true) if lhs elements are greater than
+    * or equal to rhs, otherwise return 0(false).
+    *
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def greaterEqual(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_greater_equal_scalar", Seq(lhs, rhs))
   }
 
@@ -365,7 +451,15 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_lesser", Seq(lhs, rhs))
   }
 
-  def lesser(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **lesser than** (<) comparison operation
+    * with broadcasting.
+    * For each element in input arrays, return 1(true) if lhs elements are less than rhs,
+    * otherwise return 0(false).
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def lesser(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_lesser_scalar", Seq(lhs, rhs))
   }
 
@@ -379,7 +473,16 @@ object NDArray extends NDArrayBase {
     NDArray.genericNDArrayFunctionInvoke("broadcast_lesser_equal", Seq(lhs, rhs))
   }
 
-  def lesserEqual(lhs: NDArray, rhs: Float): NDArray = {
+  /**
+    * Returns the result of element-wise **lesser than or equal to** (<=) comparison
+    * operation with broadcasting.
+    * For each element in input arrays, return 1(true) if lhs elements are
+    * less than or equal to rhs, otherwise return 0(false).
+    *
+    * @param lhs NDArray
+    * @param rhs The scalar input. Can be of type Float/Double
+    */
+  def lesserEqual(lhs: NDArray, rhs: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_lesser_equal_scalar", Seq(lhs, rhs))
   }
 
@@ -396,6 +499,16 @@ object NDArray extends NDArrayBase {
     arr
   }
 
+  /** Create a Float64 NDArray of the given shape in ctx, filled from sourceArr via set. */
+  def array(sourceArr: Array[Double], shape: Shape, ctx: Context): NDArray = {
+    empty(shape, ctx, dtype = DType.Float64).set(sourceArr)
+  }
+
+  /** Same as the three-argument overload, called with a null context. */
+  def array(sourceArr: Array[Double], shape: Shape): NDArray = {
+    array(sourceArr, shape, null)
+  }
+
   /**
    * Returns evenly spaced values within a given interval.
    * Values are generated within the half-open interval [`start`, `stop`). In other
@@ -644,6 +757,12 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     checkCall(_LIB.mxNDArraySyncCopyFromCPU(handle, source, source.length))
   }
 
+  private def syncCopyfrom(source: Array[Double]): Unit = {  // copy Doubles via the native call
+    require(source.length == size,  // source must supply exactly `size` values
+      s"array size (${source.length}) do not match the size of NDArray ($size)")
+    checkCall(_LIB.mxFloat64NDArraySyncCopyFromCPU(handle, source, source.length))
+  }
+
   /**
    * Return a sliced NDArray that shares memory with current one.
    * NDArray only support continuous slicing on axis 0
@@ -758,7 +877,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
    * @param value Value to set
    * @return Current NDArray
    */
-  def set(value: Float): NDArray = {
+  def set(value: MX_PRIMITIVE_TYPE): NDArray = {
     require(writable, "trying to assign to a readonly NDArray")
     NDArray.genericNDArrayFunctionInvoke("_set_value", Seq(value), Map("out" -> this))
     this
@@ -775,11 +894,17 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
+  def set(other: Array[Double]): NDArray = {  // fill this NDArray from an array of Doubles
+    require(writable, "trying to assign to a readonly NDArray")
+    syncCopyfrom(other)  // checks that other.length equals size before copying
+    this  // returns the current NDArray
+  }
+
   def +(other: NDArray): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_plus", Seq(this, other))
   }
 
-  def +(other: Float): NDArray = {
+  def +(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_plus_scalar", Seq(this, other))
   }
 
@@ -791,7 +916,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
-  def +=(other: Float): NDArray = {
+  def +=(other: MX_PRIMITIVE_TYPE): NDArray = {
     if (!writable) {
       throw new IllegalArgumentException("trying to add to a readonly NDArray")
     }
@@ -803,7 +928,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.genericNDArrayFunctionInvoke("_minus", Seq(this, other))
   }
 
-  def -(other: Float): NDArray = {
+  def -(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_minus_scalar", Seq(this, other))
   }
 
@@ -815,7 +940,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
-  def -=(other: Float): NDArray = {
+  def -=(other: MX_PRIMITIVE_TYPE): NDArray = {
     if (!writable) {
       throw new IllegalArgumentException("trying to subtract from a readonly NDArray")
     }
@@ -827,7 +952,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.genericNDArrayFunctionInvoke("_mul", Seq(this, other))
   }
 
-  def *(other: Float): NDArray = {
+  def *(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_mul_scalar", Seq(this, other))
   }
 
@@ -843,7 +968,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
-  def *=(other: Float): NDArray = {
+  def *=(other: MX_PRIMITIVE_TYPE): NDArray = {
     if (!writable) {
       throw new IllegalArgumentException("trying to multiply to a readonly NDArray")
     }
@@ -855,7 +980,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.genericNDArrayFunctionInvoke("_div", Seq(this, other))
   }
 
-  def /(other: Float): NDArray = {
+  def /(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_div_scalar", Seq(this, other))
   }
 
@@ -867,7 +992,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
-  def /=(other: Float): NDArray = {
+  def /=(other: MX_PRIMITIVE_TYPE): NDArray = {
     if (!writable) {
       throw new IllegalArgumentException("trying to divide from a readonly NDArray")
     }
@@ -879,7 +1004,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.power(this, other)
   }
 
-  def **(other: Float): NDArray = {
+  def **(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.power(this, other)
   }
 
@@ -887,7 +1012,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.genericNDArrayFunctionInvoke("_power", Seq(this, other), Map("out" -> this))
   }
 
-  def **=(other: Float): NDArray = {
+  def **=(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_power_scalar", Seq(this, other), Map("out" -> this))
   }
 
@@ -895,7 +1020,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.greater(this, other)
   }
 
-  def >(other: Float): NDArray = {
+  def >(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.greater(this, other)
   }
 
@@ -903,7 +1028,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.greaterEqual(this, other)
   }
 
-  def >=(other: Float): NDArray = {
+  def >=(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.greaterEqual(this, other)
   }
 
@@ -911,7 +1036,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.lesser(this, other)
   }
 
-  def <(other: Float): NDArray = {
+  def <(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.lesser(this, other)
   }
 
@@ -919,7 +1044,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.lesserEqual(this, other)
   }
 
-  def <=(other: Float): NDArray = {
+  def <=(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.lesserEqual(this, other)
   }
 
@@ -927,7 +1052,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     NDArray.genericNDArrayFunctionInvoke("_mod", Seq(this, other))
   }
 
-  def %(other: Float): NDArray = {
+  def %(other: MX_PRIMITIVE_TYPE): NDArray = {
     NDArray.genericNDArrayFunctionInvoke("_mod_scalar", Seq(this, other))
   }
 
@@ -939,7 +1064,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this
   }
 
-  def %=(other: Float): NDArray = {
+  def %=(other: MX_PRIMITIVE_TYPE): NDArray = {
     if (!writable) {
       throw new IllegalArgumentException("trying to take modulo from a readonly NDArray")
     }
@@ -955,6 +1080,14 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     internal.toFloatArray
   }
 
+  /**
+    * Return a copied flat java array of current array (row-major) with datatype as Float64/Double.
+    * @return  A copy of array content.
+    */
+  def toFloat64Array: Array[Double] = {
+    internal.toDoubleArray
+  }
+
   def internal: NDArrayInternal = {
     val myType = dtype
     val arrLength = DType.numOfBytes(myType) * size
@@ -974,6 +1107,11 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
     this.toArray(0)
   }
 
+  def toFloat64Scalar: Double = {  // the single element of this array, read as a Double
+    require(shape == Shape(1), "The current array is not a scalar")  // only Shape(1) accepted
+    this.toFloat64Array(0)
+  }
+
   /**
    * Copy the content of current array to other.
    *
@@ -996,7 +1134,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
    * @return The copy target NDArray
    */
   def copyTo(ctx: Context): NDArray = {
-    val ret = new NDArray(NDArray.newAllocHandle(shape, ctx, delayAlloc = true))
+    val ret = new NDArray(NDArray.newAllocHandle(shape, ctx, delayAlloc = true, dtype = dtype))
     copyTo(ret)
   }
 
@@ -1046,11 +1184,11 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
 
 private[mxnet] object NDArrayConversions {
   implicit def int2Scalar(x: Int): NDArrayConversions = new NDArrayConversions(x.toFloat)
-  implicit def double2Scalar(x: Double): NDArrayConversions = new NDArrayConversions(x.toFloat)
+  implicit def double2Scalar(x: Double): NDArrayConversions = new NDArrayConversions(x)
   implicit def float2Scalar(x: Float): NDArrayConversions = new NDArrayConversions(x)
 }
 
-private[mxnet] class NDArrayConversions(val value: Float) {
+private[mxnet] class NDArrayConversions(val value: MX_PRIMITIVE_TYPE) {
   def +(other: NDArray): NDArray = {
     other + value
   }
@@ -1144,34 +1282,39 @@ private[mxnet] class NDArrayFuncReturn(private[mxnet] val arr: Array[NDArray]) {
   def waitToRead(): Unit = head.waitToRead()
   def context: Context = head.context
   def set(value: Float): NDArray = head.set(value)
+  def set(value: Double): NDArray = head.set(value)
   def set(other: NDArray): NDArray = head.set(other)
   def set(other: Array[Float]): NDArray = head.set(other)
+  def set(other: Array[Double]): NDArray = head.set(other)
   def +(other: NDArray): NDArray = head + other
-  def +(other: Float): NDArray = head + other
+  def +(other: MX_PRIMITIVE_TYPE): NDArray = head + other
   def +=(other: NDArray): NDArray = head += other
-  def +=(other: Float): NDArray = head += other
+  def +=(other: MX_PRIMITIVE_TYPE): NDArray = head += other
   def -(other: NDArray): NDArray = head - other
-  def -(other: Float): NDArray = head - other
+  def -(other: MX_PRIMITIVE_TYPE): NDArray = head - other
   def -=(other: NDArray): NDArray = head -= other
-  def -=(other: Float): NDArray = head -= other
+  def -=(other: MX_PRIMITIVE_TYPE): NDArray = head -= other
   def *(other: NDArray): NDArray = head * other
-  def *(other: Float): NDArray = head * other
+  def *(other: MX_PRIMITIVE_TYPE): NDArray = head * other
   def unary_-(): NDArray = -head
   def *=(other: NDArray): NDArray = head *= other
-  def *=(other: Float): NDArray = head *= other
+  def *=(other: MX_PRIMITIVE_TYPE): NDArray = head *= other
   def /(other: NDArray): NDArray = head / other
+  def /(other: MX_PRIMITIVE_TYPE): NDArray = head / other
   def **(other: NDArray): NDArray = head ** other
-  def **(other: Float): NDArray = head ** other
+  def **(other: MX_PRIMITIVE_TYPE): NDArray = head ** other
   def >(other: NDArray): NDArray = head > other
-  def >(other: Float): NDArray = head > other
+  def >(other: MX_PRIMITIVE_TYPE): NDArray = head > other
   def >=(other: NDArray): NDArray = head >= other
-  def >=(other: Float): NDArray = head >= other
+  def >=(other: MX_PRIMITIVE_TYPE): NDArray = head >= other
   def <(other: NDArray): NDArray = head < other
-  def <(other: Float): NDArray = head < other
+  def <(other: MX_PRIMITIVE_TYPE): NDArray = head < other
   def <=(other: NDArray): NDArray = head <= other
-  def <=(other: Float): NDArray = head <= other
+  def <=(other: MX_PRIMITIVE_TYPE): NDArray = head <= other
   def toArray: Array[Float] = head.toArray
+  def toFloat64Array: Array[Double] = head.toFloat64Array
   def toScalar: Float = head.toScalar
+  def toFloat64Scalar: Double = head.toFloat64Scalar
   def copyTo(other: NDArray): NDArray = head.copyTo(other)
   def copyTo(ctx: Context): NDArray = head.copyTo(ctx)
   def copy(): NDArray = head.copy()
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
index 1d8551c1b1e5..024fed1c4ba6 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArrayAPI.scala
@@ -15,11 +15,22 @@
  * limitations under the License.
  */
 package org.apache.mxnet
-@AddNDArrayAPIs(false)
+
 /**
   * typesafe NDArray API: NDArray.api._
   * Main code will be generated during compile time through Macros
   */
+@AddNDArrayAPIs(false)
 object NDArrayAPI extends NDArrayAPIBase {
   // TODO: Implement CustomOp for NDArray
 }
+
+/**
+  * typesafe NDArray random module: NDArray.random._
+  * Main code will be generated during compile time through Macros
+  */
+@AddNDArrayRandomAPIs(false)
+object NDArrayRandomAPI extends NDArrayRandomAPIBase {
+  // no hand-written members: per the note above, the body is generated by the macro annotation
+}
+
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
index 01349a689b6c..29885fc723cd 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
@@ -842,6 +842,7 @@ object Symbol extends SymbolBase {
   private val bindReqMap = Map("null" -> 0, "write" -> 1, "add" -> 3)
 
   val api = SymbolAPI
+  val random = SymbolRandomAPI
 
   def pow(sym1: Symbol, sym2: Symbol): Symbol = {
     Symbol.createFromListedSymbols("_Power")(Array(sym1, sym2))
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
index 1bfb0559cf96..f166de11ea52 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/SymbolAPI.scala
@@ -19,11 +19,11 @@ package org.apache.mxnet
 import scala.collection.mutable
 
 
-@AddSymbolAPIs(false)
 /**
   * typesafe Symbol API: Symbol.api._
   * Main code will be generated during compile time through Macros
   */
+@AddSymbolAPIs(false)
 object SymbolAPI extends SymbolAPIBase {
   def Custom (op_type : String, kwargs : mutable.Map[String, Any],
              name : String = null, attr : Map[String, String] = null) : Symbol = {
@@ -32,3 +32,13 @@ object SymbolAPI extends SymbolAPIBase {
     Symbol.createSymbolGeneral("Custom", name, attr, Seq(), map.toMap)
   }
 }
+
+/**
+  * typesafe Symbol random module: Symbol.random._
+  * Main code will be generated during compile time through Macros
+  */
+@AddSymbolRandomAPIs(false)
+object SymbolRandomAPI extends SymbolRandomAPIBase {
+  // no hand-written members: per the note above, the body is generated by the macro annotation
+}
+
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala
index a84bd106b763..e30098c3088b 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala
@@ -53,9 +53,9 @@ private[mxnet] class MXDataIter(private[mxnet] val handle: DataIterHandle,
       val label = currentBatch.label(0)
       // properties
       val res = (
-        // TODO: need to allow user to specify DType and Layout
-        IndexedSeq(new DataDesc(dataName, data.shape, DType.Float32, Layout.UNDEFINED)),
-        IndexedSeq(new DataDesc(labelName, label.shape, DType.Float32, Layout.UNDEFINED)),
+        // TODO: need to allow user to specify Layout
+        IndexedSeq(new DataDesc(dataName, data.shape, data.dtype, Layout.UNDEFINED)),
+        IndexedSeq(new DataDesc(labelName, label.shape, label.dtype, Layout.UNDEFINED)),
         ListMap(dataName -> data.shape),
         ListMap(labelName -> label.shape),
         data.shape(0))
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
index 0032a54dd802..e690abba0d13 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala
@@ -61,7 +61,8 @@ class NDArrayIter(data: IndexedSeq[(DataDesc, NDArray)],
            dataBatchSize: Int = 1, shuffle: Boolean = false,
            lastBatchHandle: String = "pad",
            dataName: String = "data", labelName: String = "label") {
-    this(IO.initDataDesc(data, allowEmpty = false, dataName, MX_REAL_TYPE, Layout.UNDEFINED),
+    this(IO.initDataDesc(data, allowEmpty = false, dataName,
+      if (data == null || data.isEmpty)  MX_REAL_TYPE else data(0).dtype, Layout.UNDEFINED),
       IO.initDataDesc(label, allowEmpty = true, labelName, MX_REAL_TYPE, Layout.UNDEFINED),
       dataBatchSize, shuffle, lastBatchHandle)
   }
@@ -272,7 +273,7 @@ object NDArrayIter {
      */
     def addData(name: String, data: NDArray): Builder = {
       this.data = this.data ++ IndexedSeq((new DataDesc(name,
-        data.shape, DType.Float32, Layout.UNDEFINED), data))
+        data.shape, data.dtype, Layout.UNDEFINED), data))
       this
     }
 
@@ -284,7 +285,7 @@ object NDArrayIter {
      */
     def addLabel(name: String, label: NDArray): Builder = {
       this.label = this.label ++ IndexedSeq((new DataDesc(name,
-        label.shape, DType.Float32, Layout.UNDEFINED), label))
+        label.shape, label.dtype, Layout.UNDEFINED), label))
       this
     }
 
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/NDArray.scala
index 198102d2377f..67809c158aff 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/NDArray.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/NDArray.scala
@@ -91,17 +91,26 @@ object NDArray extends NDArrayBase {
   def full(shape: Shape, value: Float, ctx: Context): NDArray
   = org.apache.mxnet.NDArray.full(shape, value, ctx)
 
+  def full(shape: Shape, value: Double, ctx: Context): NDArray
+  = org.apache.mxnet.NDArray.full(shape, value, ctx)
+
   def power(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.power(lhs, rhs)
   def power(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.power(lhs, rhs)
   def power(lhs: Float, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.power(lhs, rhs)
+  def power(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.power(lhs, rhs)
+  def power(lhs: Double, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.power(lhs, rhs)
 
   def maximum(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.maximum(lhs, rhs)
   def maximum(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.maximum(lhs, rhs)
   def maximum(lhs: Float, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.maximum(lhs, rhs)
+  def maximum(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.maximum(lhs, rhs)
+  def maximum(lhs: Double, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.maximum(lhs, rhs)
 
   def minimum(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.minimum(lhs, rhs)
   def minimum(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.minimum(lhs, rhs)
   def minimum(lhs: Float, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.minimum(lhs, rhs)
+  def minimum(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.minimum(lhs, rhs)
+  def minimum(lhs: Double, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.minimum(lhs, rhs)
 
 
   /**
@@ -111,6 +120,7 @@ object NDArray extends NDArrayBase {
     */
   def equal(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.equal(lhs, rhs)
   def equal(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.equal(lhs, rhs)
+  def equal(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.equal(lhs, rhs)
 
   /**
     * Returns the result of element-wise **not equal to** (!=) comparison operation
@@ -120,6 +130,7 @@ object NDArray extends NDArrayBase {
     */
   def notEqual(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.notEqual(lhs, rhs)
   def notEqual(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.notEqual(lhs, rhs)
+  def notEqual(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.notEqual(lhs, rhs)
 
   /**
     * Returns the result of element-wise **greater than** (>) comparison operation
@@ -129,6 +140,7 @@ object NDArray extends NDArrayBase {
     */
   def greater(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.greater(lhs, rhs)
   def greater(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.greater(lhs, rhs)
+  def greater(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.greater(lhs, rhs)
 
   /**
     * Returns the result of element-wise **greater than or equal to** (>=) comparison
@@ -140,6 +152,8 @@ object NDArray extends NDArrayBase {
   = org.apache.mxnet.NDArray.greaterEqual(lhs, rhs)
   def greaterEqual(lhs: NDArray, rhs: Float): NDArray
   = org.apache.mxnet.NDArray.greaterEqual(lhs, rhs)
+  def greaterEqual(lhs: NDArray, rhs: Double): NDArray
+  = org.apache.mxnet.NDArray.greaterEqual(lhs, rhs)
 
   /**
     * Returns the result of element-wise **lesser than** (<) comparison operation
@@ -149,6 +163,7 @@ object NDArray extends NDArrayBase {
     */
   def lesser(lhs: NDArray, rhs: NDArray): NDArray = org.apache.mxnet.NDArray.lesser(lhs, rhs)
   def lesser(lhs: NDArray, rhs: Float): NDArray = org.apache.mxnet.NDArray.lesser(lhs, rhs)
+  def lesser(lhs: NDArray, rhs: Double): NDArray = org.apache.mxnet.NDArray.lesser(lhs, rhs)
 
   /**
     * Returns the result of element-wise **lesser than or equal to** (<=) comparison
@@ -160,6 +175,8 @@ object NDArray extends NDArrayBase {
   = org.apache.mxnet.NDArray.lesserEqual(lhs, rhs)
   def lesserEqual(lhs: NDArray, rhs: Float): NDArray
   = org.apache.mxnet.NDArray.lesserEqual(lhs, rhs)
+  def lesserEqual(lhs: NDArray, rhs: Double): NDArray
+  = org.apache.mxnet.NDArray.lesserEqual(lhs, rhs)
 
   /**
     * Create a new NDArray that copies content from source_array.
@@ -172,6 +189,18 @@ object NDArray extends NDArrayBase {
   = org.apache.mxnet.NDArray.array(
     sourceArr.asScala.map(ele => Float.unbox(ele)).toArray, shape, ctx)
 
+  /**
+    * Create a new Float64 NDArray that copies content from source_array.
+    * @param sourceArr Source data (list of Doubles) to create NDArray from.
+    * @param shape shape of the NDArray
+    * @param ctx The context of the NDArray; forwarded to the underlying array factory.
+    * @return The created NDArray.
+    */
+  def arrayWithDouble(sourceArr: java.util.List[java.lang.Double], shape: Shape,
+                      ctx: Context = null): NDArray
+  = org.apache.mxnet.NDArray.array(
+    sourceArr.asScala.map(ele => Double.unbox(ele)).toArray, shape, ctx)
+
   /**
     * Returns evenly spaced values within a given interval.
     * Values are generated within the half-open interval [`start`, `stop`). In other
@@ -205,6 +234,10 @@ class NDArray private[mxnet] (val nd: org.apache.mxnet.NDArray ) {
     this(org.apache.mxnet.NDArray.array(arr, shape, ctx))
   }
 
+  def this(arr: Array[Double], shape: Shape, ctx: Context) = {
+    this(org.apache.mxnet.NDArray.array(arr, shape, ctx))
+  }
+
   def this(arr: java.util.List[java.lang.Float], shape: Shape, ctx: Context) = {
     this(NDArray.array(arr, shape, ctx))
   }
@@ -304,41 +337,59 @@ class NDArray private[mxnet] (val nd: org.apache.mxnet.NDArray ) {
     * @return Current NDArray
     */
   def set(value: Float): NDArray = nd.set(value)
+  def set(value: Double): NDArray = nd.set(value)
   def set(other: NDArray): NDArray = nd.set(other)
   def set(other: Array[Float]): NDArray = nd.set(other)
+  def set(other: Array[Double]): NDArray = nd.set(other)
 
   def add(other: NDArray): NDArray = this.nd + other.nd
   def add(other: Float): NDArray = this.nd + other
+  def add(other: Double): NDArray = this.nd + other
   def addInplace(other: NDArray): NDArray = this.nd += other
   def addInplace(other: Float): NDArray = this.nd += other
+  def addInplace(other: Double): NDArray = this.nd += other
   def subtract(other: NDArray): NDArray = this.nd - other
   def subtract(other: Float): NDArray = this.nd - other
+  def subtract(other: Double): NDArray = this.nd - other
   def subtractInplace(other: NDArray): NDArray = this.nd -= other
   def subtractInplace(other: Float): NDArray = this.nd -= other
+  def subtractInplace(other: Double): NDArray = this.nd -= other
   def multiply(other: NDArray): NDArray = this.nd * other
   def multiply(other: Float): NDArray = this.nd * other
+  def multiply(other: Double): NDArray = this.nd * other
   def multiplyInplace(other: NDArray): NDArray = this.nd *= other
   def multiplyInplace(other: Float): NDArray = this.nd *= other
+  def multiplyInplace(other: Double): NDArray = this.nd *= other
   def div(other: NDArray): NDArray = this.nd / other
   def div(other: Float): NDArray = this.nd / other
+  def div(other: Double): NDArray = this.nd / other
   def divInplace(other: NDArray): NDArray = this.nd /= other
   def divInplace(other: Float): NDArray = this.nd /= other
+  def divInplace(other: Double): NDArray = this.nd /= other
   def pow(other: NDArray): NDArray = this.nd ** other
   def pow(other: Float): NDArray = this.nd ** other
+  def pow(other: Double): NDArray = this.nd ** other
   def powInplace(other: NDArray): NDArray = this.nd **= other
   def powInplace(other: Float): NDArray = this.nd **= other
+  def powInplace(other: Double): NDArray = this.nd **= other
   def mod(other: NDArray): NDArray = this.nd % other
   def mod(other: Float): NDArray = this.nd % other
+  def mod(other: Double): NDArray = this.nd % other
   def modInplace(other: NDArray): NDArray = this.nd %= other
   def modInplace(other: Float): NDArray = this.nd %= other
+  def modInplace(other: Double): NDArray = this.nd %= other
   def greater(other: NDArray): NDArray = this.nd > other
   def greater(other: Float): NDArray = this.nd > other
+  def greater(other: Double): NDArray = this.nd > other
   def greaterEqual(other: NDArray): NDArray = this.nd >= other
   def greaterEqual(other: Float): NDArray = this.nd >= other
+  def greaterEqual(other: Double): NDArray = this.nd >= other
   def lesser(other: NDArray): NDArray = this.nd < other
   def lesser(other: Float): NDArray = this.nd < other
+  def lesser(other: Double): NDArray = this.nd < other
   def lesserEqual(other: NDArray): NDArray = this.nd <= other
   def lesserEqual(other: Float): NDArray = this.nd <= other
+  def lesserEqual(other: Double): NDArray = this.nd <= other
 
   /**
     * Return a copied flat java array of current array (row-major).
@@ -346,6 +397,12 @@ class NDArray private[mxnet] (val nd: org.apache.mxnet.NDArray ) {
     */
   def toArray: Array[Float] = nd.toArray
 
+  /**
+    * Return a copied flat java array of doubles of current array (row-major).
+    * @return  A copy of array content.
+    */
+  def toFloat64Array: Array[Double] = nd.toFloat64Array
+
   /**
     * Return a CPU scalar(float) of current ndarray.
     * This ndarray must have shape (1,)
@@ -354,6 +411,14 @@ class NDArray private[mxnet] (val nd: org.apache.mxnet.NDArray ) {
     */
   def toScalar: Float = nd.toScalar
 
+  /**
+    * Return a CPU scalar(double) of current ndarray.
+    * This ndarray must have shape (1,)
+    *
+    * @return The scalar representation of the ndarray.
+    */
+  def toFloat64Scalar: Double = nd.toFloat64Scalar
+
   /**
     * Copy the content of current array to other.
     *
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala b/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
index 2ce893b478ed..1e6d9c4f9fce 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/util/NativeLibraryLoader.scala
@@ -85,8 +85,16 @@ private[mxnet] object NativeLibraryLoader {
       }
     logger.debug(s"Attempting to load $loadLibname")
     val libFileInJar = libPathInJar + loadLibname
-    saveLibraryToTemp("libmxnet.so", "/lib/native/libmxnet.so")
-    val tempfile: File = saveLibraryToTemp(libname, libFileInJar)
+    saveLibraryToTemp("libmxnet.so", "/lib/native/libmxnet.so", true)
+    saveLibraryToTemp("libgfortran.so.3", "/lib/native/libgfortran.so.3", false)
+    saveLibraryToTemp("libquadmath.so.0", "/lib/native/libquadmath.so.0", false)
+    saveLibraryToTemp("libiomp5.so", "/lib/native/libiomp5.so", false)
+    saveLibraryToTemp("libiomp5.dylib", "/lib/native/libiomp5.dylib", false)
+    saveLibraryToTemp("libmklml_intel.so", "/lib/native/libmklml_intel.so", false)
+    saveLibraryToTemp("libmklml.dylib", "/lib/native/libmklml.dylib", false)
+    saveLibraryToTemp("libmkldnn.so.0", "/lib/native/libmkldnn.so.0", false)
+    saveLibraryToTemp("libmkldnn.0.dylib", "/lib/native/libmkldnn.0.dylib", false)
+    val tempfile: File = saveLibraryToTemp(libname, libFileInJar, true)
 
     loadLibraryFromFile(libname, tempfile)
   }
@@ -134,29 +142,34 @@ private[mxnet] object NativeLibraryLoader {
     *
     * @param libname name of the library (just used in constructing the library name)
     * @param resource String resource path in the jar file
+    * @param required true if library is required
     */
-  private def saveLibraryToTemp(libname: String, resource: String): File = {
+  private def saveLibraryToTemp(libname: String, resource: String, required: Boolean): File = {
     try {
       val is: InputStream = getClass.getResourceAsStream(resource)
       if (is == null) {
-        throw new UnsatisfiedLinkError(s"Couldn't find the resource $resource")
-      }
-
-      val tempfile: File = new File(_tempDir, libname)
-      val os: OutputStream = new FileOutputStream(tempfile)
-      logger.debug("tempfile.getPath() = {}", tempfile.getPath)
-      val savedTime: Long = System.currentTimeMillis
-      val buf: Array[Byte] = new Array[Byte](8192)
-      var len: Int = is.read(buf)
-      while (len > 0) {
-        os.write(buf, 0, len)
-        len = is.read(buf)
+        if (required) {
+          throw new UnsatisfiedLinkError(s"Couldn't find the resource $resource")
+        } else {
+          null
+        }
+      } else {
+        val tempfile: File = new File(_tempDir, libname)
+        val os: OutputStream = new FileOutputStream(tempfile)
+        logger.debug("tempfile.getPath() = {}", tempfile.getPath)
+        val savedTime: Long = System.currentTimeMillis
+        val buf: Array[Byte] = new Array[Byte](8192)
+        var len: Int = is.read(buf)
+        while (len > 0) {
+          os.write(buf, 0, len)
+          len = is.read(buf)
+        }
+        os.close()
+        is.close()
+        val seconds: Double = (System.currentTimeMillis - savedTime).toDouble / 1e3
+        logger.debug(s"Copying $libname took $seconds seconds.")
+        tempfile
       }
-      os.close()
-      is.close()
-      val seconds: Double = (System.currentTimeMillis - savedTime).toDouble / 1e3
-      logger.debug(s"Copying $libname took $seconds seconds.")
-      tempfile
     } catch {
       case io: IOException =>
         throw new UnsatisfiedLinkError(s"Could not create temp file for $libname")
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala b/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala
new file mode 100644
index 000000000000..2cf453ac3d18
--- /dev/null
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.util
+
+object OptionConversion {
+  implicit def someWrapper[A](noSome : A) : Option[A] = Option(noSome)
+}
diff --git a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java
index 2659b7848bc6..86c7eb29d2ef 100644
--- a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java
+++ b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java
@@ -40,6 +40,15 @@ public void testCreateNDArray() {
                 new Shape(new int[]{1, 3}),
                 new Context("cpu", 0));
         assertTrue(Arrays.equals(nd.shape().toArray(), arr));
+
+        List<Double> list2 = Arrays.asList(1d, 1d, 1d);
+        nd = NDArray.arrayWithDouble(list2,
+                new Shape(new int[]{1, 3}),
+                new Context("cpu", 0));
+
+        // Float64 assertion
+        assertTrue(nd.dtype() == DType.Float64());
+
     }
 
     @Test
@@ -64,6 +73,12 @@ public void testComparison(){
         nd = nd.subtract(nd2);
         float[] lesser = new float[]{0, 0, 0};
         assertTrue(Arrays.equals(nd.greater(nd2).toArray(), lesser));
+
+        NDArray nd3 = new NDArray(new double[]{1.0, 2.0, 3.0}, new Shape(new int[]{3}), new Context("cpu", 0));
+        nd3 = nd3.add(1.0);
+        double[] smaller = new double[] {2, 3, 4};
+        assertTrue(Arrays.equals(smaller, nd3.toFloat64Array()));
+
     }
 
     @Test
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
index 2ec6f668dbcc..d3969b0ce77d 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
@@ -303,5 +303,32 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     assert(dataDesc(0).layout == Layout.NTC)
     assert(labelDesc(0).dtype == DType.Int32)
     assert(labelDesc(0).layout == Layout.NT)
+
+
+    // Test with passing Float64 hardcoded as Dtype of data
+    val dataIter4 = new NDArrayIter(
+      IO.initDataDesc(data, false, "data", DType.Float64, Layout.NTC),
+      IO.initDataDesc(label, false, "label", DType.Int32, Layout.NT),
+      128, false, "pad")
+    val dataDesc4 = dataIter4.provideDataDesc
+    val labelDesc4 = dataIter4.provideLabelDesc
+    assert(dataDesc4(0).dtype == DType.Float64)
+    assert(dataDesc4(0).layout == Layout.NTC)
+    assert(labelDesc4(0).dtype == DType.Int32)
+    assert(labelDesc4(0).layout == Layout.NT)
+
+    // Test with Float64 coming from the data itself
+    val dataF64 = IndexedSeq(NDArray.ones(shape0, dtype = DType.Float64),
+      NDArray.zeros(shape0, dtype = DType.Float64))
+
+    val dataIter5 = new NDArrayIter(
+      IO.initDataDesc(dataF64, false, "data", DType.Float64, Layout.NTC),
+      IO.initDataDesc(label, false, "label", DType.Int32, Layout.NT),
+      128, false, "pad")
+    val dataDesc5 = dataIter5.provideDataDesc
+    assert(dataDesc5(0).dtype == DType.Float64)
+    assert(dataDesc5(0).dtype != DType.Float32)
+    assert(dataDesc5(0).layout == Layout.NTC)
+
   }
 }
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
index 5d88bb39e502..bc7a0a026bc3 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
@@ -21,7 +21,7 @@ import java.io.File
 import java.util.concurrent.atomic.AtomicInteger
 
 import org.apache.mxnet.NDArrayConversions._
-import org.scalatest.{Matchers, BeforeAndAfterAll, FunSuite}
+import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
 
 class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   private val sequence: AtomicInteger = new AtomicInteger(0)
@@ -29,6 +29,9 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   test("to java array") {
     val ndarray = NDArray.zeros(2, 2)
     assert(ndarray.toArray === Array(0f, 0f, 0f, 0f))
+
+    val float64Array = NDArray.zeros(Shape(2, 2), dtype = DType.Float64)
+    assert(float64Array.toFloat64Array === Array(0d, 0d, 0d, 0d))
   }
 
   test("to scalar") {
@@ -38,8 +41,17 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     assert(ndones.toScalar === 1f)
   }
 
+  test("to float 64 scalar") {
+    val ndzeros = NDArray.zeros(Shape(1), dtype = DType.Float64)
+    assert(ndzeros.toFloat64Scalar === 0d)
+    val ndones = NDArray.ones(Shape(1), dtype = DType.Float64)
+    assert(ndones.toFloat64Scalar === 1d)
+  }
+
   test ("call toScalar on an ndarray which is not a scalar") {
     intercept[Exception] { NDArray.zeros(1, 1).toScalar }
+    intercept[Exception] { NDArray.zeros(shape = Shape (1, 1),
+      dtype = DType.Float64).toFloat64Scalar }
   }
 
   test("size and shape") {
@@ -51,12 +63,20 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   test("dtype") {
     val arr = NDArray.zeros(3, 2)
     assert(arr.dtype === DType.Float32)
+
+    val float64Array = NDArray.zeros(shape = Shape(3, 2), dtype = DType.Float64)
+    assert(float64Array.dtype === DType.Float64)
   }
 
   test("set scalar value") {
     val ndarray = NDArray.empty(2, 1)
     ndarray.set(10f)
     assert(ndarray.toArray === Array(10f, 10f))
+
+    val float64array = NDArray.empty(shape = Shape(2, 1), dtype = DType.Float64)
+    float64array.set(10d)
+    assert(float64array.toFloat64Array === Array(10d, 10d))
+
   }
 
   test("copy from java array") {
@@ -66,19 +86,29 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   }
 
   test("plus") {
-    val ndzeros = NDArray.zeros(2, 1)
-    val ndones = ndzeros + 1f
+    var ndzeros = NDArray.zeros(2, 1)
+    var ndones = ndzeros + 1f
     assert(ndones.toArray === Array(1f, 1f))
     assert((ndones + ndzeros).toArray === Array(1f, 1f))
     assert((1 + ndones).toArray === Array(2f, 2f))
     // in-place
     ndones += ndones
     assert(ndones.toArray === Array(2f, 2f))
+
+    // Float64 method test: every assertion reads back through toFloat64Array
+    ndzeros = NDArray.zeros(shape = Shape(2, 1), dtype = DType.Float64)
+    ndones = ndzeros + 1d
+    assert(ndones.toFloat64Array === Array(1d, 1d))
+    assert((ndones + ndzeros).toFloat64Array === Array(1d, 1d))
+    assert((1d + ndones).toFloat64Array === Array(2d, 2d))
+    // in-place
+    ndones += ndones
+    assert(ndones.toFloat64Array === Array(2d, 2d))
   }
 
   test("minus") {
-    val ndones = NDArray.ones(2, 1)
-    val ndzeros = ndones - 1f
+    var ndones = NDArray.ones(2, 1)
+    var ndzeros = ndones - 1f
     assert(ndzeros.toArray === Array(0f, 0f))
     assert((ndones - ndzeros).toArray === Array(1f, 1f))
     assert((ndzeros - ndones).toArray === Array(-1f, -1f))
@@ -86,23 +116,46 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     // in-place
     ndones -= ndones
     assert(ndones.toArray === Array(0f, 0f))
+
+    // Float64 methods test: operands are created as Float64 explicitly
+    ndones = NDArray.ones(shape = Shape(2, 1), dtype = DType.Float64)
+    ndzeros = ndones - 1d
+    assert(ndzeros.toFloat64Array === Array(0d, 0d))
+    assert((ndones - ndzeros).toFloat64Array === Array(1d , 1d))
+    assert((ndzeros - ndones).toFloat64Array === Array(-1d , -1d))
+    assert((ndones - 1).toFloat64Array === Array(0d, 0d))
+    // in-place
+    ndones -= ndones
+    assert(ndones.toFloat64Array === Array(0d, 0d))
+
   }
 
   test("multiplication") {
-    val ndones = NDArray.ones(2, 1)
-    val ndtwos = ndones * 2
+    var ndones = NDArray.ones(2, 1)
+    var ndtwos = ndones * 2
     assert(ndtwos.toArray === Array(2f, 2f))
     assert((ndones * ndones).toArray === Array(1f, 1f))
     assert((ndtwos * ndtwos).toArray === Array(4f, 4f))
     ndtwos *= ndtwos
     // in-place
     assert(ndtwos.toArray === Array(4f, 4f))
+
+    // Float64 methods test
+    ndones = NDArray.ones(shape = Shape(2, 1), dtype = DType.Float64)
+    ndtwos = ndones * 2d
+    assert(ndtwos.toFloat64Array === Array(2d, 2d))
+    assert((ndones * ndones).toFloat64Array === Array(1d, 1d))
+    assert((ndtwos * ndtwos).toFloat64Array === Array(4d, 4d))
+    ndtwos *= ndtwos
+    // in-place
+    assert(ndtwos.toFloat64Array === Array(4d, 4d))
+
   }
 
   test("division") {
-    val ndones = NDArray.ones(2, 1)
-    val ndzeros = ndones - 1f
-    val ndhalves = ndones / 2
+    var ndones = NDArray.ones(2, 1)
+    var ndzeros = ndones - 1f
+    var ndhalves = ndones / 2
     assert(ndhalves.toArray === Array(0.5f, 0.5f))
     assert((ndhalves / ndhalves).toArray === Array(1f, 1f))
     assert((ndones / ndones).toArray === Array(1f, 1f))
@@ -110,37 +163,75 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     ndhalves /= ndhalves
     // in-place
     assert(ndhalves.toArray === Array(1f, 1f))
+
+    // Float64 methods test
+    ndones = NDArray.ones(shape = Shape (2, 1), dtype = DType.Float64)
+    ndzeros = ndones - 1d
+    ndhalves = ndones / 2d
+    assert(ndhalves.toFloat64Array === Array(0.5d, 0.5d))
+    assert((ndhalves / ndhalves).toFloat64Array === Array(1d, 1d))
+    assert((ndones / ndones).toFloat64Array === Array(1d, 1d))
+    assert((ndzeros / ndones).toFloat64Array === Array(0d, 0d))
+    ndhalves /= ndhalves
+    // in-place
+    assert(ndhalves.toFloat64Array === Array(1d, 1d))
   }
 
   test("full") {
-    val arr = NDArray.full(Shape(1, 2), 3f)
+    var arr = NDArray.full(Shape(1, 2), 3f)
     assert(arr.shape === Shape(1, 2))
     assert(arr.toArray === Array(3f, 3f))
+
+    // Float64 methods test
+    arr = NDArray.full(Shape(1, 2), value = 5d, Context.cpu())
+    assert(arr.toFloat64Array === Array (5d, 5d))
   }
 
   test("clip") {
-    val ndarray = NDArray.empty(3, 2)
+    var ndarray = NDArray.empty(3, 2)
     ndarray.set(Array(1f, 2f, 3f, 4f, 5f, 6f))
     assert(NDArray.clip(ndarray, 2f, 5f).toArray === Array(2f, 2f, 3f, 4f, 5f, 5f))
+
+    // Float64 methods test
+    ndarray = NDArray.empty(shape = Shape(3, 2), dtype = DType.Float64)
+    ndarray.set(Array(1d, 2d, 3d, 4d, 5d, 6d))
+    assert(NDArray.clip(ndarray, 2d, 5d).toFloat64Array === Array(2d, 2d, 3d, 4d, 5d, 5d))
   }
 
   test("sqrt") {
-    val ndarray = NDArray.empty(4, 1)
+    var ndarray = NDArray.empty(4, 1)
     ndarray.set(Array(0f, 1f, 4f, 9f))
     assert(NDArray.sqrt(ndarray).toArray === Array(0f, 1f, 2f, 3f))
+
+    // Float64 methods test
+    ndarray = NDArray.empty(shape = Shape(4, 1), dtype = DType.Float64)
+    ndarray.set(Array(0d, 1d, 4d, 9d))
+    assert(NDArray.sqrt(ndarray).toFloat64Array === Array(0d, 1d, 2d, 3d))
   }
 
   test("rsqrt") {
-    val ndarray = NDArray.array(Array(1f, 4f), shape = Shape(2, 1))
+    var ndarray = NDArray.array(Array(1f, 4f), shape = Shape(2, 1))
     assert(NDArray.rsqrt(ndarray).toArray === Array(1f, 0.5f))
+
+    // Float64 methods test
+    ndarray = NDArray.array(Array(1d, 4d, 25d), shape = Shape(3, 1), Context.cpu())
+    assert(NDArray.rsqrt(ndarray).toFloat64Array === Array(1d, 0.5d, 0.2d))
   }
 
   test("norm") {
-    val ndarray = NDArray.empty(3, 1)
+    var ndarray = NDArray.empty(3, 1)
     ndarray.set(Array(1f, 2f, 3f))
-    val normed = NDArray.norm(ndarray)
+    var normed = NDArray.norm(ndarray)
     assert(normed.shape === Shape(1))
     assert(normed.toScalar === math.sqrt(14.0).toFloat +- 1e-3f)
+
+    // Float64 methods test
+    ndarray = NDArray.empty(shape = Shape(3, 1), dtype = DType.Float64)
+    ndarray.set(Array(1d, 2d, 3d))
+    normed = NDArray.norm(ndarray)
+    assert(normed.get.dtype === DType.Float64)
+    assert(normed.shape === Shape(1))
+    assert(normed.toFloat64Scalar === math.sqrt(14.0) +- 1e-3d)
   }
 
   test("one hot encode") {
@@ -176,25 +267,26 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   }
 
   test("power") {
-    val arr = NDArray.array(Array(3f, 5f), shape = Shape(2, 1))
+    var arr = NDArray.array(Array(3f, 5f), shape = Shape(2, 1))
 
-    val arrPower1 = NDArray.power(2f, arr)
+    var arrPower1 = NDArray.power(2f, arr)
     assert(arrPower1.shape === Shape(2, 1))
     assert(arrPower1.toArray === Array(8f, 32f))
 
-    val arrPower2 = NDArray.power(arr, 2f)
+    var arrPower2 = NDArray.power(arr, 2f)
     assert(arrPower2.shape === Shape(2, 1))
     assert(arrPower2.toArray === Array(9f, 25f))
 
-    val arrPower3 = NDArray.power(arr, arr)
+    var arrPower3 = NDArray.power(arr, arr)
     assert(arrPower3.shape === Shape(2, 1))
     assert(arrPower3.toArray === Array(27f, 3125f))
 
-   val arrPower4 = arr ** 2f
+    var arrPower4 = arr ** 2f
+
     assert(arrPower4.shape === Shape(2, 1))
     assert(arrPower4.toArray === Array(9f, 25f))
 
-    val arrPower5 = arr ** arr
+    var arrPower5 = arr ** arr
     assert(arrPower5.shape === Shape(2, 1))
     assert(arrPower5.toArray === Array(27f, 3125f))
 
@@ -206,84 +298,211 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     arr **= arr
     assert(arr.shape === Shape(2, 1))
     assert(arr.toArray === Array(27f, 3125f))
+
+    // Float64 tests
+    arr = NDArray.array(Array(3d, 5d), shape = Shape(2, 1))
+
+    arrPower1 = NDArray.power(2d, arr)
+    assert(arrPower1.shape === Shape(2, 1))
+    assert(arrPower1.dtype === DType.Float64)
+    assert(arrPower1.toFloat64Array === Array(8d, 32d))
+
+    arrPower2 = NDArray.power(arr, 2d)
+    assert(arrPower2.shape === Shape(2, 1))
+    assert(arrPower2.dtype === DType.Float64)
+    assert(arrPower2.toFloat64Array === Array(9d, 25d))
+
+    arrPower3 = NDArray.power(arr, arr)
+    assert(arrPower3.shape === Shape(2, 1))
+    assert(arrPower3.dtype === DType.Float64)
+    assert(arrPower3.toFloat64Array === Array(27d, 3125d))
+
+    arrPower4 = arr ** 2f
+    assert(arrPower4.shape === Shape(2, 1))
+    assert(arrPower4.dtype === DType.Float64)
+    assert(arrPower4.toFloat64Array === Array(9d, 25d))
+
+    arrPower5 = arr ** arr
+    assert(arrPower5.shape === Shape(2, 1))
+    assert(arrPower5.dtype === DType.Float64)
+    assert(arrPower5.toFloat64Array === Array(27d, 3125d))
+
+    arr **= 2d
+    assert(arr.shape === Shape(2, 1))
+    assert(arr.dtype === DType.Float64)
+    assert(arr.toFloat64Array === Array(9d, 25d))
+
+    arr.set(Array(3d, 5d))
+    arr **= arr
+    assert(arr.shape === Shape(2, 1))
+    assert(arr.dtype === DType.Float64)
+    assert(arr.toFloat64Array === Array(27d, 3125d))
   }
 
   test("equal") {
-    val arr1 = NDArray.array(Array(1f, 2f, 3f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 3f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = NDArray.equal(arr1, arr2)
+    var arrEqual1 = NDArray.equal(arr1, arr2)
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(1f, 0f, 1f, 0f))
 
-    val arrEqual2 = NDArray.equal(arr1, 3f)
+    var arrEqual2 = NDArray.equal(arr1, 3f)
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(0f, 0f, 1f, 0f))
+
+
+    // Float64 methods test
+    arr1 = NDArray.array(Array(1d, 2d, 3d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = NDArray.equal(arr1, arr2)
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(1d, 0d, 1d, 0d))
+
+    arrEqual2 = NDArray.equal(arr1, 3d)
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(0d, 0d, 1d, 0d))
   }
 
   test("not_equal") {
-    val arr1 = NDArray.array(Array(1f, 2f, 3f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 3f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = NDArray.notEqual(arr1, arr2)
+    var arrEqual1 = NDArray.notEqual(arr1, arr2)
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(0f, 1f, 0f, 1f))
 
-    val arrEqual2 = NDArray.notEqual(arr1, 3f)
+    var arrEqual2 = NDArray.notEqual(arr1, 3f)
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(1f, 1f, 0f, 1f))
+
+    // Float64 methods test
+
+    arr1 = NDArray.array(Array(1d, 2d, 3d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = NDArray.notEqual(arr1, arr2)
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(0d, 1d, 0d, 1d))
+
+    arrEqual2 = NDArray.notEqual(arr1, 3d)
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(1d, 1d, 0d, 1d))
+
   }
 
   test("greater") {
-    val arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = arr1 > arr2
+    var arrEqual1 = arr1 > arr2
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(0f, 0f, 1f, 0f))
 
-    val arrEqual2 = arr1 > 2f
+    var arrEqual2 = arr1 > 2f
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(0f, 0f, 1f, 1f))
+
+    // Float64 methods test
+    arr1 = NDArray.array(Array(1d, 2d, 4d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = arr1 > arr2
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(0d, 0d, 1d, 0d))
+
+    arrEqual2 = arr1 > 2d
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(0d, 0d, 1d, 1d))
   }
 
   test("greater_equal") {
-    val arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = arr1 >= arr2
+    var arrEqual1 = arr1 >= arr2
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(1f, 0f, 1f, 0f))
 
-    val arrEqual2 = arr1 >= 2f
+    var arrEqual2 = arr1 >= 2f
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(0f, 1f, 1f, 1f))
+
+    // Float64 methods test
+    arr1 = NDArray.array(Array(1d, 2d, 4d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = arr1 >= arr2
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(1d, 0d, 1d, 0d))
+
+    arrEqual2 = arr1 >= 2d
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(0d, 1d, 1d, 1d))
   }
 
   test("lesser") {
-    val arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = arr1 < arr2
+    var arrEqual1 = arr1 < arr2
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(0f, 1f, 0f, 1f))
 
-    val arrEqual2 = arr1 < 2f
+    var arrEqual2 = arr1 < 2f
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(1f, 0f, 0f, 0f))
+
+    // Float64 methods test
+    arr1 = NDArray.array(Array(1d, 2d, 4d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = arr1 < arr2
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(0d, 1d, 0d, 1d))
+
+    arrEqual2 = arr1 < 2d
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(1d, 0d, 0d, 0d))
+
   }
 
   test("lesser_equal") {
-    val arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
-    val arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
+    var arr1 = NDArray.array(Array(1f, 2f, 4f, 5f), shape = Shape(2, 2))
+    var arr2 = NDArray.array(Array(1f, 4f, 3f, 6f), shape = Shape(2, 2))
 
-    val arrEqual1 = arr1 <= arr2
+    var arrEqual1 = arr1 <= arr2
     assert(arrEqual1.shape === Shape(2, 2))
     assert(arrEqual1.toArray === Array(1f, 1f, 0f, 1f))
 
-    val arrEqual2 = arr1 <= 2f
+    var arrEqual2 = arr1 <= 2f
     assert(arrEqual2.shape === Shape(2, 2))
     assert(arrEqual2.toArray === Array(1f, 1f, 0f, 0f))
+
+    // Float64 methods test
+    arr1 = NDArray.array(Array(1d, 2d, 4d, 5d), shape = Shape(2, 2))
+    arr2 = NDArray.array(Array(1d, 4d, 3d, 6d), shape = Shape(2, 2))
+
+    arrEqual1 = arr1 <= arr2
+    assert(arrEqual1.shape === Shape(2, 2))
+    assert(arrEqual1.dtype === DType.Float64)
+    assert(arrEqual1.toFloat64Array === Array(1d, 1d, 0d, 1d))
+
+    arrEqual2 = arr1 <= 2d
+    assert(arrEqual2.shape === Shape(2, 2))
+    assert(arrEqual2.dtype === DType.Float64)
+    assert(arrEqual2.toFloat64Array === Array(1d, 1d, 0d, 0d))
   }
 
   test("choose_element_0index") {
@@ -294,11 +513,18 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
   }
 
   test("copy to") {
-    val source = NDArray.array(Array(1f, 2f, 3f), shape = Shape(1, 3))
-    val dest = NDArray.empty(1, 3)
+    var source = NDArray.array(Array(1f, 2f, 3f), shape = Shape(1, 3))
+    var dest = NDArray.empty(1, 3)
     source.copyTo(dest)
     assert(dest.shape === Shape(1, 3))
     assert(dest.toArray === Array(1f, 2f, 3f))
+
+    // Float64: copy into a destination allocated as Float64 and check dtype and values
+    source = NDArray.array(Array(1d, 2d, 3d), shape = Shape(1, 3))
+    dest = NDArray.empty(shape = Shape(1, 3), dtype = DType.Float64)
+    source.copyTo(dest)
+    assert(dest.dtype === DType.Float64)
+    assert(dest.toFloat64Array === Array(1d, 2d, 3d))
   }
 
   test("abs") {
@@ -365,6 +591,12 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val arr = NDArray.maximum(arr1, arr2)
     assert(arr.shape === Shape(3, 1))
     assert(arr.toArray === Array(4f, 2.1f, 3.7f))
+
+    // Float64 array against a Double scalar; values are read back via toFloat64Array
+    val arr3 = NDArray.array(Array(1d, 2d, 3d), shape = Shape(3, 1))
+    val maxArr = NDArray.maximum(arr3, 10d)
+    assert(maxArr.shape === Shape(3, 1))
+    assert(maxArr.toFloat64Array === Array(10d, 10d, 10d))
   }
 
   test("min") {
@@ -378,11 +610,18 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val arr = NDArray.minimum(arr1, arr2)
     assert(arr.shape === Shape(3, 1))
     assert(arr.toArray === Array(1.5f, 1f, 3.5f))
+
+    // Float64 array against a Double scalar; values are read back via toFloat64Array
+    val arr3 = NDArray.array(Array(4d, 5d, 6d), shape = Shape(3, 1))
+    val minArr = NDArray.minimum(arr3, 5d)
+    assert(minArr.shape === Shape(3, 1))
+    assert(minArr.toFloat64Array === Array(4d, 5d, 5d))
   }
 
   test("sum") {
-    val arr = NDArray.array(Array(1f, 2f, 3f, 4f), shape = Shape(2, 2))
+    var arr = NDArray.array(Array(1f, 2f, 3f, 4f), shape = Shape(2, 2))
     assert(NDArray.sum(arr).toScalar === 10f +- 1e-3f)
+
   }
 
   test("argmaxChannel") {
@@ -398,6 +637,12 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val arr = NDArray.concatenate(arr1, arr2)
     assert(arr.shape === Shape(3, 3))
     assert(arr.toArray === Array(1f, 2f, 4f, 3f, 3f, 3f, 8f, 7f, 6f))
+
+    // Concatenating Float32 with Float64 must throw; only the concatenate call is intercepted
+    val arrFloat64 = NDArray.array(Array(5d, 6d, 7d), shape = Shape(1, 3))
+    intercept[Exception] {
+      NDArray.concatenate(Array(arr1, arrFloat64))
+    }
   }
 
   test("concatenate axis-1") {
@@ -406,6 +651,12 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val arr = NDArray.concatenate(Array(arr1, arr2), axis = 1)
     assert(arr.shape === Shape(2, 3))
     assert(arr.toArray === Array(1f, 2f, 5f, 3f, 4f, 6f))
+
+    // Concatenating Float32 with Float64 must throw; only the concatenate call is intercepted
+    val arrFloat64 = NDArray.array(Array(5d, 6d), shape = Shape(2, 1))
+    intercept[Exception] {
+      NDArray.concatenate(Array(arr1, arrFloat64), axis = 1)
+    }
   }
 
   test("transpose") {
@@ -428,6 +679,24 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
       val loadedArray = arrays(0)
       assert(loadedArray.shape === Shape(3, 1))
       assert(loadedArray.toArray === Array(1f, 2f, 3f))
+      assert(loadedArray.dtype === DType.Float32)
+    } finally {
+      val file = new File(filename)
+      file.delete()
+    }
+
+    // Same round trip for a Float64 array: dtype and double values must survive save/load
+    try {
+      val ndarray = NDArray.array(Array(1d, 2d, 3d), shape = Shape(3, 1), ctx = Context.cpu())
+      NDArray.save(filename, Map("local" -> ndarray))
+      val (keys, arrays) = NDArray.load(filename)
+      assert(keys.length === 1)
+      assert(keys(0) === "local")
+      assert(arrays.length === 1)
+      val loadedArray = arrays(0)
+      assert(loadedArray.shape === Shape(3, 1))
+      assert(loadedArray.toFloat64Array === Array(1d, 2d, 3d))
+      assert(loadedArray.dtype === DType.Float64)
     } finally {
       val file = new File(filename)
       file.delete()
@@ -446,6 +715,24 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
       val loadedArray = arrays(0)
       assert(loadedArray.shape === Shape(3, 1))
       assert(loadedArray.toArray === Array(1f, 2f, 3f))
+      assert(loadedArray.dtype === DType.Float32)
+    } finally {
+      val file = new File(filename)
+      file.delete()
+    }
+
+    // Same round trip for a Float64 array: dtype and double values must survive save/load
+
+    try {
+      val ndarray = NDArray.array(Array(1d, 2d, 3d), shape = Shape(3, 1), ctx = Context.cpu())
+      NDArray.save(filename, Array(ndarray))
+      val (keys, arrays) = NDArray.load(filename)
+      assert(keys.length === 0)
+      assert(arrays.length === 1)
+      val loadedArray = arrays(0)
+      assert(loadedArray.shape === Shape(3, 1))
+      assert(loadedArray.toFloat64Array === Array(1d, 2d, 3d))
+      assert(loadedArray.dtype === DType.Float64)
     } finally {
       val file = new File(filename)
       file.delete()
@@ -464,9 +751,11 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val ndarray2 = NDArray.array(Array(1f, 2f, 3f), shape = Shape(3, 1))
     val ndarray3 = NDArray.array(Array(1f, 2f, 3f), shape = Shape(1, 3))
     val ndarray4 = NDArray.array(Array(3f, 2f, 3f), shape = Shape(3, 1))
+    val ndarray5 = NDArray.array(Array(3d, 2d, 3d), shape = Shape(3, 1), ctx = Context.cpu())
     ndarray1 shouldEqual ndarray2
     ndarray1 shouldNot equal(ndarray3)
     ndarray1 shouldNot equal(ndarray4)
+    ndarray5 shouldNot equal(ndarray3)
   }
 
   test("slice") {
@@ -545,6 +834,7 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     val bytes = arr.serialize()
     val arrCopy = NDArray.deserialize(bytes)
     assert(arr === arrCopy)
+    assert(arrCopy.dtype === DType.Float32)
   }
 
   test("dtype int32") {
@@ -576,4 +866,38 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers {
     assert(arr.internal.toDoubleArray === Array(2d, 2d))
     assert(arr.internal.toByteArray === Array(2.toByte, 2.toByte))
   }
+
+  test("NDArray random module is generated properly") {
+    val lam = NDArray.ones(1, 2)  // tensor-valued lam of shape (1, 2)
+    val rnd = NDArray.random.poisson(lam = Some(lam), shape = Some(Shape(3, 4)))
+    val rnd2 = NDArray.random.poisson(lam = Some(1f), shape = Some(Shape(3, 4)),
+      dtype = Some("float64"))  // scalar lam with an explicit output dtype
+    assert(rnd.shape === Shape(1, 2, 3, 4))  // lam's shape (1, 2) followed by `shape`
+    assert(rnd2.shape === Shape(3, 4))
+    assert(rnd2.head.dtype === DType.Float64)
+  }
+
+  test("NDArray random module is generated properly - special case of 'normal'") {
+    val mu = NDArray.ones(1, 2)  // tensor-valued mu of shape (1, 2)
+    val sigma = NDArray.ones(1, 2) * 2  // tensor-valued sigma, same shape as mu
+    val rnd = NDArray.random.normal(mu = Some(mu), sigma = Some(sigma), shape = Some(Shape(3, 4)))
+    val rnd2 = NDArray.random.normal(mu = Some(1f), sigma = Some(2f), shape = Some(Shape(3, 4)),
+      dtype = Some("float64"))  // scalar mu/sigma with an explicit output dtype
+    assert(rnd.shape === Shape(1, 2, 3, 4))  // parameter shape (1, 2) followed by `shape`
+    assert(rnd2.shape === Shape(3, 4))
+    assert(rnd2.head.dtype === DType.Float64)
+  }
+
+  test("Generated api") {
+    // Without OptionConversion: optional arguments are wrapped in Some(...) explicitly
+    val arr3 = NDArray.ones(Shape(1, 2), dtype = DType.Float64)
+    val arr4 = NDArray.ones(Shape(1), dtype = DType.Float64)
+    val arr5 = NDArray.api.norm(arr3, ord = Some(1), out = Some(arr4))
+    // With OptionConversion imported: plain values are passed for the optional arguments
+    import org.apache.mxnet.util.OptionConversion._
+    val arr = NDArray.ones(Shape(1, 2), dtype = DType.Float64)
+    val arr2 = NDArray.ones(Shape(1), dtype = DType.Float64)
+    NDArray.api.norm(arr, ord = 1, out = arr2)
+    val result = NDArray.api.dot(arr2, arr2)  // NOTE(review): smoke test, nothing is asserted
+  }
 }
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
index ebb61d7d4bfb..d134c83ff7e7 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
@@ -20,6 +20,7 @@ package org.apache.mxnet
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
 
 class SymbolSuite extends FunSuite with BeforeAndAfterAll {
+
   test("symbol compose") {
     val data = Symbol.Variable("data")
 
@@ -71,4 +72,25 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll {
     val data2 = data.clone()
     assert(data.toJson === data2.toJson)
   }
+
+  test("Symbol random module is generated properly") {
+    val lam = Symbol.Variable("lam")  // NOTE(review): smoke test, output is only printed
+    val rnd = Symbol.random.poisson(lam = Some(lam), shape = Some(Shape(2, 2)))  // symbolic lam
+    val rnd2 = Symbol.random.poisson(lam = Some(1f), shape = Some(Shape(2, 2)))  // scalar lam
+    // scalastyle:off println
+    println(s"Symbol.random.poisson debug info: ${rnd.debugStr}")
+    println(s"Symbol.random.poisson debug info: ${rnd2.debugStr}")
+    // scalastyle:on println
+  }
+
+  test("Symbol random module is generated properly - special case of 'normal'") {
+    val loc = Symbol.Variable("loc")  // symbolic input passed as mu below
+    val scale = Symbol.Variable("scale")  // symbolic input passed as sigma below
+    val rnd = Symbol.random.normal(mu = Some(loc), sigma = Some(scale), shape = Some(Shape(2, 2)))
+    val rnd2 = Symbol.random.normal(mu = Some(1f), sigma = Some(2f), shape = Some(Shape(2, 2)))
+    // scalastyle:off println
+    println(s"Symbol.random.sample_normal debug info: ${rnd.debugStr}")  // symbolic mu/sigma
+    println(s"Symbol.random.random_normal debug info: ${rnd2.debugStr}")  // scalar mu/sigma
+    // scalastyle:on println
+  }
 }
diff --git a/scala-package/deploy/pom.xml b/scala-package/deploy/pom.xml
new file mode 100644
index 000000000000..c51aa9a92090
--- /dev/null
+++ b/scala-package/deploy/pom.xml
@@ -0,0 +1,140 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.mxnet</groupId>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <artifactId>mxnet-deployment</artifactId>
+  <version>${revision}</version>
+  <name>MXNet Scala Package - Full ${platform}-only</name>
+  <packaging>pom</packaging>
+  <description>
+    Scala Package for Apache MXNet (Incubating) - flexible and efficient library for deep learning.
+  </description>
+
+  <properties> <!-- Defaults; the "staging" and "nightly" profiles below override some of these -->
+    <ARTIFACT_ID>mxnet-full_2.11-${platform}-${flavor}</ARTIFACT_ID> <!-- artifactId used by the deploy-file execution and written into the generated deploy.xml -->
+    <revision>1.5.0-SNAPSHOT</revision>
+    <repositoryId>apache.snapshots.https</repositoryId>
+    <repo_url>file://${project.build.directory}/repo</repo_url> <!-- default target: a directory under this module's build directory -->
+    <deploy_asc_types/> <!-- the three deploy_asc_* values are empty by default; the "staging" profile fills in the .asc files -->
+    <deploy_asc_classifers/> <!-- NOTE(review): "classifers" is misspelled but referenced consistently; rename only together with every use -->
+    <deploy_asc_files/>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.mxnet</groupId>
+      <artifactId>mxnet-full_2.11</artifactId>
+      <version>INTERNAL</version>
+    </dependency>
+  </dependencies>
+
+  <profiles>
+    <profile>
+      <id>staging</id> <!-- release build: non-SNAPSHOT revision, staging repository URL, .asc files deployed alongside the jars -->
+      <properties>
+        <revision>1.5.0</revision>
+        <repositoryId>apache.releases.https</repositoryId>
+        <repo_url>https://repository.apache.org/content/repositories/staging</repo_url>
+        <deploy_asc_types>jar.asc,asc,asc</deploy_asc_types>
+        <deploy_asc_classifers>,sources.jar,javadoc.jar</deploy_asc_classifers> <!-- NOTE(review): the file list below uses ../assembly/target while the deploy-file execution uses ${rootdir}/assembly/target; confirm both resolve to the same directory -->
+        <deploy_asc_files>
+          ../assembly/target/mxnet-full_2.11-INTERNAL.jar.asc,../assembly/target/mxnet-full_2.11-INTERNAL-src.jar.asc,../assembly/target/mxnet-full_2.11-INTERNAL-bundle.jar.asc
+        </deploy_asc_files>
+      </properties>
+    </profile>
+    <profile>
+      <id>nightly</id> <!-- keeps the default SNAPSHOT revision; only points the deployment at the snapshots repository URL -->
+      <properties>
+        <repositoryId>apache.snapshots.https</repositoryId>
+        <repo_url>https://repository.apache.org/content/repositories/snapshots</repo_url>
+      </properties>
+    </profile>
+  </profiles>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <configuration>
+          <attach>false</attach>
+        </configuration>
+      </plugin>
+
+      <plugin> <!-- Fills the placeholder tokens of the deploy.xml template; the output file is the pomFile of the deploy-file execution below -->
+        <groupId>com.google.code.maven-replacer-plugin</groupId>
+        <artifactId>replacer</artifactId>
+        <version>1.5.3</version>
+        <executions>
+          <execution>
+            <phase>deploy</phase> <!-- same phase as deploy-file below; presumably relies on declaration order to run first (verify) -->
+            <goals>
+              <goal>replace</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <file>${basedir}/src/main/deploy/deploy.xml</file>
+          <outputFile>${project.build.directory}/deploy.xml</outputFile>
+          <replacements>
+            <replacement>
+              <token>DESCRIPTION</token>
+              <value>${project.description}</value>
+            </replacement>
+            <replacement>
+              <token>ARTIFACT_ID</token>
+              <value>${ARTIFACT_ID}</value>
+            </replacement>
+            <replacement>
+              <token>PROJECT_VERSION</token>
+              <value>${project.version}</value>
+            </replacement>
+            <replacement>
+              <token>SCALA_VERSION</token>
+              <value>${scala.version}</value>
+            </replacement>
+          </replacements>
+        </configuration>
+      </plugin>
+
+      <plugin> <!-- Deploys the jars found under the assembly module's target directory as ${ARTIFACT_ID}, using the generated deploy.xml as POM -->
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-deploy-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>deploy-file</id>
+            <phase>deploy</phase>
+            <goals>
+              <goal>deploy-file</goal>
+            </goals>
+            <configuration>
+              <description>${project.description}</description>
+              <repositoryId>${repositoryId}</repositoryId>
+              <url>${repo_url}</url>
+              <groupId>${project.groupId}</groupId>
+              <artifactId>${ARTIFACT_ID}</artifactId>
+              <version>${project.version}</version>
+              <packaging>jar</packaging>
+              <pomFile>${project.build.directory}/deploy.xml</pomFile>
+              <file>${rootdir}/assembly/target/mxnet-full_2.11-INTERNAL.jar</file>
+              <sources>${rootdir}/assembly/target/mxnet-full_2.11-INTERNAL-src.jar</sources>
+              <javadoc>${rootdir}/assembly/target/mxnet-full_2.11-INTERNAL-bundle.jar</javadoc> <!-- NOTE(review): the "bundle" jar is published in the javadoc slot; confirm this is intended -->
+              <types>${deploy_asc_types}</types> <!-- these three stay empty unless the "staging" profile supplies the .asc files -->
+              <classifiers>${deploy_asc_classifers}</classifiers>
+              <files>${deploy_asc_files}</files>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
diff --git a/scala-package/deploy/src/main/deploy/deploy.xml b/scala-package/deploy/src/main/deploy/deploy.xml
new file mode 100644
index 000000000000..7bfd20737537
--- /dev/null
+++ b/scala-package/deploy/src/main/deploy/deploy.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project
+    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
+    xmlns="http://maven.apache.org/POM/4.0.0"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+  <modelVersion>4.0.0</modelVersion> <!-- Template POM: the upper-case placeholders below are substituted by the replacer plugin configured in the deploy module's pom.xml -->
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>ARTIFACT_ID</artifactId>
+  <version>PROJECT_VERSION</version>
+  <description>DESCRIPTION</description>
+  <dependencies>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>SCALA_VERSION</version>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-reflect</artifactId>
+      <version>SCALA_VERSION</version>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang.modules</groupId>
+      <artifactId>scala-parser-combinators_2.11</artifactId> <!-- NOTE(review): Scala binary suffix and version are hard-coded here, unlike the templated entries -->
+      <version>1.0.4</version>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-compiler</artifactId>
+      <version>SCALA_VERSION</version>
+    </dependency>
+  </dependencies>
+</project>
diff --git a/scala-package/dev/compile-mxnet-backend.sh b/scala-package/dev/compile-mxnet-backend.sh
index b065e01afc8e..114bf0766444 100755
--- a/scala-package/dev/compile-mxnet-backend.sh
+++ b/scala-package/dev/compile-mxnet-backend.sh
@@ -33,7 +33,7 @@ MXNETDIR=$2
 
 # below routine shamelessly copied from
 # /~https://github.com/apache/incubator-mxnet/blob/master/setup-utils/install-mxnet-osx-python.sh
-# This routine executes a command, 
+# This routine executes a command,
 # prints error message on the console on non-zero exit codes and
 # returns the exit code to the caller.
 chkret() {
@@ -51,7 +51,10 @@ chkret() {
 
 UNAME=`uname -s`
 chkret pushd $MXNETDIR
-chkret git submodule update --init --recursive
+
+# Best effort: report a failed submodule update but keep going, without toggling errexit
+git submodule update --init --recursive || \
+  echo "WARNING: 'git submodule update --init --recursive' failed; continuing" >&2
 
 # don't want to overwrite an existing config file
 cp make/config.mk ./config.mk
diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml
index 3ebb39b9a67e..564102a9f696 100644
--- a/scala-package/examples/pom.xml
+++ b/scala-package/examples/pom.xml
@@ -5,88 +5,18 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-examples_2.11</artifactId>
+  <artifactId>mxnet-examples</artifactId>
   <name>MXNet Scala Package - Examples</name>
 
   <properties>
     <skipTests>true</skipTests>
-    <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
   </properties>
 
-  <profiles>
-    <profile>
-      <id>integrationtest</id>
-      <properties>
-        <skipTests>false</skipTests>
-      </properties>
-    </profile>
-    <profile>
-      <id>osx-x86_64-cpu</id>
-      <properties>
-        <platform>osx-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <properties>
-        <platform>linux-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <properties>
-        <platform>linux-x86_64-gpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>deployLocal</id>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-deploy-plugin</artifactId>
-            <configuration>
-              <skip>false</skip>
-            </configuration>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
-    <profile>
-      <id>release</id>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-source-plugin</artifactId>
-            <configuration>
-              <skipSource>true</skipSource>
-            </configuration>
-          </plugin>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-gpg-plugin</artifactId>
-            <configuration>
-              <skip>true</skip>
-            </configuration>
-          </plugin>
-          <plugin>
-            <groupId>org.sonatype.plugins</groupId>
-            <artifactId>nexus-staging-maven-plugin</artifactId>
-            <configuration>
-              <skipNexusStagingDeployMojo>true</skipNexusStagingDeployMojo>
-            </configuration>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
-  </profiles>
-
   <build>
     <plugins>
       <plugin>
@@ -149,12 +79,9 @@
         <configuration>
           <skipTests>${skipTests}</skipTests>
           <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
+            -Djava.library.path=${project.parent.basedir}/native/target \
             -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
           </argLine>
-          <environmentVariables>
-            <LD_LIBRARY_PATH>${MXNET_DIR}/lib</LD_LIBRARY_PATH>
-          </environmentVariables>
         </configuration>
       </plugin>
       <plugin>
@@ -166,14 +93,14 @@
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-core</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-infer_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-infer</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
diff --git a/scala-package/examples/scripts/infer/imageclassifier/get_resnet_18_data.sh b/scala-package/examples/scripts/infer/imageclassifier/get_resnet_18_data.sh
index 1ce996e5c851..b12993210ed2 100755
--- a/scala-package/examples/scripts/infer/imageclassifier/get_resnet_18_data.sh
+++ b/scala-package/examples/scripts/infer/imageclassifier/get_resnet_18_data.sh
@@ -34,8 +34,8 @@ if [ ! -d "$image_path" ]; then
 fi
 
 if [ ! -f "$data_path" ]; then
-  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -o "$data_path/resnet-18-symbol.json"  # -f: fail on HTTP errors instead of saving the error page; -L: follow redirects
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -o "$data_path/resnet-18-0000.params"
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -o "$data_path/synset.txt"
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -o "$image_path/kitten.jpg"
 fi
diff --git a/scala-package/examples/scripts/infer/imageclassifier/get_resnet_data.sh b/scala-package/examples/scripts/infer/imageclassifier/get_resnet_data.sh
index 6fd85e4f4400..dc37bdbd73ef 100755
--- a/scala-package/examples/scripts/infer/imageclassifier/get_resnet_data.sh
+++ b/scala-package/examples/scripts/infer/imageclassifier/get_resnet_data.sh
@@ -34,8 +34,8 @@ if [ ! -d "$image_path" ]; then
 fi
 
 if [ ! -f "$data_path" ]; then
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-0000.params -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-symbol.json -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/synset.txt -P $data_path
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-0000.params -o "$data_path/resnet-152-0000.params"  # -f: fail on HTTP errors instead of saving the error page; -L: follow redirects
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-symbol.json -o "$data_path/resnet-152-symbol.json"
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/synset.txt -o "$data_path/synset.txt"
+  curl -fL --create-dirs https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -o "$image_path/kitten.jpg"
 fi
diff --git a/scala-package/examples/scripts/infer/objectdetector/get_ssd_data.sh b/scala-package/examples/scripts/infer/objectdetector/get_ssd_data.sh
index 8787d6382204..e901fc80792f 100755
--- a/scala-package/examples/scripts/infer/objectdetector/get_ssd_data.sh
+++ b/scala-package/examples/scripts/infer/objectdetector/get_ssd_data.sh
@@ -35,12 +35,11 @@ if [ ! -d "$image_path" ]; then
 fi
 
 if [ ! -f "$data_path" ]; then
-    wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-symbol.json -P $data_path
-    wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-0000.params -P $data_path
-    wget https://s3.amazonaws.com/model-server/models/resnet50_ssd/synset.txt -P $data_path
-    cd $image_path
-    wget https://cloud.githubusercontent.com/assets/3307514/20012566/cbb53c76-a27d-11e6-9aaa-91939c9a1cd5.jpg -O 000001.jpg
-    wget https://cloud.githubusercontent.com/assets/3307514/20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg -O dog.jpg
-    wget https://cloud.githubusercontent.com/assets/3307514/20012563/cbb41382-a27d-11e6-92a9-18dab4fd1ad3.jpg -O person.jpg
+    curl -fL --create-dirs https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-symbol.json -o "$data_path/resnet50_ssd_model-symbol.json"  # -f: fail on HTTP errors instead of saving the error page; -L: follow redirects
+    curl -fL --create-dirs https://s3.amazonaws.com/model-server/models/resnet50_ssd/resnet50_ssd_model-0000.params -o "$data_path/resnet50_ssd_model-0000.params"
+    curl -fL --create-dirs https://s3.amazonaws.com/model-server/models/resnet50_ssd/synset.txt -o "$data_path/synset.txt"
+    curl -fL --create-dirs https://cloud.githubusercontent.com/assets/3307514/20012566/cbb53c76-a27d-11e6-9aaa-91939c9a1cd5.jpg -o "$image_path/000001.jpg"
+    curl -fL --create-dirs https://cloud.githubusercontent.com/assets/3307514/20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg -o "$image_path/dog.jpg"
+    curl -fL --create-dirs https://cloud.githubusercontent.com/assets/3307514/20012563/cbb41382-a27d-11e6-92a9-18dab4fd1ad3.jpg -o "$image_path/person.jpg"
 fi
 
diff --git a/scala-package/examples/scripts/module/mnist_mlp.sh b/scala-package/examples/scripts/module/mnist_mlp.sh
index 29306706dbbe..e16e16e246b2 100755
--- a/scala-package/examples/scripts/module/mnist_mlp.sh
+++ b/scala-package/examples/scripts/module/mnist_mlp.sh
@@ -18,7 +18,7 @@
 # under the License.
 
 ROOT_DIR=$(cd `dirname $0`/../../..; pwd)
-CLASSPATH=$ROOT_DIR/assembly/osx-x86_64-cpu/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/*
+CLASSPATH=$ROOT_DIR/assembly/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/*
 
 mkdir -p model
 java -Xmx4G -cp $CLASSPATH \
diff --git a/scala-package/examples/scripts/module/run_sequential_module.sh b/scala-package/examples/scripts/module/run_sequential_module.sh
index 9e1f30e3687b..1e40d2dd83f7 100644
--- a/scala-package/examples/scripts/module/run_sequential_module.sh
+++ b/scala-package/examples/scripts/module/run_sequential_module.sh
@@ -18,7 +18,7 @@
 # under the License.
 
 ROOT_DIR=$(cd `dirname $0`/../../..; pwd)
-CLASSPATH=$ROOT_DIR/assembly/linux-x86_64-cpu/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/*
+CLASSPATH=$ROOT_DIR/assembly/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/*
 
 DATA_DIR=$ROOT_DIR/core/data
 
diff --git a/scala-package/examples/scripts/rnn/run_test_charrnn.sh b/scala-package/examples/scripts/rnn/run_test_charrnn.sh
index bf02a1d604ca..a854764fa416 100644
--- a/scala-package/examples/scripts/rnn/run_test_charrnn.sh
+++ b/scala-package/examples/scripts/rnn/run_test_charrnn.sh
@@ -27,7 +27,7 @@ else
 fi
 
 # you can get the training data file using the following command
-# wget http://data.mxnet.io/data/char_lstm.zip
+# curl -O http://data.mxnet.io/data/char_lstm.zip
 # unzip -o char_lstm.zip
 # for example ./datas/obama.txt
 DATA_PATH=$1
diff --git a/scala-package/examples/scripts/rnn/run_train_charrnn.sh b/scala-package/examples/scripts/rnn/run_train_charrnn.sh
index 4877f57855a0..f49eed64b109 100755
--- a/scala-package/examples/scripts/rnn/run_train_charrnn.sh
+++ b/scala-package/examples/scripts/rnn/run_train_charrnn.sh
@@ -28,7 +28,7 @@ fi
 # which gpu card to use, -1 means cpu
 GPU=$1
 # you can get the training data file using the following command
-# wget http://data.mxnet.io/data/char_lstm.zip
+# curl -O http://data.mxnet.io/data/char_lstm.zip
 # unzip -o char_lstm.zip
 # for example ./datas/obama.txt
 DATA_PATH=$2
diff --git a/scala-package/examples/scripts/run_visualization.sh b/scala-package/examples/scripts/run_visualization.sh
index 3e8f5cd784f9..7f5b94fbe2b5 100644
--- a/scala-package/examples/scripts/run_visualization.sh
+++ b/scala-package/examples/scripts/run_visualization.sh
@@ -19,7 +19,7 @@
 
 
 MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-cpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
+CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
 
 # please install the Graphviz library
 # if you are using ubuntu, use the following command:
diff --git a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/objectdetector/README.md b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/objectdetector/README.md
index 681253f39a88..55741024d08b 100644
--- a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/objectdetector/README.md
+++ b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/objectdetector/README.md
@@ -17,8 +17,8 @@ The model is trained on the [Pascal VOC 2012 dataset](http://host.robots.ox.ac.u
 
 ## Prerequisites
 
-1. MXNet
-2. MXNet Scala Package
+1. [Build MXNet](http://mxnet.incubator.apache.org/install/scala_setup.html)
+2. [Build MXNet Scala/Java Package](http://mxnet.incubator.apache.org/install/scala_setup.html)
 3. [IntelliJ IDE (or alternative IDE) project setup](http://mxnet.incubator.apache.org/tutorials/java/mxnet_java_on_intellij.html) with the MXNet Scala/Java Package
-4. wget
+4. curl
 
@@ -64,10 +64,10 @@ The followings is the parameters defined for this example, you can find more inf
 ## How to Run Inference
 After the previous steps, you should be able to run the code using the following script that will pass all of the required parameters to the Infer API.
 
-From the `scala-package/examples/scripts/inferexample/objectdetector/` folder run:
+From the `scala-package/examples/scripts/infer/objectdetector/` folder run:
 
 ```bash
-./run_ssd_example.sh ../models/resnet50_ssd/resnet50_ssd/resnet50_ssd_model ../images/dog.jpg ../images
+./run_ssd_example.sh ../models/resnet50_ssd/resnet50_ssd_model ../images/dog.jpg ../images
 ```
 
 **Notes**:
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala
index f6c283c3dfb2..9f0430eaada6 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala
@@ -19,6 +19,7 @@ package org.apache.mxnetexamples.imclassification
 
 import java.util.concurrent._
 
+import org.apache.mxnet.DType.DType
 import org.apache.mxnetexamples.imclassification.models._
 import org.apache.mxnetexamples.imclassification.util.Trainer
 import org.apache.mxnet._
@@ -42,12 +43,13 @@ object TrainModel {
     * @return The final validation accuracy
     */
   def test(model: String, dataPath: String, numExamples: Int = 60000,
-           numEpochs: Int = 10, benchmark: Boolean = false): Float = {
+           numEpochs: Int = 10, benchmark: Boolean = false,
+           dtype: DType = DType.Float32): Float = {
     ResourceScope.using() {
       val devs = Array(Context.cpu(0))
       val envs: mutable.Map[String, String] = mutable.HashMap.empty[String, String]
       val (dataLoader, net) = dataLoaderAndModel("mnist", model, dataPath,
-        numExamples = numExamples, benchmark = benchmark)
+        numExamples = numExamples, benchmark = benchmark, dtype = dtype)
       val Acc = Trainer.fit(batchSize = 128, numExamples, devs = devs,
         network = net, dataLoader = dataLoader,
         kvStore = "local", numEpochs = numEpochs)
@@ -69,7 +71,7 @@ object TrainModel {
     */
   def dataLoaderAndModel(dataset: String, model: String, dataDir: String = "",
                          numLayers: Int = 50, numExamples: Int = 60000,
-                         benchmark: Boolean = false
+                         benchmark: Boolean = false, dtype: DType = DType.Float32
                         ): ((Int, KVStore) => (DataIter, DataIter), Symbol) = {
     val (imageShape, numClasses) = dataset match {
       case "mnist" => (List(1, 28, 28), 10)
@@ -80,16 +82,17 @@ object TrainModel {
     val List(channels, height, width) = imageShape
     val dataSize: Int = channels * height * width
     val (datumShape, net) = model match {
-      case "mlp" => (List(dataSize), MultiLayerPerceptron.getSymbol(numClasses))
-      case "lenet" => (List(channels, height, width), Lenet.getSymbol(numClasses))
+      case "mlp" => (List(dataSize), MultiLayerPerceptron.getSymbol(numClasses, dtype = dtype))
+      case "lenet" => (List(channels, height, width), Lenet.getSymbol(numClasses, dtype = dtype))
       case "resnet" => (List(channels, height, width), Resnet.getSymbol(numClasses,
-        numLayers, imageShape))
+        numLayers, imageShape, dtype = dtype))
       case _ => throw new Exception("Invalid model name")
     }
 
     val dataLoader: (Int, KVStore) => (DataIter, DataIter) = if (benchmark) {
       (batchSize: Int, kv: KVStore) => {
-        val iter = new SyntheticDataIter(numClasses, batchSize, datumShape, List(), numExamples)
+        val iter = new SyntheticDataIter(numClasses, batchSize, datumShape, List(), numExamples,
+          dtype)
         (iter, iter)
       }
     } else {
@@ -116,8 +119,10 @@ object TrainModel {
         val dataPath = if (inst.dataDir == null) System.getenv("MXNET_HOME")
         else inst.dataDir
 
+        val dtype = DType.withName(inst.dType)
+
         val (dataLoader, net) = dataLoaderAndModel(inst.dataset, inst.network, dataPath,
-          inst.numLayers, inst.numExamples, inst.benchmark)
+          inst.numLayers, inst.numExamples, inst.benchmark, dtype)
 
         val devs =
           if (inst.gpus != null) inst.gpus.split(',').map(id => Context.gpu(id.trim.toInt))
@@ -210,5 +215,8 @@ class TrainModel {
   private val numWorker: Int = 1
   @Option(name = "--num-server", usage = "# of servers")
   private val numServer: Int = 1
+  @Option(name = "--dtype", usage = "data type of the model to train. " +
+    "Can be float32/float64. Works only with synthetic data currently")
+  private val dType: String = "float32"
 }
 
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala
index 9421f1021619..e4d3b2ae7c3e 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala
@@ -24,7 +24,7 @@ import scala.collection.immutable.ListMap
 import scala.util.Random
 
 class SyntheticDataIter(numClasses: Int, val batchSize: Int, datumShape: List[Int],
-                        labelShape: List[Int], maxIter: Int, dtype: DType = DType.Float32
+                        labelShape: List[Int], maxIter: Int, dType: DType = DType.Float32
                        ) extends DataIter {
   var curIter = 0
   val random = new Random()
@@ -35,12 +35,12 @@ class SyntheticDataIter(numClasses: Int, val batchSize: Int, datumShape: List[In
   var label: IndexedSeq[NDArray] = IndexedSeq(
     NDArray.api.random_uniform(Some(0f), Some(maxLabel), shape = Some(batchLabelShape)))
   var data: IndexedSeq[NDArray] = IndexedSeq(
-    NDArray.api.random_uniform(shape = Some(shape)))
+    NDArray.api.random_uniform(shape = Some(shape), dtype = Some(dType.toString)))
 
   val provideDataDesc: IndexedSeq[DataDesc] = IndexedSeq(
-    new DataDesc("data", shape, dtype, Layout.UNDEFINED))
+    new DataDesc("data", shape, data(0).dtype, Layout.UNDEFINED))
   val provideLabelDesc: IndexedSeq[DataDesc] = IndexedSeq(
-    new DataDesc("softmax_label", batchLabelShape, dtype, Layout.UNDEFINED))
+    new DataDesc("softmax_label", batchLabelShape, label(0).dtype, Layout.UNDEFINED))
   val getPad: Int = 0
 
   override def getData(): IndexedSeq[NDArray] = data
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala
index 76fb7bb66022..6f8b138d5ccb 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala
@@ -17,6 +17,7 @@
 
 package org.apache.mxnetexamples.imclassification.models
 
+import org.apache.mxnet.DType.DType
 import org.apache.mxnet._
 
 object Lenet {
@@ -26,8 +27,8 @@ object Lenet {
     * @param numClasses Number of classes to classify into
     * @return model symbol
     */
-  def getSymbol(numClasses: Int): Symbol = {
-    val data = Symbol.Variable("data")
+  def getSymbol(numClasses: Int, dtype: DType = DType.Float32): Symbol = {
+    val data = Symbol.Variable("data", dType = dtype)
     // first conv
     val conv1 = Symbol.api.Convolution(data = Some(data), kernel = Shape(5, 5), num_filter = 20)
     val tanh1 = Symbol.api.tanh(data = Some(conv1))
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala
index 5d880bbe0619..089b65f24a65 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala
@@ -17,6 +17,7 @@
 
 package org.apache.mxnetexamples.imclassification.models
 
+import org.apache.mxnet.DType.DType
 import org.apache.mxnet._
 
 object MultiLayerPerceptron {
@@ -26,8 +27,8 @@ object MultiLayerPerceptron {
     * @param numClasses Number of classes to classify into
     * @return model symbol
     */
-  def getSymbol(numClasses: Int): Symbol = {
-    val data = Symbol.Variable("data")
+  def getSymbol(numClasses: Int, dtype: DType = DType.Float32): Symbol = {
+    val data = Symbol.Variable("data", dType = dtype)
 
     val fc1 = Symbol.api.FullyConnected(data = Some(data), num_hidden = 128, name = "fc1")
     val act1 = Symbol.api.Activation(data = Some(fc1), "relu", name = "relu")
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala
index c3f43d97e898..e5f597680f99 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala
@@ -17,6 +17,7 @@
 
 package org.apache.mxnetexamples.imclassification.models
 
+import org.apache.mxnet.DType.DType
 import org.apache.mxnet._
 
 object Resnet {
@@ -77,13 +78,14 @@ object Resnet {
     */
   def resnet(units: List[Int], numStages: Int, filterList: List[Int], numClasses: Int,
              imageShape: List[Int], bottleNeck: Boolean = true, bnMom: Float = 0.9f,
-             workspace: Int = 256, dtype: String = "float32", memonger: Boolean = false): Symbol = {
+             workspace: Int = 256, dtype: DType = DType.Float32,
+             memonger: Boolean = false): Symbol = {
     assert(units.size == numStages)
     var data = Symbol.Variable("data", shape = Shape(List(4) ::: imageShape), dType = DType.Float32)
-    if (dtype == "float32") {
+    if (dtype == DType.Float32) {
       data = Symbol.api.identity(Some(data), "id")
-    } else if (dtype == "float16") {
-      data = Symbol.api.cast(Some(data), "float16")
+    } else if (dtype == DType.Float16) {
+      data = Symbol.api.cast(Some(data), DType.Float16.toString)
     }
     data = Symbol.api.BatchNorm(Some(data), fix_gamma = Some(true), eps = Some(2e-5),
       momentum = Some(bnMom), name = "bn_data")
@@ -118,8 +120,8 @@ object Resnet {
       kernel = Some(Shape(7, 7)), pool_type = Some("avg"), name = "pool1")
     val flat = Symbol.api.Flatten(Some(pool1))
     var fc1 = Symbol.api.FullyConnected(Some(flat), num_hidden = numClasses, name = "fc1")
-    if (dtype == "float16") {
-      fc1 = Symbol.api.cast(Some(fc1), "float32")
+    if (dtype == DType.Float16) {
+      fc1 = Symbol.api.cast(Some(fc1), DType.Float32.toString)
     }
     Symbol.api.SoftmaxOutput(Some(fc1), name = "softmax")
   }
@@ -134,7 +136,7 @@ object Resnet {
     * @return Model symbol
     */
   def getSymbol(numClasses: Int, numLayers: Int, imageShape: List[Int], convWorkspace: Int = 256,
-                dtype: String = "float32"): Symbol = {
+                dtype: DType = DType.Float32): Symbol = {
     val List(channels, height, width) = imageShape
     val (numStages, units, filterList, bottleNeck): (Int, List[Int], List[Int], Boolean) =
       if (height <= 28) {
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala
index 44025c0459ad..a7e36dfc3a11 100644
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala
+++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala
@@ -29,8 +29,7 @@ import org.slf4j.LoggerFactory
 import scala.sys.process.Process
 
 /**
-  * Integration test for imageClassifier example.
-  * This will run as a part of "make scalatest"
+  * Integration test for CNN example.
   */
 class CNNClassifierExampleSuite extends FunSuite with BeforeAndAfterAll {
   private val logger = LoggerFactory.getLogger(classOf[CNNClassifierExampleSuite])
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala
index 6e9667abe9c0..0daba5a97d77 100644
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala
+++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala
@@ -19,7 +19,7 @@ package org.apache.mxnetexamples.imclassification
 
 import java.io.File
 
-import org.apache.mxnet.Context
+import org.apache.mxnet.{Context, DType}
 import org.apache.mxnetexamples.Util
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
 import org.slf4j.LoggerFactory
@@ -55,9 +55,15 @@ class IMClassificationExampleSuite extends FunSuite with BeforeAndAfterAll {
 
   for(model <- List("mlp", "lenet", "resnet")) {
     test(s"Example CI: Test Image Classification Model ${model}") {
-      var context = Context.cpu()
       val valAccuracy = TrainModel.test(model, "", 10, 1, benchmark = true)
     }
   }
 
+  for(model <- List("mlp", "lenet", "resnet")) {
+    test(s"Example CI: Test Image Classification Model ${model} with Float64 input") {
+      val valAccuracy = TrainModel.test(model, "", 10, 1, benchmark = true,
+        dtype = DType.Float64)
+    }
+  }
+
 }
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala
index 27d9bb4c8fe9..c5308ac37512 100644
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala
+++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala
@@ -27,7 +27,6 @@ import sys.process.Process
 
 /**
   * Integration test for imageClassifier example.
-  * This will run as a part of "make scalatest"
   */
 class ImageClassifierExampleSuite extends FunSuite with BeforeAndAfterAll {
   private val logger = LoggerFactory.getLogger(classOf[ImageClassifierExampleSuite])
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala
index 983978dbaec4..65902c7ad391 100644
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala
+++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala
@@ -25,8 +25,7 @@ import org.scalatest.FunSuite
 
 
 /**
-  * Integration test for imageClassifier example.
-  * This will run as a part of "make scalatest"
+  * Integration test for Multi-task example.
   */
 class MultiTaskSuite extends FunSuite {
   test("Multitask Test") {
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/profiler/ProfilerSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/profiler/ProfilerSuite.scala
index 859a8c06493f..67638b8fac0f 100644
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/profiler/ProfilerSuite.scala
+++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/profiler/ProfilerSuite.scala
@@ -25,8 +25,7 @@ import org.apache.mxnet.Profiler
 import org.apache.mxnet.Context
 
 /**
-  * Integration test for imageClassifier example.
-  * This will run as a part of "make scalatest"
+  * Integration test for profiler example.
   */
 class ProfilerSuite extends FunSuite with BeforeAndAfterAll {
   private val logger = LoggerFactory.getLogger(classOf[ProfilerSuite])
diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml
index fb5cf370a009..13ceebb83cd9 100644
--- a/scala-package/infer/pom.xml
+++ b/scala-package/infer/pom.xml
@@ -2,106 +2,75 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-    <parent>
-        <artifactId>mxnet-parent_2.11</artifactId>
-        <groupId>org.apache.mxnet</groupId>
-        <version>1.5.0-SNAPSHOT</version>
-        <relativePath>../pom.xml</relativePath>
-    </parent>
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <artifactId>mxnet-parent</artifactId>
+    <groupId>org.apache.mxnet</groupId>
+    <version>INTERNAL</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
 
-    <artifactId>mxnet-infer_2.11</artifactId>
-    <name>MXNet Scala Package - Inference</name>
+  <properties>
+    <skipJavaTests>false</skipJavaTests>
+  </properties>
 
-    <properties>
-        <skipTests>true</skipTests>
-        <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
-    </properties>
+  <artifactId>mxnet-infer</artifactId>
+  <name>MXNet Scala Package - Inference</name>
 
-    <profiles>
-        <profile>
-            <id>unittest</id>
-            <properties>
-                <skipTests>false</skipTests>
-            </properties>
-        </profile>
-        <profile>
-            <id>osx-x86_64-cpu</id>
-            <properties>
-                <platform>osx-x86_64-cpu</platform>
-            </properties>
-        </profile>
-        <profile>
-            <id>linux-x86_64-cpu</id>
-            <properties>
-                <platform>linux-x86_64-cpu</platform>
-            </properties>
-        </profile>
-        <profile>
-            <id>linux-x86_64-gpu</id>
-            <properties>
-                <platform>linux-x86_64-gpu</platform>
-            </properties>
-        </profile>
-    </profiles>
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <exclude>META-INF/*.SF</exclude>
+            <exclude>META-INF/*.DSA</exclude>
+            <exclude>META-INF/*.RSA</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.scalatest</groupId>
+        <artifactId>scalatest-maven-plugin</artifactId>
+        <configuration>
+          <argLine>
+            -Djava.library.path=${project.parent.basedir}/native/target \
+            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
+          </argLine>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.scalastyle</groupId>
+        <artifactId>scalastyle-maven-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.mxnet</groupId>
+      <artifactId>mxnet-core</artifactId>
+      <version>INTERNAL</version>
+      <scope>provided</scope>
+    </dependency>
+    <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-all</artifactId>
+      <version>1.10.19</version>
+      <scope>test</scope>
+    </dependency>
 
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-deploy-plugin</artifactId>
-                <configuration>
-                    <skip>true</skip>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-jar-plugin</artifactId>
-                <configuration>
-                    <excludes>
-                        <exclude>META-INF/*.SF</exclude>
-                        <exclude>META-INF/*.DSA</exclude>
-                        <exclude>META-INF/*.RSA</exclude>
-                    </excludes>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-            </plugin>
-            <plugin>
-                <groupId>org.scalatest</groupId>
-                <artifactId>scalatest-maven-plugin</artifactId>
-                <configuration>
-                    <skipTests>${skipTests}</skipTests>
-                    <argLine>
-                        -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
-                        -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
-                    </argLine>
-                    <environmentVariables>
-                        <LD_LIBRARY_PATH>${MXNET_DIR}/lib</LD_LIBRARY_PATH>
-                    </environmentVariables>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.scalastyle</groupId>
-                <artifactId>scalastyle-maven-plugin</artifactId>
-            </plugin>
-        </plugins>
-    </build>
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.mxnet</groupId>
-            <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-            <version>1.5.0-SNAPSHOT</version>
-            <scope>provided</scope>
-        </dependency>
-        <!-- https://mvnrepository.com/artifact/org.mockito/mockito-all -->
-        <dependency>
-            <groupId>org.mockito</groupId>
-            <artifactId>mockito-all</artifactId>
-            <version>1.10.19</version>
-            <scope>test</scope>
-        </dependency>
-    </dependencies>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.11</version>
+      <scope>test</scope>
+    </dependency>
+
+  </dependencies>
 </project>
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
index cf55bc10d97e..bf6581588114 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Classifier.scala
@@ -17,9 +17,10 @@
 
 package org.apache.mxnet.infer
 
-import org.apache.mxnet.{Context, DataDesc, NDArray}
+import org.apache.mxnet._
 import java.io.File
 
+import org.apache.mxnet.MX_PRIMITIVES.MX_PRIMITIVE_TYPE
 import org.slf4j.LoggerFactory
 
 import scala.io
@@ -30,13 +31,13 @@ trait ClassifierBase {
 
   /**
     * Takes an array of floats and returns corresponding (Label, Score) tuples
-    * @param input            Indexed sequence one-dimensional array of floats
+    * @param input            Indexed sequence one-dimensional array of floats/doubles
     * @param topK             (Optional) How many result (sorting based on the last axis)
     *                         elements to return. Default returns unsorted output.
     * @return                 Indexed sequence of (Label, Score) tuples
     */
-  def classify(input: IndexedSeq[Array[Float]],
-               topK: Option[Int] = None): IndexedSeq[(String, Float)]
+  def classify[@specialized (Base.MX_PRIMITIVES) T](input: IndexedSeq[Array[T]],
+               topK: Option[Int] = None): IndexedSeq[(String, T)]
 
   /**
     * Takes a sequence of NDArrays and returns (Label, Score) tuples
@@ -78,17 +79,35 @@ class Classifier(modelPathPrefix: String,
 
   /**
     * Takes flat arrays as input and returns (Label, Score) tuples.
-    * @param input            Indexed sequence one-dimensional array of floats
+    * @param input            Indexed sequence one-dimensional array of floats/doubles
     * @param topK             (Optional) How many result (sorting based on the last axis)
     *                         elements to return. Default returns unsorted output.
     * @return                 Indexed sequence of (Label, Score) tuples
     */
-  override def classify(input: IndexedSeq[Array[Float]],
-                        topK: Option[Int] = None): IndexedSeq[(String, Float)] = {
+  override def classify[@specialized (Base.MX_PRIMITIVES) T](input: IndexedSeq[Array[T]],
+                        topK: Option[Int] = None): IndexedSeq[(String, T)] = {
+
+    // pick the Double or Float implementation based on the first input element
+    val result = input(0)(0) match {
+      case d: Double => {
+        classifyImpl(input.asInstanceOf[IndexedSeq[Array[Double]]], topK)
+      }
+      case _ => {
+        classifyImpl(input.asInstanceOf[IndexedSeq[Array[Float]]], topK)
+      }
+    }
+
+    result.asInstanceOf[IndexedSeq[(String, T)]]
+  }
+
+  private def classifyImpl[B, A <: MX_PRIMITIVE_TYPE]
+  (input: IndexedSeq[Array[B]], topK: Option[Int] = None)(implicit ev: B => A)
+  : IndexedSeq[(String, B)] = {
 
     // considering only the first output
     val predictResult = predictor.predict(input)(0)
-    var result: IndexedSeq[(String, Float)] = IndexedSeq.empty
+
+    var result: IndexedSeq[(String, B)] = IndexedSeq.empty
 
     if (topK.isDefined) {
       val sortedIndex = predictResult.zipWithIndex.sortBy(-_._1).map(_._2).take(topK.get)
@@ -105,7 +124,7 @@ class Classifier(modelPathPrefix: String,
     * @param input            Indexed sequence of NDArrays
     * @param topK             (Optional) How many result (sorting based on the last axis)
     *                         elements to return. Default returns unsorted output.
-    * @return                 Traversable sequence of (Label, Score) tuples
+    * @return                 Traversable sequence of (Label, Score) tuples.
     */
   override def classifyWithNDArray(input: IndexedSeq[NDArray], topK: Option[Int] = None)
   : IndexedSeq[IndexedSeq[(String, Float)]] = {
@@ -113,7 +132,7 @@ class Classifier(modelPathPrefix: String,
     // considering only the first output
     // Copy NDArray to CPU to avoid frequent GPU to CPU copying
     val predictResultND: NDArray =
-      predictor.predictWithNDArray(input)(0).asInContext(Context.cpu())
+    predictor.predictWithNDArray(input)(0).asInContext(Context.cpu())
     // Parallel Execution with ParArray for better performance
     val predictResultPar: ParArray[Array[Float]] =
       new ParArray[Array[Float]](predictResultND.shape(0))
@@ -126,7 +145,6 @@ class Classifier(modelPathPrefix: String,
     })
 
     val predictResult = predictResultPar.toArray
-
     var result: ListBuffer[IndexedSeq[(String, Float)]] =
       ListBuffer.empty[IndexedSeq[(String, Float)]]
 
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala
index 96be12179d42..3c80f9226399 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ImageClassifier.scala
@@ -17,7 +17,8 @@
 
 package org.apache.mxnet.infer
 
-import org.apache.mxnet.{Context, DataDesc, NDArray, Shape}
+import org.apache.mxnet.DType.DType
+import org.apache.mxnet._
 
 import scala.collection.mutable.ListBuffer
 
@@ -70,14 +71,18 @@ class ImageClassifier(modelPathPrefix: String,
     *
     * @param inputImage       Path prefix of the input image
     * @param topK             Number of result elements to return, sorted by probability
+    * @param dType            The precision at which to run the inference.
+    *                         Specify the DType as DType.Float64 for Double precision.
+    *                         Defaults to DType.Float32
     * @return                 List of list of tuples of (Label, Probability)
     */
-  def classifyImage(inputImage: BufferedImage,
-                    topK: Option[Int] = None): IndexedSeq[IndexedSeq[(String, Float)]] = {
+  def classifyImage
+  (inputImage: BufferedImage, topK: Option[Int] = None, dType: DType = DType.Float32):
+  IndexedSeq[IndexedSeq[(String, Float)]] = {
 
     val scaledImage = ImageClassifier.reshapeImage(inputImage, width, height)
     val imageShape = inputShape.drop(1)
-    val pixelsNDArray = ImageClassifier.bufferedImageToPixels(scaledImage, imageShape)
+    val pixelsNDArray = ImageClassifier.bufferedImageToPixels(scaledImage, imageShape, dType)
     val imgWithBatchNum = NDArray.api.expand_dims(pixelsNDArray, 0)
     inputImage.flush()
     scaledImage.flush()
@@ -95,16 +100,19 @@ class ImageClassifier(modelPathPrefix: String,
     *
     * @param inputBatch       Input array of buffered images
     * @param topK             Number of result elements to return, sorted by probability
+    * @param dType            The precision at which to run the inference.
+    *                         Specify the DType as DType.Float64 for Double precision.
+    *                         Defaults to DType.Float32
     * @return                 List of list of tuples of (Label, Probability)
     */
-  def classifyImageBatch(inputBatch: Traversable[BufferedImage], topK: Option[Int] = None):
-  IndexedSeq[IndexedSeq[(String, Float)]] = {
+  def classifyImageBatch(inputBatch: Traversable[BufferedImage], topK: Option[Int] = None,
+   dType: DType = DType.Float32): IndexedSeq[IndexedSeq[(String, Float)]] = {
 
     val inputBatchSeq = inputBatch.toIndexedSeq
     val imageBatch = inputBatchSeq.indices.par.map(idx => {
       val scaledImage = ImageClassifier.reshapeImage(inputBatchSeq(idx), width, height)
       val imageShape = inputShape.drop(1)
-      val imgND = ImageClassifier.bufferedImageToPixels(scaledImage, imageShape)
+      val imgND = ImageClassifier.bufferedImageToPixels(scaledImage, imageShape, dType)
       val imgWithBatch = NDArray.api.expand_dims(imgND, 0).get
       handler.execute(imgND.dispose())
       imgWithBatch
@@ -152,11 +160,29 @@ object ImageClassifier {
     * returned by this method after the use.
     * </p>
     * @param resizedImage     BufferedImage to get pixels from
+    *
     * @param inputImageShape  Input shape; for example for resnet it is (3,224,224).
                               Should be same as inputDescriptor shape.
+    * @param dType            The DataType of the NDArray created from the image
+    *                         that should be returned.
+    *                         Currently it defaults to DType.Float32
     * @return                 NDArray pixels array with shape (3, 224, 224) in CHW format
     */
-  def bufferedImageToPixels(resizedImage: BufferedImage, inputImageShape: Shape): NDArray = {
+  def bufferedImageToPixels(resizedImage: BufferedImage, inputImageShape: Shape,
+                            dType : DType = DType.Float32): NDArray = {
+
+      if (dType == DType.Float64) {
+        val result = getFloatPixelsArray(resizedImage)
+        NDArray.array(result.map(_.toDouble), shape = inputImageShape)
+      }
+      else {
+        val result = getFloatPixelsArray(resizedImage)
+        NDArray.array(result, shape = inputImageShape)
+      }
+  }
+
+  private def getFloatPixelsArray(resizedImage: BufferedImage): Array[Float] = {
+
     // Get height and width of the image
     val w = resizedImage.getWidth()
     val h = resizedImage.getHeight()
@@ -166,7 +192,6 @@ object ImageClassifier {
 
     // 3 times height and width for R,G,B channels
     val result = new Array[Float](3 * h * w)
-
     var row = 0
     // copy pixels to array vertically
     while (row < h) {
@@ -184,11 +209,10 @@ object ImageClassifier {
       }
       row += 1
     }
+
     resizedImage.flush()
 
-    // creating NDArray according to the input shape
-    val pixelsArray = NDArray.array(result, shape = inputImageShape)
-    pixelsArray
+    result
   }
 
   /**
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala
index d4bce9f0d71e..67692a316cc4 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/Predictor.scala
@@ -17,8 +17,9 @@
 
 package org.apache.mxnet.infer
 
+import org.apache.mxnet.MX_PRIMITIVES.MX_PRIMITIVE_TYPE
 import org.apache.mxnet.io.NDArrayIter
-import org.apache.mxnet.{Context, DataDesc, NDArray, Shape}
+import org.apache.mxnet._
 import org.apache.mxnet.module.Module
 
 import scala.collection.mutable.ListBuffer
@@ -36,11 +37,13 @@ private[infer] trait PredictBase {
    * <p>
    * This method will take input as IndexedSeq one dimensional arrays and creates the
    * NDArray needed for inference. The array will be reshaped based on the input descriptors.
-   * @param input:            An IndexedSequence of a one-dimensional array.
+   * @param input:            An Indexed Sequence of a one-dimensional array of datatype
+    *                         Float or Double.
                               An IndexedSequence is needed when the model has more than one input.
    * @return                  Indexed sequence array of outputs
    */
-  def predict(input: IndexedSeq[Array[Float]]): IndexedSeq[Array[Float]]
+  def predict[@specialized (Base.MX_PRIMITIVES) T](input: IndexedSeq[Array[T]])
+  : IndexedSeq[Array[T]]
 
   /**
    * Predict using NDArray as input.
@@ -123,13 +126,13 @@ class Predictor(modelPathPrefix: String,
    * Takes input as IndexedSeq one dimensional arrays and creates the NDArray needed for inference
    * The array will be reshaped based on the input descriptors.
    *
-   * @param input:            An IndexedSequence of a one-dimensional array.
+   * @param input:            An IndexedSequence of a one-dimensional array
+   *                          of data type Float or Double.
                               An IndexedSequence is needed when the model has more than one input.
    * @return                  Indexed sequence array of outputs
    */
-  override def predict(input: IndexedSeq[Array[Float]])
-  : IndexedSeq[Array[Float]] = {
-
+  override def predict[@specialized (Base.MX_PRIMITIVES) T](input: IndexedSeq[Array[T]])
+  : IndexedSeq[Array[T]] = {
     require(input.length == inputDescriptors.length,
       s"number of inputs provided: ${input.length} does not match number of inputs " +
         s"in inputDescriptors: ${inputDescriptors.length}")
@@ -139,12 +142,30 @@ class Predictor(modelPathPrefix: String,
         s"number of elements:${i.length} in the input does not match the shape:" +
           s"${d.shape.toString()}")
     }
+
+    // Infer the dtype of input and call relevant method
+    val result = input(0)(0) match {
+      case d: Double => predictImpl(input.asInstanceOf[IndexedSeq[Array[Double]]])
+      case _ => predictImpl(input.asInstanceOf[IndexedSeq[Array[Float]]])
+    }
+
+    result.asInstanceOf[IndexedSeq[Array[T]]]
+  }
+
+  private def predictImpl[B, A <: MX_PRIMITIVE_TYPE]
+  (input: IndexedSeq[Array[B]])(implicit ev: B => A)
+  : IndexedSeq[Array[B]] = {
+
     var inputND: ListBuffer[NDArray] = ListBuffer.empty[NDArray]
 
     for((i, d) <- input.zip(inputDescriptors)) {
       val shape = d.shape.toVector.patch(from = batchIndex, patch = Vector(1), replaced = 1)
-
-      inputND += mxNetHandler.execute(NDArray.array(i, Shape(shape)))
+      if (d.dtype == DType.Float64) {
+        inputND += mxNetHandler.execute(NDArray.array(i.asInstanceOf[Array[Double]], Shape(shape)))
+      }
+      else {
+        inputND += mxNetHandler.execute(NDArray.array(i.asInstanceOf[Array[Float]], Shape(shape)))
+      }
     }
 
     // rebind with batchsize 1
@@ -158,7 +179,8 @@ class Predictor(modelPathPrefix: String,
     val resultND = mxNetHandler.execute(mod.predict(new NDArrayIter(
       inputND.toIndexedSeq, dataBatchSize = 1)))
 
-    val result = resultND.map((f : NDArray) => f.toArray)
+    val result =
+      resultND.map((f : NDArray) => if (f.dtype == DType.Float64) f.toFloat64Array else f.toArray)
 
     mxNetHandler.execute(inputND.foreach(_.dispose))
     mxNetHandler.execute(resultND.foreach(_.dispose))
@@ -168,9 +190,11 @@ class Predictor(modelPathPrefix: String,
       mxNetHandler.execute(mod.bind(inputDescriptors, forTraining = false, forceRebind = true))
     }
 
-    result
+    result.asInstanceOf[IndexedSeq[Array[B]]]
   }
 
+
+
   /**
    * Predict using NDArray as input
    * This method is useful when the input is a batch of data
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetectorOutput.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetectorOutput.scala
index 13369c8fcef5..5a6ac7599fa9 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetectorOutput.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetectorOutput.scala
@@ -17,18 +17,55 @@
 
 package org.apache.mxnet.infer.javaapi
 
+/**
+  * The ObjectDetectorOutput class is a simple POJO helper class that is used to simplify
+  * the interactions with ObjectDetector predict results. The class stores the bounding box
+  * coordinates, name of predicted class, and the probability.
+  */
+
+
 class ObjectDetectorOutput (className: String, args: Array[Float]){
 
+  /**
+    * Gets the predicted class's name.
+    *
+    * @return       String representing the name of the predicted class
+    */
   def getClassName: String = className
 
+  /**
+    * Gets the probability of the predicted class.
+    *
+    * @return       Float representing the probability of predicted class
+    */
   def getProbability: Float = args(0)
 
+  /**
+    * Gets the minimum X coordinate for the bounding box containing the predicted object.
+    *
+    * @return       Float of the min X coordinate for the object bounding box
+    */
   def getXMin: Float = args(1)
 
+  /**
+    * Gets the maximum X coordinate for the bounding box containing the predicted object.
+    *
+    * @return       Float of the max X coordinate for the object bounding box
+    */
   def getXMax: Float = args(2)
 
+  /**
+    * Gets the minimum Y coordinate for the bounding box containing the predicted object.
+    *
+    * @return       Float of the min Y coordinate for the object bounding box
+    */
   def getYMin: Float = args(3)
 
+  /**
+    * Gets the maximum Y coordinate for the bounding box containing the predicted object.
+    *
+    * @return       Float of the max Y coordinate for the object bounding box
+    */
   def getYMax: Float = args(4)
 
 }
diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala
index 0466693be9bc..146fe93105e4 100644
--- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala
+++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala
@@ -72,6 +72,30 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor)
     predictor.predict(input).toArray
   }
 
+  /**
+    * Takes input as Array of one dimensional arrays and creates the NDArray needed for inference
+    * The array will be reshaped based on the input descriptors. Example of calling in Java:
+    *
+    * <pre>
+    * {@code
+    * double tmp[][] = new double[1][224];
+    * for (int x = 0; x < 1; x++)
+    *   for (int y = 0; y < 224; y++)
+    *     tmp[x][y] = (int)(Math.random()*10);
+    * predictor.predict(tmp);
+    * }
+    * </pre>
+    *
+    * @param input:            An Array of one-dimensional arrays.
+    *                          An extra Array is needed when the model has more than one input.
+    * @return                  Array of one-dimensional arrays of outputs
+    */
+
+  def predict(input: Array[Array[Double]]):
+  Array[Array[Double]] = {
+    predictor.predict(input).toArray
+  }
+
   /**
     * Takes input as List of one dimensional arrays and creates the NDArray needed for inference
     * The array will be reshaped based on the input descriptors.
diff --git a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorOutputTest.java b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorOutputTest.java
new file mode 100644
index 000000000000..04041fcda9bf
--- /dev/null
+++ b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorOutputTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.infer.javaapi;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class ObjectDetectorOutputTest {
+
+    private String predictedClassName = "lion";
+
+    private float delta = 0.00001f;
+
+    @Test
+    public void testConstructor() {
+        // Layout of args, as read by the getters: {probability, xMin, xMax, yMin, yMax}.
+        float[] arr = new float[]{0f, 1f, 2f, 3f, 4f};
+
+        ObjectDetectorOutput odOutput = new ObjectDetectorOutput(predictedClassName, arr);
+        // JUnit's assertEquals signature is (message, expected, actual, delta).
+        Assert.assertEquals(predictedClassName, odOutput.getClassName());
+        Assert.assertEquals("Threshold not matching", 0f, odOutput.getProbability(), delta);
+        Assert.assertEquals("Threshold not matching", 1f, odOutput.getXMin(), delta);
+        Assert.assertEquals("Threshold not matching", 2f, odOutput.getXMax(), delta);
+        Assert.assertEquals("Threshold not matching", 3f, odOutput.getYMin(), delta);
+        Assert.assertEquals("Threshold not matching", 4f, odOutput.getYMax(), delta);
+
+    }
+
+    @Test (expected = ArrayIndexOutOfBoundsException.class)
+    public void testIncompleteArgsConstructor() {
+        // Only indices 0 and 1 exist, so getters reading index 2 and above must throw.
+        float[] arr = new float[]{0f, 1f};
+
+        ObjectDetectorOutput odOutput = new ObjectDetectorOutput(predictedClassName, arr);
+        // JUnit's assertEquals signature is (message, expected, actual, delta).
+        Assert.assertEquals(predictedClassName, odOutput.getClassName());
+        Assert.assertEquals("Threshold not matching", 0f, odOutput.getProbability(), delta);
+        Assert.assertEquals("Threshold not matching", 1f, odOutput.getXMin(), delta);
+
+        // This is where exception will be thrown
+        odOutput.getXMax();
+    }
+}
diff --git a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java
new file mode 100644
index 000000000000..a5e64911d141
--- /dev/null
+++ b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.infer.javaapi;
+
+import org.apache.mxnet.Layout;
+import org.apache.mxnet.javaapi.DType;
+import org.apache.mxnet.javaapi.DataDesc;
+import org.apache.mxnet.javaapi.NDArray;
+import org.apache.mxnet.javaapi.Shape;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.awt.image.BufferedImage;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ObjectDetectorTest {
+
+    private List<DataDesc> inputDesc;
+    private BufferedImage inputImage;
+
+    private List<List<ObjectDetectorOutput>> expectedResult;
+
+    private ObjectDetector objectDetector;
+
+    private int batchSize = 1;
+
+    private int channels = 3;
+
+    private int imageHeight = 512;
+
+    private int imageWidth = 512;
+
+    private String dataName = "data";
+
+    private int topK = 5;
+
+    private String predictedClassName = "lion"; // Random string
+
+    private Shape getTestShape() {
+
+        return new Shape(new int[] {batchSize, channels, imageHeight, imageWidth});
+    }
+
+    @Before
+    public void setUp() {
+
+        inputDesc = new ArrayList<>();
+        inputDesc.add(new DataDesc(dataName, getTestShape(), DType.Float32(), Layout.NCHW()));
+        inputImage = new BufferedImage(imageWidth, imageHeight, BufferedImage.TYPE_INT_RGB);
+        objectDetector = Mockito.mock(ObjectDetector.class);
+        expectedResult = new ArrayList<>();
+        expectedResult.add(new ArrayList<ObjectDetectorOutput>());
+        expectedResult.get(0).add(new ObjectDetectorOutput(predictedClassName, new float[]{}));
+    }
+
+    @Test
+    public void testObjectDetectorWithInputImage() {
+
+        Mockito.when(objectDetector.imageObjectDetect(inputImage, topK)).thenReturn(expectedResult);
+        List<List<ObjectDetectorOutput>> actualResult = objectDetector.imageObjectDetect(inputImage, topK);
+        Mockito.verify(objectDetector, Mockito.times(1)).imageObjectDetect(inputImage, topK);
+        Assert.assertEquals(expectedResult, actualResult);
+    }
+
+
+    @Test
+    public void testObjectDetectorWithBatchImage() {
+
+        List<BufferedImage> batchImage = new ArrayList<>();
+        batchImage.add(inputImage);
+        Mockito.when(objectDetector.imageBatchObjectDetect(batchImage, topK)).thenReturn(expectedResult);
+        List<List<ObjectDetectorOutput>> actualResult = objectDetector.imageBatchObjectDetect(batchImage, topK);
+        Mockito.verify(objectDetector, Mockito.times(1)).imageBatchObjectDetect(batchImage, topK);
+        Assert.assertEquals(expectedResult, actualResult);
+    }
+
+    @Test
+    public void testObjectDetectorWithNDArrayInput() {
+        // Stub, call and verify all use topK so the stub cannot silently stop matching.
+        NDArray inputArr = ObjectDetector.bufferedImageToPixels(inputImage, getTestShape());
+        List<NDArray> inputL = new ArrayList<>();
+        inputL.add(inputArr);
+        Mockito.when(objectDetector.objectDetectWithNDArray(inputL, topK)).thenReturn(expectedResult);
+        List<List<ObjectDetectorOutput>> actualResult = objectDetector.objectDetectWithNDArray(inputL, topK);
+        Mockito.verify(objectDetector, Mockito.times(1)).objectDetectWithNDArray(inputL, topK);
+        Assert.assertEquals(expectedResult, actualResult);
+    }
+}
diff --git a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java
new file mode 100644
index 000000000000..e7a6c9652346
--- /dev/null
+++ b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.infer.javaapi;
+
+import org.apache.mxnet.javaapi.Context;
+import org.apache.mxnet.javaapi.NDArray;
+import org.apache.mxnet.javaapi.Shape;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class PredictorTest {
+
+    Predictor mockPredictor;
+
+    @Before
+    public void setUp() {
+        mockPredictor = Mockito.mock(Predictor.class);
+    }
+
+    @Test
+    public void testPredictWithFloatArray() {
+
+        float tmp[][] = new float[1][224];
+        for (int x = 0; x < 1; x++) {
+            for (int y = 0; y < 224; y++)
+                tmp[x][y] = (int) (Math.random() * 10);
+        }
+
+        float [][] expectedResult = new float[][] {{1f, 2f}};
+        Mockito.when(mockPredictor.predict(tmp)).thenReturn(expectedResult);
+        float[][] actualResult = mockPredictor.predict(tmp);
+
+        Mockito.verify(mockPredictor, Mockito.times(1)).predict(tmp);
+        Assert.assertArrayEquals(expectedResult, actualResult);
+    }
+
+    @Test
+    public void testPredictWithNDArray() {
+
+        float[] tmpArr = new float[224];
+            for (int y = 0; y < 224; y++)
+                tmpArr[y] = (int) (Math.random() * 10);
+
+        NDArray arr = new org.apache.mxnet.javaapi.NDArray(tmpArr, new Shape(new int[] {1, 1, 1, 224}), new Context("cpu", 0));
+
+        List<NDArray> inputList = new ArrayList<>();
+        inputList.add(arr);
+
+        NDArray expected = new NDArray(tmpArr, new Shape(new int[] {1, 1, 1, 224}), new Context("cpu", 0));
+        List<NDArray> expectedResult = new ArrayList<>();
+        expectedResult.add(expected);
+
+        Mockito.when(mockPredictor.predictWithNDArray(inputList)).thenReturn(expectedResult);
+
+        List<NDArray> actualOutput = mockPredictor.predictWithNDArray(inputList);
+
+        Mockito.verify(mockPredictor, Mockito.times(1)).predictWithNDArray(inputList);
+
+        Assert.assertEquals(expectedResult, actualOutput);
+    }
+
+    @Test
+    public void testPredictWithListOfFloatsAsInput() {
+        List<List<Float>> input = new ArrayList<>();
+
+        input.add(Arrays.asList(new Float[] {1f, 2f}));
+
+        List<List<Float>> expectedOutput = new ArrayList<>(input);
+
+        Mockito.when(mockPredictor.predict(input)).thenReturn(expectedOutput);
+
+        List<List<Float>> actualOutput = mockPredictor.predict(input);
+
+        Mockito.verify(mockPredictor, Mockito.times(1)).predict(input);
+
+        Assert.assertEquals(expectedOutput, actualOutput);
+
+    }
+}
\ No newline at end of file
diff --git a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala
index b28aeba1deed..d9ccec468791 100644
--- a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala
+++ b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala
@@ -22,7 +22,7 @@ import java.nio.file.{Files, Paths}
 import java.util
 
 import org.apache.mxnet.module.Module
-import org.apache.mxnet.{Context, DataDesc, NDArray, Shape}
+import org.apache.mxnet.{Context, DType, DataDesc, NDArray, Shape}
 import org.mockito.Matchers._
 import org.mockito.Mockito
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
@@ -127,6 +127,29 @@ class ClassifierSuite extends FunSuite with BeforeAndAfterAll {
 
   }
 
+  test("ClassifierSuite-flatFloat64Array-topK") {
+    val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2)))
+    val inputData = Array.fill[Double](12)(1d)
+
+    val predictResult : IndexedSeq[Array[Double]] =
+      IndexedSeq[Array[Double]](Array(.98d, 0.97d, 0.96d, 0.99d))
+
+    val testClassifier = new MyClassifier(modelPath, inputDescriptor)
+
+    Mockito.doReturn(predictResult).when(testClassifier.predictor)
+      .predict(any(classOf[IndexedSeq[Array[Double]]]))
+
+    val result: IndexedSeq[(String, Double)] = testClassifier.
+      classify(IndexedSeq(inputData), topK = Some(10))
+
+    assert((result(0)_2).getClass == 1d.getClass)
+
+    assertResult(predictResult(0).sortBy(-_)) {
+      result.map(_._2).toArray
+    }
+
+  }
+
   test("ClassifierSuite-flatArrayInput") {
     val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2)))
     val inputData = Array.fill[Float](12)(1)
@@ -147,6 +170,28 @@ class ClassifierSuite extends FunSuite with BeforeAndAfterAll {
     }
   }
 
+  test("ClassifierSuite-flatArrayFloat64Input") {
+    val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2)))
+    val inputData = Array.fill[Double](12)(1d)
+
+    val predictResult : IndexedSeq[Array[Double]] =
+      IndexedSeq[Array[Double]](Array(.98d, 0.97d, 0.96d, 0.99d))
+
+    val testClassifier = new MyClassifier(modelPath, inputDescriptor)
+
+    Mockito.doReturn(predictResult).when(testClassifier.predictor)
+      .predict(any(classOf[IndexedSeq[Array[Double]]]))
+
+    val result: IndexedSeq[(String, Double)] = testClassifier.
+      classify(IndexedSeq(inputData))
+
+    assert((result(0)_2).getClass == 1d.getClass)
+
+    assertResult(predictResult(0)) {
+      result.map(_._2).toArray
+    }
+  }
+
   test("ClassifierSuite-NDArray1InputWithoutTopK") {
     val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2)))
     val inputDataShape = Shape(1, 3, 2, 2)
diff --git a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ImageClassifierSuite.scala b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ImageClassifierSuite.scala
index 1c291e1e7b3c..5198c4a1f309 100644
--- a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ImageClassifierSuite.scala
+++ b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ImageClassifierSuite.scala
@@ -68,6 +68,10 @@ class ImageClassifierSuite extends ClassifierSuite with BeforeAndAfterAll {
     val result = ImageClassifier.bufferedImageToPixels(image2, Shape(3, 2, 2))
 
     assert(result.shape == inputDescriptor(0).shape.drop(1))
+    assert(result.dtype == DType.Float32)
+
+    val resultFloat64 = ImageClassifier.bufferedImageToPixels(image2, Shape(3, 2, 2), DType.Float64)
+    assert(resultFloat64.dtype == DType.Float64)
   }
 
   test("ImageClassifierSuite-testWithInputImage") {
@@ -106,8 +110,10 @@ class ImageClassifierSuite extends ClassifierSuite with BeforeAndAfterAll {
         predictResult(i).map(_._2).toArray
       }
     }
+
   }
 
+
   test("ImageClassifierSuite-testWithInputBatchImage") {
     val dType = DType.Float32
     val inputDescriptor = IndexedSeq[DataDesc](new DataDesc(modelPath, Shape(1, 3, 512, 512),
@@ -152,4 +158,5 @@ class ImageClassifierSuite extends ClassifierSuite with BeforeAndAfterAll {
       }
     }
   }
+
 }
diff --git a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/PredictorSuite.scala b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/PredictorSuite.scala
index 509ffb35db8d..9afbc9b3d4a8 100644
--- a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/PredictorSuite.scala
+++ b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/PredictorSuite.scala
@@ -19,7 +19,7 @@ package org.apache.mxnet.infer
 
 import org.apache.mxnet.io.NDArrayIter
 import org.apache.mxnet.module.{BaseModule, Module}
-import org.apache.mxnet.{DataDesc, Layout, NDArray, Shape}
+import org.apache.mxnet._
 import org.mockito.Matchers._
 import org.mockito.Mockito
 import org.scalatest.{BeforeAndAfterAll, FunSuite}
@@ -91,6 +91,36 @@ class PredictorSuite extends FunSuite with BeforeAndAfterAll {
       , any[Option[BaseModule]], any[String])
   }
 
+  test("PredictorSuite-testWithFlatFloat64Arrays") {
+
+    val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2),
+      layout = Layout.NCHW, dtype = DType.Float64))
+    val inputData = Array.fill[Double](12)(1d)
+
+    // this will be disposed at the end of the predict call on Predictor.
+    val predictResult = IndexedSeq(NDArray.ones(Shape(1, 3, 2, 2), dtype = DType.Float64))
+
+    val testPredictor = new MyPredictor("xyz", inputDescriptor)
+
+    Mockito.doReturn(predictResult).when(testPredictor.mockModule)
+      .predict(any(classOf[NDArrayIter]), any[Int], any[Boolean])
+
+    val testFun = testPredictor.predict(IndexedSeq(inputData))
+
+    assert(testFun.size == 1, "output size should be 1 ")
+
+    assert(testFun(0)(0).getClass == 1d.getClass)
+
+    assert(Array.fill[Double](12)(1d).mkString == testFun(0).mkString)
+
+    // Verify that the module was bound with batch size 1 and rebound back to the original
+    // input descriptor. the number of times is twice here because loadModule overrides the
+    // initial bind.
+    Mockito.verify(testPredictor.mockModule, Mockito.times(2)).bind(any[IndexedSeq[DataDesc]],
+      any[Option[IndexedSeq[DataDesc]]], any[Boolean], any[Boolean], any[Boolean]
+      , any[Option[BaseModule]], any[String])
+  }
+
   test("PredictorSuite-testWithNDArray") {
     val inputDescriptor = IndexedSeq[DataDesc](new DataDesc("data", Shape(2, 3, 2, 2),
       layout = Layout.NCHW))
diff --git a/scala-package/init-native/linux-x86_64/pom.xml b/scala-package/init-native/linux-x86_64/pom.xml
deleted file mode 100644
index 242f2f3d5626..000000000000
--- a/scala-package/init-native/linux-x86_64/pom.xml
+++ /dev/null
@@ -1,130 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>libmxnet-init-scala-linux-x86_64</artifactId>
-  <name>MXNet Scala Package - Initializer Native Linux-x86_64</name>
-  <url>http://maven.apache.org</url>
-
-  <packaging>so</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jar</type>
-      <scope>compile</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>native-maven-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <!--  trigger javah -->
-          <javahOS>linux</javahOS>
-          <compilerProvider>generic-classic</compilerProvider>
-          <compilerExecutable>${cxx}</compilerExecutable>
-          <linkerExecutable>${cxx}</linkerExecutable>
-          <sources>
-            <source>
-              <directory>../src/main/native</directory>
-              <fileNames>
-                <fileName>org_apache_mxnet_init_native_c_api.cc</fileName>
-              </fileNames>
-            </source>
-          </sources>
-          <compilerStartOptions>
-            <compilerStartOption>-std=c++0x</compilerStartOption>
-          </compilerStartOptions>
-          <compilerEndOptions>
-            <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
-            <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
-            <compilerEndOption>-O3 -DNDEBUG=1 -fPIC -msse3 -mf16c</compilerEndOption>
-            <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
-          </compilerEndOptions>
-          <linkerStartOptions>
-            <linkerStartOption>-shared</linkerStartOption>
-          </linkerStartOptions>
-          <linkerMiddleOptions>
-            <linkerMiddleOption>-Wl,--whole-archive</linkerMiddleOption>
-            <linkerMiddleOption>-Wl,--no-whole-archive -pthread -lm -fopenmp -lrt</linkerMiddleOption>
-          </linkerMiddleOptions>
-          <linkerEndOptions>
-            <linkerEndOption>-Wl,-rpath=${dollar}ORIGIN -lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
-          </linkerEndOptions>
-        </configuration>
-
-        <executions>
-          <execution>
-            <id>javah</id>
-            <phase>generate-sources</phase>
-            <configuration>
-              <javahProvider>default</javahProvider>
-              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
-              <workingDirectory>${basedir}</workingDirectory>
-              <javahOutputFileName>org_apache_mxnet_init_native_c_api.h</javahOutputFileName>
-              <javahClassNames>
-                <javahClassName>org.apache.mxnet.init.LibInfo</javahClassName>
-              </javahClassNames>
-            </configuration>
-            <goals>
-              <goal>javah</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>exec-maven-plugin</artifactId>
-        <version>1.6.0</version>
-        <executions>
-          <execution>
-            <id>link-native-lib</id>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>exec</goal>
-            </goals>
-            <configuration>
-              <executable>ln</executable>
-              <commandlineArgs>-sf ${MXNET_DIR}/lib/libmxnet.so ${project.build.directory}/libmxnet.so</commandlineArgs>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/init-native/osx-x86_64/pom.xml b/scala-package/init-native/osx-x86_64/pom.xml
deleted file mode 100644
index 12f4d800eba4..000000000000
--- a/scala-package/init-native/osx-x86_64/pom.xml
+++ /dev/null
@@ -1,142 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-scala-init-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>libmxnet-init-scala-osx-x86_64</artifactId>
-  <name>MXNet Scala Package - Initializer Native OSX-x86_64</name>
-  <url>http://maven.apache.org</url>
-
-  <packaging>jnilib</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jar</type>
-      <scope>compile</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>native-maven-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <!--  trigger javah -->
-          <javahOS>darwin</javahOS>
-          <compilerProvider>generic-classic</compilerProvider>
-          <compilerExecutable>${cxx}</compilerExecutable>
-          <linkerExecutable>${cxx}</linkerExecutable>
-          <sources>
-            <source>
-              <directory>../src/main/native</directory>
-              <fileNames>
-                <fileName>org_apache_mxnet_init_native_c_api.cc</fileName>
-              </fileNames>
-            </source>
-          </sources>
-          <compilerStartOptions>
-            <compilerStartOption>-std=c++0x</compilerStartOption>
-          </compilerStartOptions>
-          <compilerEndOptions>
-            <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
-            <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
-            <compilerEndOption>-g -O0 -fPIC -msse3 -mf16c</compilerEndOption>
-            <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
-          </compilerEndOptions>
-          <linkerStartOptions>
-            <linkerStartOption>-shared</linkerStartOption>
-          </linkerStartOptions>
-          <linkerMiddleOptions>
-            <linkerMiddleOption>-framework JavaVM</linkerMiddleOption>
-            <linkerMiddleOption>-Wl,-exported_symbol,_Java_*</linkerMiddleOption>
-            <linkerMiddleOption>-Wl,-x</linkerMiddleOption>
-          </linkerMiddleOptions>
-          <linkerEndOptions>
-            <linkerEndOption>-lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
-          </linkerEndOptions>
-        </configuration>
-
-        <executions>
-          <execution>
-            <id>javah</id>
-            <phase>generate-sources</phase>
-            <configuration>
-              <javahProvider>default</javahProvider>
-              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
-              <workingDirectory>${basedir}</workingDirectory>
-              <javahOutputFileName>org_apache_mxnet_init_native_c_api.h</javahOutputFileName>
-              <javahClassNames>
-                <javahClassName>org.apache.mxnet.init.LibInfo</javahClassName>
-              </javahClassNames>
-            </configuration>
-            <goals>
-              <goal>javah</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>exec-maven-plugin</artifactId>
-        <version>1.6.0</version>
-        <executions>
-          <execution>
-            <id>post-native-build</id>
-            <phase>package</phase>
-            <goals>
-              <goal>exec</goal>
-            </goals>
-            <configuration>
-              <executable>install_name_tool</executable>
-              <commandlineArgs>-change lib/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
-            </configuration>
-          </execution>
-          <execution>
-            <id>link-native-lib</id>
-            <phase>generate-resources</phase>
-            <goals>
-              <goal>exec</goal>
-            </goals>
-            <configuration>
-              <executable>ln</executable>
-              <commandlineArgs>-sf ${MXNET_DIR}/lib/libmxnet.so ${project.build.directory}/libmxnet.so</commandlineArgs>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/init-native/pom.xml b/scala-package/init-native/pom.xml
index bed216e45035..1721f8cbd403 100644
--- a/scala-package/init-native/pom.xml
+++ b/scala-package/init-native/pom.xml
@@ -5,46 +5,170 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-scala-init-native-parent</artifactId>
-  <name>MXNet Scala Package - Initializer Native Parent</name>
-  <packaging>pom</packaging>
+  <artifactId>libmxnet-init-scala</artifactId>
+  <name>MXNet Scala Package - Initializer Native</name>
+
+  <properties>
+    <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
+  </properties>
+
+  <packaging>${libtype}</packaging>
 
   <profiles>
     <profile>
-      <id>osx-x86_64-cpu</id>
-      <modules>
-        <module>osx-x86_64</module>
-      </modules>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <modules>
-        <module>linux-x86_64</module>
-      </modules>
+      <id>osx-x86_64</id>
+      <activation>
+        <os><family>mac</family></os>
+      </activation>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>native-maven-plugin</artifactId>
+            <extensions>true</extensions>
+            <configuration>
+              <javahOS>darwin</javahOS>
+              <compilerProvider>generic-classic</compilerProvider>
+              <compilerExecutable>${cxx}</compilerExecutable>
+              <linkerExecutable>${cxx}</linkerExecutable>
+              <sources>
+                <source>
+                  <directory>src/main/native</directory>
+                  <fileNames>
+                    <fileName>org_apache_mxnet_init_native_c_api.cc</fileName>
+                  </fileNames>
+                </source>
+              </sources>
+              <compilerStartOptions>
+                <compilerStartOption>-std=c++0x</compilerStartOption>
+              </compilerStartOptions>
+              <compilerEndOptions>
+                <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
+                <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
+                <compilerEndOption>-g -O0 -fPIC -msse3 -mf16c</compilerEndOption>
+                <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
+              </compilerEndOptions>
+              <linkerStartOptions>
+                <linkerStartOption>-shared</linkerStartOption>
+              </linkerStartOptions>
+              <linkerMiddleOptions>
+                <linkerMiddleOption>-framework JavaVM</linkerMiddleOption>
+                <linkerMiddleOption>-Wl,-exported_symbol,_Java_*</linkerMiddleOption>
+                <linkerMiddleOption>-Wl,-x</linkerMiddleOption>
+              </linkerMiddleOptions>
+              <linkerEndOptions>
+                <linkerEndOption>-lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
+              </linkerEndOptions>
+            </configuration>
+          </plugin>
+
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <version>1.6.0</version>
+            <executions>
+              <execution>
+                <id>post-native-build</id>
+                <phase>package</phase>
+                <goals>
+                  <goal>exec</goal>
+                </goals>
+                <configuration>
+                  <executable>install_name_tool</executable>
+                  <commandlineArgs>-add_rpath @loader_path ${project.build.directory}/${project.artifactId}.jnilib</commandlineArgs>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
     </profile>
     <profile>
-      <id>linux-x86_64-gpu</id>
-      <modules>
-        <module>linux-x86_64</module>
-      </modules>
+      <id>linux-x86_64</id>
+      <activation>
+        <os>
+          <family>unix</family>
+          <name>Linux</name>
+        </os>
+      </activation>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>native-maven-plugin</artifactId>
+            <extensions>true</extensions>
+            <configuration>
+              <javahOS>linux</javahOS>
+              <compilerProvider>generic-classic</compilerProvider>
+              <compilerExecutable>${cxx}</compilerExecutable>
+              <linkerExecutable>${cxx}</linkerExecutable>
+              <sources>
+                <source>
+                  <directory>src/main/native</directory>
+                  <fileNames>
+                    <fileName>org_apache_mxnet_init_native_c_api.cc</fileName>
+                  </fileNames>
+                </source>
+              </sources>
+              <compilerStartOptions>
+                <compilerStartOption>-std=c++0x</compilerStartOption>
+              </compilerStartOptions>
+              <compilerEndOptions>
+                <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
+                <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
+                <compilerEndOption>-O3 -DNDEBUG=1 -fPIC -msse3 -mf16c</compilerEndOption>
+                <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
+              </compilerEndOptions>
+              <linkerStartOptions>
+                <linkerStartOption>-shared</linkerStartOption>
+              </linkerStartOptions>
+              <linkerMiddleOptions>
+                <linkerMiddleOption>-Wl,--whole-archive</linkerMiddleOption>
+                <linkerMiddleOption>-Wl,--no-whole-archive -pthread -lm -fopenmp -lrt</linkerMiddleOption>
+              </linkerMiddleOptions>
+              <linkerEndOptions>
+                <linkerEndOption>-Wl,-rpath=${dollar}ORIGIN -lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
+              </linkerEndOptions>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
     </profile>
   </profiles>
 
   <build>
     <plugins>
       <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>1.6.0</version>
+        <executions>
+          <execution>
+            <id>link-native-lib</id>
+            <phase>generate-resources</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration> <!-- symlink every file in MXNet lib/ into target/; paths are quoted so spaces survive -->
+              <executable>bash</executable>
+              <commandlineArgs>-c 'mkdir -p "${project.build.directory}" &amp;&amp; ln -sf "${MXNET_DIR}"/lib/* "${project.build.directory}/"'</commandlineArgs>
+            </configuration>
+          </execution>
+        </executions>
       </plugin>
     </plugins>
   </build>
-
 </project>
diff --git a/scala-package/init-native/src/main/native/org_apache_mxnet_init_native_c_api.h b/scala-package/init-native/src/main/native/org_apache_mxnet_init_native_c_api.h
new file mode 100644
index 000000000000..6ff6ae6a107c
--- /dev/null
+++ b/scala-package/init-native/src/main/native/org_apache_mxnet_init_native_c_api.h
@@ -0,0 +1,45 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_mxnet_init_LibInfo */
+
+#ifndef _Included_org_apache_mxnet_init_LibInfo
+#define _Included_org_apache_mxnet_init_LibInfo
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_apache_mxnet_init_LibInfo
+ * Method:    mxSymbolListAtomicSymbolCreators
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_init_LibInfo_mxSymbolListAtomicSymbolCreators
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_init_LibInfo
+ * Method:    mxSymbolGetAtomicSymbolInfo
+ * Signature: (JLorg/apache/mxnet/init/Base/RefString;Lorg/apache/mxnet/init/Base/RefString;Lorg/apache/mxnet/init/Base/RefInt;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/init/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_init_LibInfo_mxSymbolGetAtomicSymbolInfo
+  (JNIEnv *, jobject, jlong, jobject, jobject, jobject, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_init_LibInfo
+ * Method:    mxListAllOpNames
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_init_LibInfo_mxListAllOpNames
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_init_LibInfo
+ * Method:    nnGetOpHandle
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/init/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_init_LibInfo_nnGetOpHandle
+  (JNIEnv *, jobject, jstring, jobject);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml
index 4278df6f2e73..a0bb6be384b5 100644
--- a/scala-package/init/pom.xml
+++ b/scala-package/init/pom.xml
@@ -5,65 +5,62 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-<!--  <relativePath>../pom.xml</relativePath>-->
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
+    <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-init_2.11</artifactId>
+  <artifactId>mxnet-scala-init</artifactId>
   <name>MXNet Scala Package - Initializer</name>
 
-  <profiles>
-    <profile>
-      <id>osx-x86_64-cpu</id>
-      <properties>
-        <platform>osx-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <properties>
-        <platform>linux-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <properties>
-        <platform>linux-x86_64-gpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>apache-release</id>
-   <!--Running the compile-backend inside a different profile did not work when used with apache-release profile for release-perform-->
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>exec-maven-plugin</artifactId>
-            <version>1.6.0</version>
-            <executions>
-              <execution>
-                <id>compile-mxnet-backend</id>
-                <phase>compile</phase>
-                <goals>
-                  <goal>exec</goal>
-                </goals>
-                <configuration>
-                  <executable>bash</executable>
-                  <commandlineArgs>${project.parent.basedir}/dev/compile-mxnet-backend.sh ${build.platform} ${project.parent.basedir}/../</commandlineArgs>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-deploy-plugin</artifactId>
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>native-maven-plugin</artifactId>
+        <extensions>true</extensions>
+        <executions>
+          <execution>
+            <id>javah</id>
+            <phase>verify</phase>
             <configuration>
-              <skip>true</skip>
+              <javahProvider>default</javahProvider>
+              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
+              <workingDirectory>${basedir}</workingDirectory>
+              <javahOutputFileName>org_apache_mxnet_init_native_c_api.h</javahOutputFileName>
+              <javahClassNames>
+                <javahClassName>org.apache.mxnet.init.LibInfo</javahClassName>
+              </javahClassNames>
             </configuration>
-          </plugin>
-        </plugins>
-      </build>
-      </profile>
-  </profiles>
+            <goals>
+              <goal>javah</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>1.6.0</version>
+        <executions>
+          <execution>
+            <id>verify-javah</id>
+            <phase>verify</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>diff</executable>
+              <commandlineArgs>
+                ${project.build.directory}/custom-javah/org_apache_mxnet_init_native_c_api.h
+                ${project.parent.basedir}/init-native/src/main/native/org_apache_mxnet_init_native_c_api.h
+              </commandlineArgs>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
 </project>
diff --git a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
index 7402dbd3bc1d..b5a6286af1b6 100644
--- a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
+++ b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
@@ -17,6 +17,8 @@
 
 package org.apache.mxnet.init
 
+import java.io.File
+
 object Base {
   tryLoadInitLibrary()
   val _LIB = new LibInfo
@@ -37,18 +39,22 @@ object Base {
 
   @throws(classOf[UnsatisfiedLinkError])
   private def tryLoadInitLibrary(): Unit = {
-    var baseDir = System.getProperty("user.dir") + "/init-native"
-    // TODO(lanKing520) Update this to use relative path to the MXNet director.
-    // TODO(lanking520) baseDir = sys.env("MXNET_BASEDIR") + "/scala-package/init-native"
-    if (System.getenv().containsKey("MXNET_BASEDIR")) {
-      baseDir = sys.env("MXNET_BASEDIR")
+    val userDir : File = new File(System.getProperty("user.dir"))  // JVM working directory
+    var nativeDir : File = new File(userDir, "init-native")
+    if (!nativeDir.exists()) {
+      nativeDir = new File(userDir.getParent, "init-native")  // retry one level up
+      if (!nativeDir.exists()) {
+        throw new IllegalStateException("scala-init should be executed inside scala-package folder")
+      }
     }
+    val baseDir = nativeDir.getAbsolutePath
+
     val os = System.getProperty("os.name")
     // ref: http://lopica.sourceforge.net/os.html
     if (os.startsWith("Linux")) {
-      System.load(s"$baseDir/linux-x86_64/target/libmxnet-init-scala-linux-x86_64.so")
+      System.load(s"$baseDir/target/libmxnet-init-scala.so")
     } else if (os.startsWith("Mac")) {
-      System.load(s"$baseDir/osx-x86_64/target/libmxnet-init-scala-osx-x86_64.jnilib")
+      System.load(s"$baseDir/target/libmxnet-init-scala.jnilib")
     } else {
       // TODO(yizhi) support windows later
       throw new UnsatisfiedLinkError()
diff --git a/scala-package/macros/pom.xml b/scala-package/macros/pom.xml
index cd56060b4b36..52dfde181d72 100644
--- a/scala-package/macros/pom.xml
+++ b/scala-package/macros/pom.xml
@@ -5,63 +5,20 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-macros_2.11</artifactId>
+  <artifactId>mxnet-macros</artifactId>
   <name>MXNet Scala Package - Macros</name>
 
-  <profiles>
-    <profile>
-      <id>unittest</id>
-      <properties>
-        <skiptest>false</skiptest>
-      </properties>
-    </profile>
-    <profile>
-      <id>integrationtest</id>
-      <properties>
-        <skiptest>true</skiptest>
-      </properties>
-    </profile>
-    <profile>
-      <id>osx-x86_64-cpu</id>
-      <properties>
-        <platform>osx-x86_64</platform>
-        <libtype>jnilib</libtype>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <properties>
-        <platform>linux-x86_64</platform>
-        <libtype>so</libtype>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <properties>
-        <platform>linux-x86_64</platform>
-        <libtype>so</libtype>
-      </properties>
-    </profile>
-  </profiles>
-
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-init_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <scope>provided</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>libmxnet-init-scala-${platform}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-scala-init</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
-      <type>${libtype}</type>
     </dependency>
     <dependency>
       <groupId>commons-io</groupId>
@@ -70,16 +27,8 @@
     </dependency>
   </dependencies>
 
-
   <build>
     <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
@@ -95,39 +44,15 @@
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>
       </plugin>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>exec-maven-plugin</artifactId>
-        <version>1.6.0</version>
-        <executions>
-          <execution>
-            <id>apidoc-generation</id>
-            <phase>package</phase>
-            <goals>
-              <goal>java</goal>
-            </goals>
-          </execution>
-        </executions>
-        <configuration>
-          <additionalClasspathElements>
-            <additionalClasspathElement>${project.parent.basedir}/init/target/classes</additionalClasspathElement>
-          </additionalClasspathElements>
-          <arguments>
-            <argument>${project.parent.basedir}/core/src/main/scala/org/apache/mxnet/</argument>
-          </arguments>
-          <mainClass>org.apache.mxnet.APIDocGenerator</mainClass>
-        </configuration>
-      </plugin>
       <plugin>
         <groupId>org.scalatest</groupId>
         <artifactId>scalatest-maven-plugin</artifactId>
         <configuration>
-          <skipTests>${skiptest}</skipTests>
           <environmentVariables>
             <MXNET_BASEDIR>${project.parent.basedir}/init-native</MXNET_BASEDIR>
           </environmentVariables>
           <argLine>
-            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
+            -Djava.library.path=${project.parent.basedir}/native/target \
             -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
           </argLine>
         </configuration>
@@ -138,5 +63,4 @@
       </plugin>
     </plugins>
   </build>
-
 </project>
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
index ce12dc7cd5a0..ede16f73d2a1 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
@@ -27,13 +27,15 @@ import scala.collection.mutable.ListBuffer
   * Two file namely: SymbolAPIBase.scala and NDArrayAPIBase.scala
   * The code will be executed during Macros stage and file live in Core stage
   */
-private[mxnet] object APIDocGenerator extends GeneratorBase {
+private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers {
 
   def main(args: Array[String]): Unit = {
     val FILE_PATH = args(0)
     val hashCollector = ListBuffer[String]()
     hashCollector += typeSafeClassGen(FILE_PATH, true)
     hashCollector += typeSafeClassGen(FILE_PATH, false)
+    hashCollector += typeSafeRandomClassGen(FILE_PATH, true)
+    hashCollector += typeSafeRandomClassGen(FILE_PATH, false)
     hashCollector += nonTypeSafeClassGen(FILE_PATH, true)
     hashCollector += nonTypeSafeClassGen(FILE_PATH, false)
     hashCollector += javaClassGen(FILE_PATH)
@@ -57,8 +59,27 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
 
     writeFile(
       FILE_PATH,
+      "package org.apache.mxnet",
       if (isSymbol) "SymbolAPIBase" else "NDArrayAPIBase",
+      "import org.apache.mxnet.annotation.Experimental",
+      generated)
+  }
+
+  def typeSafeRandomClassGen(FILE_PATH: String, isSymbol: Boolean): String = {
+    val generated = typeSafeRandomFunctionsToGenerate(isSymbol)
+      .map { func =>
+        val scalaDoc = generateAPIDocFromBackend(func)
+        val typeParameter = randomGenericTypeSpec(isSymbol, false)
+        val decl = generateAPISignature(func, isSymbol, typeParameter)
+        s"$scalaDoc\n$decl"
+      }
+
+    writeFile(
+      FILE_PATH,
       "package org.apache.mxnet",
+      if (isSymbol) "SymbolRandomAPIBase" else "NDArrayRandomAPIBase",
+      """import org.apache.mxnet.annotation.Experimental
+        |import scala.reflect.ClassTag""".stripMargin,
       generated)
   }
 
@@ -85,8 +106,9 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
 
     writeFile(
       FILE_PATH,
-      if (isSymbol) "SymbolBase" else "NDArrayBase",
       "package org.apache.mxnet",
+      if (isSymbol) "SymbolBase" else "NDArrayBase",
+      "import org.apache.mxnet.annotation.Experimental",
       absFuncs)
   }
 
@@ -110,7 +132,12 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
       }).toSeq
     val packageName = "NDArrayBase"
     val packageDef = "package org.apache.mxnet.javaapi"
-    writeFile(filePath + "javaapi/", packageName, packageDef, absFuncs)
+    writeFile(
+      filePath + "javaapi/",
+      packageDef,
+      packageName,
+      "import org.apache.mxnet.annotation.Experimental",
+      absFuncs)
   }
 
   def generateAPIDocFromBackend(func: Func, withParam: Boolean = true): String = {
@@ -146,7 +173,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     }
   }
 
-  def generateAPISignature(func: Func, isSymbol: Boolean): String = {
+  def generateAPISignature(func: Func, isSymbol: Boolean, typeParameter: String = ""): String = {
     val argDef = ListBuffer[String]()
 
     argDef ++= typedFunctionCommonArgDef(func)
@@ -162,7 +189,7 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     val returnType = func.returnType
 
     s"""@Experimental
-       |def ${func.name} (${argDef.mkString(", ")}): $returnType""".stripMargin
+       |def ${func.name}$typeParameter (${argDef.mkString(", ")}): $returnType""".stripMargin
   }
 
   def generateJavaAPISignature(func : Func) : String = {
@@ -223,30 +250,30 @@ private[mxnet] object APIDocGenerator extends GeneratorBase {
     }
   }
 
-  def writeFile(FILE_PATH: String, className: String, packageDef: String,
-                absFuncs: Seq[String]): String = {
+  def writeFile(FILE_PATH: String, packageDef: String, className: String,
+                imports: String, absFuncs: Seq[String]): String = {
 
     val finalStr =
       s"""/*
-         |* Licensed to the Apache Software Foundation (ASF) under one or more
-         |* contributor license agreements.  See the NOTICE file distributed with
-         |* this work for additional information regarding copyright ownership.
-         |* The ASF licenses this file to You under the Apache License, Version 2.0
-         |* (the "License"); you may not use this file except in compliance with
-         |* the License.  You may obtain a copy of the License at
-         |*
-         |*    http://www.apache.org/licenses/LICENSE-2.0
-         |*
-         |* Unless required by applicable law or agreed to in writing, software
-         |* distributed under the License is distributed on an "AS IS" BASIS,
-         |* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-         |* See the License for the specific language governing permissions and
-         |* limitations under the License.
-         |*/
+         | * Licensed to the Apache Software Foundation (ASF) under one or more
+         | * contributor license agreements.  See the NOTICE file distributed with
+         | * this work for additional information regarding copyright ownership.
+         | * The ASF licenses this file to You under the Apache License, Version 2.0
+         | * (the "License"); you may not use this file except in compliance with
+         | * the License.  You may obtain a copy of the License at
+         | *
+         | *    http://www.apache.org/licenses/LICENSE-2.0
+         | *
+         | * Unless required by applicable law or agreed to in writing, software
+         | * distributed under the License is distributed on an "AS IS" BASIS,
+         | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+         | * See the License for the specific language governing permissions and
+         | * limitations under the License.
+         | */
          |
          |$packageDef
          |
-         |import org.apache.mxnet.annotation.Experimental
+         |$imports
          |
          |// scalastyle:off
          |abstract class $className {
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
index 9245ef1b437f..498c4e943669 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala
@@ -23,7 +23,7 @@ import org.apache.mxnet.utils.{CToScalaUtils, OperatorBuildUtils}
 import scala.collection.mutable.ListBuffer
 import scala.reflect.macros.blackbox
 
-abstract class GeneratorBase {
+private[mxnet] abstract class GeneratorBase {
   type Handle = Long
 
   case class Arg(argName: String, argType: String, argDesc: String, isOptional: Boolean) {
@@ -46,7 +46,8 @@ abstract class GeneratorBase {
     }
   }
 
-  def typeSafeFunctionsToGenerate(isSymbol: Boolean, isContrib: Boolean): List[Func] = {
+  // filter the operators to generate in the type-safe Symbol.api and NDArray.api
+  protected def typeSafeFunctionsToGenerate(isSymbol: Boolean, isContrib: Boolean): List[Func] = {
     // Operators that should not be generated
     val notGenerated = Set("Custom")
 
@@ -95,7 +96,7 @@ abstract class GeneratorBase {
       else if (isSymbol) "org.apache.mxnet.Symbol"
       else "org.apache.mxnet.NDArray"
       val typeAndOption =
-        CToScalaUtils.argumentCleaner(argName, argType, family)
+        CToScalaUtils.argumentCleaner(argName, argType, family, isJava)
       Arg(argName, typeAndOption._1, argDesc, typeAndOption._2)
     }
     val returnType =
@@ -144,8 +145,8 @@ abstract class GeneratorBase {
     result
   }
 
+  // build function argument definition, with optionality, and safe names
   protected def typedFunctionCommonArgDef(func: Func): List[String] = {
-    // build function argument definition, with optionality, and safe names
     func.listOfArgs.map(arg =>
       if (arg.isOptional) {
         // let's avoid a stupid Option[Array[...]]
@@ -161,3 +162,71 @@ abstract class GeneratorBase {
     )
   }
 }
+
+// a mixin to ease generating the Random module
+private[mxnet] trait RandomHelpers {
+  self: GeneratorBase =>
+
+  // a generic type spec used in Symbol.random and NDArray.random modules
+  protected def randomGenericTypeSpec(isSymbol: Boolean, fullPackageSpec: Boolean): String = {
+    val classTag = if (fullPackageSpec) "scala.reflect.ClassTag" else "ClassTag"
+    if (isSymbol) s"[T: SymbolOrScalar : $classTag]"
+    else s"[T: NDArrayOrScalar : $classTag]"
+  }
+
+  // filter the operators to generate in the type-safe Symbol.random and NDArray.random
+  protected def typeSafeRandomFunctionsToGenerate(isSymbol: Boolean): List[Func] = {
+    getBackEndFunctions(isSymbol)
+      .filter(f => f.name.startsWith("_sample_") || f.name.startsWith("_random_"))
+      .map(f => f.copy(name = f.name.stripPrefix("_")))
+      // unify _random and _sample
+      .map(f => unifyRandom(f, isSymbol))
+      // deduplicate
+      .groupBy(_.name)
+      .mapValues(_.head)
+      .values
+      .toList
+  }
+
+  // unify call targets (random_xyz and sample_xyz) and unify their argument types
+  private def unifyRandom(func: Func, isSymbol: Boolean): Func = {
+    var typeConv = Set("org.apache.mxnet.NDArray", "org.apache.mxnet.Symbol",
+      "Float", "Int")
+
+    func.copy(
+      name = func.name.replaceAll("(random|sample)_", ""),
+      listOfArgs = func.listOfArgs
+        .map(hackNormalFunc)
+        .map(arg =>
+          if (typeConv(arg.argType)) arg.copy(argType = "T")
+          else arg
+        )
+      // TODO: some functions are non consistent in random_ vs sample_ regarding optionality
+      // we may try to unify that as well here.
+    )
+  }
+
+  // hacks to manage the fact that random_normal and sample_normal have
+  // inconsistent parameter naming in the back-end
+  // this first one merges loc/scale into mu/sigma
+  protected def hackNormalFunc(arg: Arg): Arg = {
+    if (arg.argName == "loc") arg.copy(argName = "mu")
+    else if (arg.argName == "scale") arg.copy(argName = "sigma")
+    else arg
+  }
+
+  // this second one reverts this merge prior to back-end call
+  protected def unhackNormalFunc(func: Func): String = {
+    if (func.name.equals("normal")) {
+      s"""if(target.equals("random_normal")) {
+         |  if(map.contains("mu")) { map("loc") = map("mu"); map.remove("mu")  }
+         |  if(map.contains("sigma")) { map("scale") = map("sigma"); map.remove("sigma") }
+         |}
+       """.stripMargin
+    } else {
+      ""
+    }
+
+  }
+
+}
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
index d85abe1ecc4f..c18694b59bf6 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala
@@ -18,7 +18,6 @@
 package org.apache.mxnet
 
 import scala.annotation.StaticAnnotation
-import scala.collection.mutable.ListBuffer
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
 
@@ -30,6 +29,14 @@ private[mxnet] class AddNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation
   private[mxnet] def macroTransform(annottees: Any*) = macro TypedNDArrayAPIMacro.typeSafeAPIDefs
 }
 
+private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) =
+  macro TypedNDArrayRandomAPIMacro.typeSafeAPIDefs
+}
+
+/**
+  * For non-typed NDArray API
+  */
 private[mxnet] object NDArrayMacro extends GeneratorBase {
 
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -70,6 +77,9 @@ private[mxnet] object NDArrayMacro extends GeneratorBase {
   }
 }
 
+/**
+  * NDArray.api code generation
+  */
 private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
 
   def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -78,9 +88,9 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
       case q"new AddNDArrayAPIs($b)" => c.eval[Boolean](c.Expr(b))
     }
 
-    val functions = typeSafeFunctionsToGenerate(isSymbol = false, isContrib)
+    val functionDefs = typeSafeFunctionsToGenerate(isSymbol = false, isContrib)
+      .map(f => buildTypedFunction(c)(f))
 
-    val functionDefs = functions.map(f => buildTypedFunction(c)(f))
     structGeneration(c)(functionDefs, annottees: _*)
   }
 
@@ -89,49 +99,136 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase {
     import c.universe._
 
     val returnType = "org.apache.mxnet.NDArrayFuncReturn"
-    val ndarrayType = "org.apache.mxnet.NDArray"
-
-    // Construct argument field
-    val argDef = ListBuffer[String]()
-    argDef ++= typedFunctionCommonArgDef(function)
-    argDef += "out : Option[NDArray] = None"
-
-    // Construct Implementation field
-    var impl = ListBuffer[String]()
-    impl += "val map = scala.collection.mutable.Map[String, Any]()"
-    impl += s"val args = scala.collection.mutable.ArrayBuffer.empty[$ndarrayType]"
-
-    // NDArray arg implementation
-    impl ++= function.listOfArgs.map { arg =>
-      if (arg.argType.equals(s"Array[$ndarrayType]")) {
-        s"args ++= ${arg.safeArgName}"
-      } else {
-        val base =
-          if (arg.argType.equals(ndarrayType)) {
-            // ndarrays go to args
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+ "out : Option[NDArray] = None"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        // ndarrays go to args, other types go to kwargs
+        if (arg.argType.equals(s"Array[org.apache.mxnet.NDArray]")) {
+          s"args ++= ${arg.safeArgName}.toSeq"
+        } else {
+          val base = if (arg.argType.equals("org.apache.mxnet.NDArray")) {
             s"args += ${arg.safeArgName}"
           } else {
-            // other types go to kwargs
             s"""map("${arg.argName}") = ${arg.safeArgName}"""
           }
-        if (arg.isOptional) s"if (!${arg.safeArgName}.isEmpty) $base.get"
-        else base
+          if (arg.isOptional) s"if (!${arg.safeArgName}.isEmpty) $base.get"
+          else base
+        }
       }
-    }
 
-    impl +=
-      s"""if (!out.isEmpty) map("out") = out.get
-         |org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
-         |  "${function.name}", args.toSeq, map.toMap)
+    val impl =
+      s"""
+         |def ${function.name}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  val args = scala.collection.mutable.ArrayBuffer.empty[org.apache.mxnet.NDArray]
+         |
+         |  if (!out.isEmpty) map("out") = out.get
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
+         |    "${function.name}", args.toSeq, map.toMap)
+         |}
        """.stripMargin
 
-    // Combine and build the function string
-    val finalStr =
-      s"""def ${function.name}
-         |   (${argDef.mkString(",")}) : $returnType
-         | = {${impl.mkString("\n")}}
+    c.parse(impl).asInstanceOf[DefDef]
+  }
+}
+
+
+/**
+  * NDArray.random code generation
+  */
+private[mxnet] object TypedNDArrayRandomAPIMacro extends GeneratorBase
+  with RandomHelpers {
+
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
+    // Note: no contrib managed in this module
+
+    val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = false)
+      .map(f => buildTypedFunction(c)(f))
+
+    structGeneration(c)(functionDefs, annottees: _*)
+  }
+
+  protected def buildTypedFunction(c: blackbox.Context)
+                                  (function: Func): c.universe.DefDef = {
+    import c.universe._
+
+    val returnType = "org.apache.mxnet.NDArrayFuncReturn"
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+ "out : Option[NDArray] = None"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        // ndarrays go to args, other types go to kwargs
+        if (arg.argType.equals("Array[org.apache.mxnet.NDArray]")) {
+          s"args ++= ${arg.safeArgName}.toSeq"
+        } else {
+          if (arg.argType.equals("T")) {
+            if (arg.isOptional) {
+              s"""if(${arg.safeArgName}.isDefined) {
+                 |  if(isScalar) {
+                 |    map("${arg.argName}") = ${arg.safeArgName}.get
+                 |  } else {
+                 |    args += ${arg.safeArgName}.get.asInstanceOf[org.apache.mxnet.NDArray]
+                 |  }
+                 |}
+             """.stripMargin
+            } else {
+              s"""if(isScalar) {
+                 |  map("${arg.argName}") = ${arg.safeArgName}
+                 |} else {
+                 |  args += ${arg.safeArgName}.asInstanceOf[org.apache.mxnet.NDArray]
+                 |}
+             """.stripMargin
+            }
+          } else {
+            if (arg.isOptional) {
+              s"""if (${arg.safeArgName}.isDefined) map("${arg.argName}")=${arg.safeArgName}.get"""
+            } else {
+              s"""map("${arg.argName}") = ${arg.safeArgName}"""
+            }
+          }
+        }
+      }
+
+    val impl =
+      s"""
+         |def ${function.name}${randomGenericTypeSpec(false, true)}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  val args = scala.collection.mutable.ArrayBuffer.empty[org.apache.mxnet.NDArray]
+         |  val isScalar = NDArrayOrScalar[T].isScalar
+         |
+         |  if(out.isDefined) map("out") = out.get
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  val target = if(isScalar) {
+         |    "random_${function.name}"
+         |  } else {
+         |    "sample_${function.name}"
+         |  }
+         |
+         |  ${unhackNormalFunc(function)}
+         |
+         |  org.apache.mxnet.NDArray.genericNDArrayFunctionInvoke(
+         |    target, args.toSeq, map.toMap)
+         |}
        """.stripMargin
 
-    c.parse(finalStr).asInstanceOf[DefDef]
+    c.parse(impl).asInstanceOf[DefDef]
   }
+
+
 }
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
index ab864e1ef195..7ec80b9c066c 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala
@@ -17,8 +17,8 @@
 
 package org.apache.mxnet
 
+
 import scala.annotation.StaticAnnotation
-import scala.collection.mutable.ListBuffer
 import scala.language.experimental.macros
 import scala.reflect.macros.blackbox
 
@@ -30,6 +30,14 @@ private[mxnet] class AddSymbolAPIs(isContrib: Boolean) extends StaticAnnotation
   private[mxnet] def macroTransform(annottees: Any*) = macro TypedSymbolAPIMacro.typeSafeAPIDefs
 }
 
+private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnotation {
+  private[mxnet] def macroTransform(annottees: Any*) =
+  macro TypedSymbolRandomAPIMacro.typeSafeAPIDefs
+}
+
+/**
+  * For non-typed Symbol API
+  */
 private[mxnet] object SymbolMacro extends GeneratorBase {
 
   def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -63,6 +71,9 @@ private[mxnet] object SymbolMacro extends GeneratorBase {
   }
 }
 
+/**
+  * Symbol.api code generation
+  */
 private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
 
   def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
@@ -71,9 +82,9 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
       case q"new AddSymbolAPIs($b)" => c.eval[Boolean](c.Expr(b))
     }
 
-    val functions = typeSafeFunctionsToGenerate(isSymbol = true, isContrib)
+    val functionDefs = typeSafeFunctionsToGenerate(isSymbol = true, isContrib)
+      .map(f => buildTypedFunction(c)(f))
 
-    val functionDefs = functions.map(f => buildTypedFunction(c)(f))
     structGeneration(c)(functionDefs, annottees: _*)
   }
 
@@ -82,45 +93,111 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase {
     import c.universe._
 
     val returnType = "org.apache.mxnet.Symbol"
-    val symbolType = "org.apache.mxnet.Symbol"
-
-    // Construct argument field
-    val argDef = ListBuffer[String]()
-    argDef ++= typedFunctionCommonArgDef(function)
-    argDef += "name : String = null"
-    argDef += "attr : Map[String, String] = null"
-
-    // Construct Implementation field
-    val impl = ListBuffer[String]()
-    impl += "val map = scala.collection.mutable.Map[String, Any]()"
-    impl += s"var args = scala.collection.Seq[$symbolType]()"
-
-    // Symbol arg implementation
-    impl ++= function.listOfArgs.map { arg =>
-      if (arg.argType.equals(s"Array[$symbolType]")) {
-        s"if (!${arg.safeArgName}.isEmpty) args = ${arg.safeArgName}.toSeq"
-      } else {
-        // all go in kwargs
-        if (arg.isOptional) {
-          s"""if (!${arg.safeArgName}.isEmpty) map("${arg.argName}") = ${arg.safeArgName}.get"""
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+
+      "name : String = null" :+
+      "attr : Map[String, String] = null"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        if (arg.argType.equals(s"Array[org.apache.mxnet.Symbol]")) {
+          s"args = ${arg.safeArgName}.toSeq"
         } else {
-          s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          // all go in kwargs
+          if (arg.isOptional) {
+            s"""if (!${arg.safeArgName}.isEmpty) map("${arg.argName}") = ${arg.safeArgName}.get"""
+          } else {
+            s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          }
         }
       }
-    }
 
-    impl +=
-      s"""org.apache.mxnet.Symbol.createSymbolGeneral(
-         |  "${function.name}", name, attr, args, map.toMap)
+    val impl =
+      s"""
+         |def ${function.name}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  var args = scala.collection.Seq[org.apache.mxnet.Symbol]()
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  org.apache.mxnet.Symbol.createSymbolGeneral(
+         |    "${function.name}", name, attr, args, map.toMap)
+         |}
        """.stripMargin
 
-    // Combine and build the function string
-    val finalStr =
-      s"""def ${function.name}
-         |   (${argDef.mkString(",")}) : $returnType
-         | = {${impl.mkString("\n")}}
+    c.parse(impl).asInstanceOf[DefDef]
+  }
+}
+
+
+/**
+  * Symbol.random code generation
+  */
+private[mxnet] object TypedSymbolRandomAPIMacro extends GeneratorBase
+  with RandomHelpers {
+
+  def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = {
+    val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = true)
+      .map(f => buildTypedFunction(c)(f))
+
+    structGeneration(c)(functionDefs, annottees: _*)
+  }
+
+  protected def buildTypedFunction(c: blackbox.Context)
+                                  (function: Func): c.universe.DefDef = {
+    import c.universe._
+
+    val returnType = "org.apache.mxnet.Symbol"
+
+    // Construct API arguments declaration
+    val argDecl = super.typedFunctionCommonArgDef(function) :+
+      "name : String = null" :+
+      "attr : Map[String, String] = null"
+
+    // Map API input args to backend args
+    val backendArgsMapping =
+      function.listOfArgs.map { arg =>
+        if (arg.argType.equals(s"Array[org.apache.mxnet.Symbol]")) {
+          s"args = ${arg.safeArgName}.toSeq"
+        } else {
+          // all go in kwargs
+          if (arg.isOptional) {
+            s"""if (${arg.safeArgName}.isDefined) map("${arg.argName}") = ${arg.safeArgName}.get"""
+          } else {
+            s"""map("${arg.argName}") = ${arg.safeArgName}"""
+          }
+        }
+      }
+
+    val impl =
+      s"""
+         |def ${function.name}${randomGenericTypeSpec(true, true)}
+         |  (${argDecl.mkString(",")}): $returnType = {
+         |
+         |  val map = scala.collection.mutable.Map[String, Any]()
+         |  var args = scala.collection.Seq[org.apache.mxnet.Symbol]()
+         |  val isScalar = SymbolOrScalar[T].isScalar
+         |
+         |  ${backendArgsMapping.mkString("\n")}
+         |
+         |  val target = if(isScalar) {
+         |    "random_${function.name}"
+         |  } else {
+         |    "sample_${function.name}"
+         |  }
+         |
+         |  ${unhackNormalFunc(function)}
+         |
+         |  org.apache.mxnet.Symbol.createSymbolGeneral(
+         |    target, name, attr, args, map.toMap)
+         |}
        """.stripMargin
 
-    c.parse(finalStr).asInstanceOf[DefDef]
+    c.parse(impl).asInstanceOf[DefDef]
   }
 }
+
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala
index 2fd8b2e73c7a..57c4cfba10b7 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala
@@ -18,23 +18,35 @@ package org.apache.mxnet.utils
 
 private[mxnet] object CToScalaUtils {
 
-
+  private val javaType = Map(
+    "float" -> "java.lang.Float",
+    "int" -> "java.lang.Integer",
+    "long" -> "java.lang.Long",
+    "double" -> "java.lang.Double",
+    "bool" -> "java.lang.Boolean")
+  private val scalaType = Map(
+    "float" -> "Float",
+    "int" -> "Int",
+    "long" -> "Long",
+    "double" -> "Double",
+    "bool" -> "Boolean")
 
   // Convert C++ Types to Scala Types
   def typeConversion(in : String, argType : String = "", argName : String,
-                     returnType : String) : String = {
+                     returnType : String, isJava : Boolean) : String = {
     val header = returnType.split("\\.").dropRight(1)
+    val types = if (isJava) javaType else scalaType
     in match {
       case "Shape(tuple)" | "ShapeorNone" => s"${header.mkString(".")}.Shape"
       case "Symbol" | "NDArray" | "NDArray-or-Symbol" => returnType
       case "Symbol[]" | "NDArray[]" | "NDArray-or-Symbol[]" | "SymbolorSymbol[]"
       => s"Array[$returnType]"
-      case "float" | "real_t" | "floatorNone" => "java.lang.Float"
-      case "int" | "intorNone" | "int(non-negative)" => "java.lang.Integer"
-      case "long" | "long(non-negative)" => "java.lang.Long"
-      case "double" | "doubleorNone" => "java.lang.Double"
+      case "float" | "real_t" | "floatorNone" => types("float")
+      case "int" | "intorNone" | "int(non-negative)" => types("int")
+      case "long" | "long(non-negative)" => types("long")
+      case "double" | "doubleorNone" => types("double")
       case "string" => "String"
-      case "boolean" | "booleanorNone" => "java.lang.Boolean"
+      case "boolean" | "booleanorNone" => types("bool")
       case "tupleof<float>" | "tupleof<double>" | "tupleof<>" | "ptr" | "" => "Any"
       case default => throw new IllegalArgumentException(
         s"Invalid type for args: $default\nString argType: $argType\nargName: $argName")
@@ -54,7 +66,7 @@ private[mxnet] object CToScalaUtils {
     * @return (Scala_Type, isOptional)
     */
   def argumentCleaner(argName: String, argType : String,
-                      returnType : String) : (String, Boolean) = {
+                      returnType : String, isJava : Boolean) : (String, Boolean) = {
     val spaceRemoved = argType.replaceAll("\\s+", "")
     var commaRemoved : Array[String] = new Array[String](0)
     // Deal with the case e.g: stype : {'csr', 'default', 'row_sparse'}
@@ -72,9 +84,9 @@ private[mxnet] object CToScalaUtils {
         s"""expected "optional" got ${commaRemoved(1)}""")
       require(commaRemoved(2).startsWith("default="),
         s"""expected "default=..." got ${commaRemoved(2)}""")
-      (typeConversion(commaRemoved(0), argType, argName, returnType), true)
+      (typeConversion(commaRemoved(0), argType, argName, returnType, isJava), true)
     } else if (commaRemoved.length == 2 || commaRemoved.length == 1) {
-      val tempType = typeConversion(commaRemoved(0), argType, argName, returnType)
+      val tempType = typeConversion(commaRemoved(0), argType, argName, returnType, isJava)
       val tempOptional = tempType.equals("org.apache.mxnet.Symbol")
       (tempType, tempOptional)
     } else {
diff --git a/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala b/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala
index 4404b0885d57..4069bba25220 100644
--- a/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala
+++ b/scala-package/macros/src/test/scala/org/apache/mxnet/MacrosSuite.scala
@@ -36,14 +36,15 @@ class MacrosSuite extends FunSuite with BeforeAndAfterAll {
     )
     val output = List(
       ("org.apache.mxnet.Symbol", true),
-      ("java.lang.Integer", false),
+      ("Int", false),
       ("org.apache.mxnet.Shape", true),
       ("String", true),
       ("Any", false)
     )
 
     for (idx <- input.indices) {
-      val result = CToScalaUtils.argumentCleaner("Sample", input(idx), "org.apache.mxnet.Symbol")
+      val result = CToScalaUtils.argumentCleaner("Sample", input(idx),
+        "org.apache.mxnet.Symbol", false)
       assert(result._1 === output(idx)._1 && result._2 === output(idx)._2)
     }
   }
diff --git a/scala-package/mxnet-demo/java-demo/Makefile b/scala-package/mxnet-demo/java-demo/Makefile
index bb47db1c6d27..4f2b5e938970 100644
--- a/scala-package/mxnet-demo/java-demo/Makefile
+++ b/scala-package/mxnet-demo/java-demo/Makefile
@@ -16,7 +16,7 @@
 # under the License.
 
 SCALA_VERSION_PROFILE := 2.11
-MXNET_VERSION := 1.4.0-SNAPSHOT
+MXNET_VERSION := [1.5.0-SNAPSHOT,\)
 
 ifeq ($(OS),Windows_NT)
 	UNAME_S := Windows
diff --git a/scala-package/mxnet-demo/java-demo/README.md b/scala-package/mxnet-demo/java-demo/README.md
index dbe18052a899..ca2828ae405d 100644
--- a/scala-package/mxnet-demo/java-demo/README.md
+++ b/scala-package/mxnet-demo/java-demo/README.md
@@ -12,7 +12,7 @@ You can use the following instruction as an alternative to achieve the same resu
 User are required to use `mvn package` to build the package,
  which are shown below:
 ```Bash
-export SCALA_VERSION_PROFILE=2.11 MXNET_VERSION=1.4.0-SNAPSHOT
+export SCALA_VERSION_PROFILE=2.11 MXNET_VERSION=1.5.0-SNAPSHOT
 export SCALA_PKG_PROFILE=
 mvn package -Dmxnet.profile=$SCALA_PKG_PROFILE \
 		-Dmxnet.scalaprofile=$SCALA_VERSION_PROFILE \
@@ -37,6 +37,13 @@ However, you have to define the Classpath before you run the demo code. More inf
 The `CLASSPATH` should point to the jar file you have downloaded.
 
 It will load the library automatically and run the example
+
+In order to use the `Param Object`, users are required to place this line at the top of the class:
+```
+static NDArray$ NDArray = NDArray$.MODULE$;
+```
+This helps by making the NDArray companion object static and accessible from the outside.
+
 ### Object Detection using Inference API
 We also provide an example to do object detection, which downloads a ImageNet trained resnet50 model and runs inference on an image to return the classification result as
 ```Bash
@@ -80,5 +87,5 @@ sudo apt install libopencv-imgcodecs3.4
 
 Is there any other version available?
 
-You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.4.0-SNAPSHOT~~).
+You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.5.0-SNAPSHOT~~).
 Please keep the same version in the Makefile or [above version](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo.
diff --git a/scala-package/mxnet-demo/java-demo/bin/java_sample.sh b/scala-package/mxnet-demo/java-demo/bin/java_sample.sh
old mode 100644
new mode 100755
index 2ec9a78c3233..4fb724aca8db
--- a/scala-package/mxnet-demo/java-demo/bin/java_sample.sh
+++ b/scala-package/mxnet-demo/java-demo/bin/java_sample.sh
@@ -16,5 +16,5 @@
 # under the License.
 #!/bin/bash
 CURR_DIR=$(cd $(dirname $0)/../; pwd)
-CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/classes/lib/*
+CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/dependency/*
 java -Xmx8G  -cp $CLASSPATH mxnet.HelloWorld
\ No newline at end of file
diff --git a/scala-package/mxnet-demo/java-demo/bin/run_od.sh b/scala-package/mxnet-demo/java-demo/bin/run_od.sh
old mode 100644
new mode 100755
index e3c8fd545048..abd0bf5b1b93
--- a/scala-package/mxnet-demo/java-demo/bin/run_od.sh
+++ b/scala-package/mxnet-demo/java-demo/bin/run_od.sh
@@ -16,5 +16,5 @@
 # under the License.
 #!/bin/bash
 CURR_DIR=$(cd $(dirname $0)/../; pwd)
-CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/classes/lib/*
+CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/dependency/*
 java -Xmx8G  -cp $CLASSPATH mxnet.ObjectDetection
\ No newline at end of file
diff --git a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/HelloWorld.java b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/HelloWorld.java
index 3f209a6c6c84..71981e2691c5 100644
--- a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/HelloWorld.java
+++ b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/HelloWorld.java
@@ -20,9 +20,13 @@
 import java.util.Arrays;
 
 public class HelloWorld {
+    static NDArray$ NDArray = NDArray$.MODULE$;
+
     public static void main(String[] args) {
     	System.out.println("Hello World!");
         NDArray nd = new NDArray(new float[]{2.0f, 3.0f}, new Shape(new int[]{1, 2}), Context.cpu());
         System.out.println(nd.shape());
+        NDArray nd2 = NDArray.dot(NDArray.new dotParam(nd, nd.T()))[0];
+        System.out.println(Arrays.toString(nd2.toArray()));
     }
 }
diff --git a/scala-package/mxnet-demo/scala-demo/pom.xml b/scala-package/mxnet-demo/scala-demo/pom.xml
index 8fc30e78cac8..a908487cd21d 100644
--- a/scala-package/mxnet-demo/scala-demo/pom.xml
+++ b/scala-package/mxnet-demo/scala-demo/pom.xml
@@ -4,7 +4,7 @@
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
     <groupId>Demo</groupId>
-    <artifactId>mxnet-scala-demo_2.11</artifactId>
+    <artifactId>mxnet-scala-demo</artifactId>
     <version>1.0-SNAPSHOT</version>
     <name>MXNet Scala Demo</name>
     <packaging>pom</packaging>
diff --git a/scala-package/native/README.md b/scala-package/native/README.md
index cb6dd3890dd2..c87b064fff02 100644
--- a/scala-package/native/README.md
+++ b/scala-package/native/README.md
@@ -6,7 +6,11 @@ MXNet Scala JNI is a thin wrapper layer of underlying libmxnet.so.
 JNI native code requires a header file that matches the java/scala interface,
 this file is usually generated with javah.
 
-In our case, jni_helper_func.h is generated and will be used to compile native code.
+In our case, org_apache_mxnet_native_c.h is generated and will be used to compile native code.
+
+To improve build performance, we check in the generated org_apache_mxnet_native_c.h file.
+We also added a check to detect mismatches between the Scala code and the generated header. The checker
+makes sure we won't forget to update the org_apache_mxnet_native_c.h file.
 
 
 ## Linker options
diff --git a/scala-package/native/linux-x86_64-cpu/pom.xml b/scala-package/native/linux-x86_64-cpu/pom.xml
deleted file mode 100644
index 7cfd01a4ef79..000000000000
--- a/scala-package/native/linux-x86_64-cpu/pom.xml
+++ /dev/null
@@ -1,107 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>libmxnet-scala-linux-x86_64-cpu</artifactId>
-  <name>MXNet Scala Package - Native Linux-x86_64 CPU-only</name>
-  <url>http://maven.apache.org</url>
-
-  <packaging>so</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jar</type>
-      <scope>compile</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>native-maven-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <!--  trigger javah -->
-          <javahOS>linux</javahOS>
-          <compilerProvider>generic-classic</compilerProvider>
-          <compilerExecutable>${cxx}</compilerExecutable>
-          <linkerExecutable>${cxx}</linkerExecutable>
-          <sources>
-            <source>
-              <directory>../src/main/native</directory>
-              <fileNames>
-                <fileName>org_apache_mxnet_native_c_api.cc</fileName>
-              </fileNames>
-            </source>
-          </sources>
-          <compilerStartOptions>
-            <compilerStartOption>-std=c++0x</compilerStartOption>
-          </compilerStartOptions>
-          <compilerEndOptions>
-            <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
-            <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
-            <compilerEndOption>-O3 -DNDEBUG=1 -fPIC -msse3 -mf16c</compilerEndOption>
-            <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
-          </compilerEndOptions>
-          <linkerStartOptions>
-            <linkerStartOption>-shared</linkerStartOption>
-          </linkerStartOptions>
-          <linkerEndOptions>
-            <linkerEndOption>-Wl,-rpath=${dollar}ORIGIN -lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
-          </linkerEndOptions>
-        </configuration>
-
-        <executions>
-          <execution>
-            <id>javah</id>
-            <phase>generate-sources</phase>
-            <configuration>
-              <javahProvider>default</javahProvider>
-              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
-              <workingDirectory>${basedir}</workingDirectory>
-              <javahOutputFileName>org_apache_mxnet_native_c_api.h</javahOutputFileName>
-              <javahClassNames>
-                <javahClassName>org.apache.mxnet.LibInfo</javahClassName>
-              </javahClassNames>
-            </configuration>
-            <goals>
-              <goal>javah</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/native/linux-x86_64-gpu/pom.xml b/scala-package/native/linux-x86_64-gpu/pom.xml
deleted file mode 100644
index 668f330b5ff9..000000000000
--- a/scala-package/native/linux-x86_64-gpu/pom.xml
+++ /dev/null
@@ -1,107 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>libmxnet-scala-linux-x86_64-gpu</artifactId>
-  <name>MXNet Scala Package - Native Linux-x86_64 GPU</name>
-  <url>http://maven.apache.org</url>
-
-  <packaging>so</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jar</type>
-      <scope>compile</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>native-maven-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <!--  trigger javah -->
-          <javahOS>linux</javahOS>
-          <compilerProvider>generic-classic</compilerProvider>
-          <compilerExecutable>${cxx}</compilerExecutable>
-          <linkerExecutable>${cxx}</linkerExecutable>
-          <sources>
-            <source>
-              <directory>../src/main/native</directory>
-              <fileNames>
-                <fileName>org_apache_mxnet_native_c_api.cc</fileName>
-              </fileNames>
-            </source>
-          </sources>
-          <compilerStartOptions>
-            <compilerStartOption>-std=c++0x</compilerStartOption>
-          </compilerStartOptions>
-          <compilerEndOptions>
-            <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
-            <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
-            <compilerEndOption>-O3 -DNDEBUG=1 -fPIC -msse3 -mf16c</compilerEndOption>
-            <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
-          </compilerEndOptions>
-          <linkerStartOptions>
-            <linkerStartOption>-shared</linkerStartOption>
-          </linkerStartOptions>
-          <linkerEndOptions>
-            <linkerEndOption>-Wl,-rpath=${dollar}ORIGIN -lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
-          </linkerEndOptions>
-        </configuration>
-
-        <executions>
-          <execution>
-            <id>javah</id>
-            <phase>generate-sources</phase>
-            <configuration>
-              <javahProvider>default</javahProvider>
-              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
-              <workingDirectory>${basedir}</workingDirectory>
-              <javahOutputFileName>org_apache_mxnet_native_c_api.h</javahOutputFileName>
-              <javahClassNames>
-                <javahClassName>org.apache.mxnet.LibInfo</javahClassName>
-              </javahClassNames>
-            </configuration>
-            <goals>
-              <goal>javah</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/native/osx-x86_64-cpu/pom.xml b/scala-package/native/osx-x86_64-cpu/pom.xml
deleted file mode 100644
index 425ca96815de..000000000000
--- a/scala-package/native/osx-x86_64-cpu/pom.xml
+++ /dev/null
@@ -1,142 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <parent>
-    <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-scala-native-parent</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
-    <relativePath>../pom.xml</relativePath>
-  </parent>
-
-  <artifactId>libmxnet-scala-osx-x86_64-cpu</artifactId>
-  <name>MXNet Scala Package - Native OSX-x86_64 CPU-only</name>
-  <url>http://maven.apache.org</url>
-
-  <packaging>jnilib</packaging>
-
-  <properties>
-    <MXNET_DIR>${project.parent.parent.basedir}/..</MXNET_DIR>
-  </properties>
-
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
-      <type>jar</type>
-      <scope>compile</scope>
-    </dependency>
-  </dependencies>
-
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>native-maven-plugin</artifactId>
-        <extensions>true</extensions>
-        <configuration>
-          <!--  trigger javah -->
-          <javahOS>darwin</javahOS>
-          <compilerProvider>generic-classic</compilerProvider>
-          <compilerExecutable>${cxx}</compilerExecutable>
-          <linkerExecutable>${cxx}</linkerExecutable>
-          <sources>
-            <source>
-              <directory>../src/main/native</directory>
-              <fileNames>
-                <fileName>org_apache_mxnet_native_c_api.cc</fileName>
-              </fileNames>
-            </source>
-          </sources>
-          <compilerStartOptions>
-            <compilerStartOption>-std=c++0x</compilerStartOption>
-          </compilerStartOptions>
-          <compilerEndOptions>
-            <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
-            <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
-            <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
-            <compilerEndOption>-g -O0 -fPIC -msse3 -mf16c</compilerEndOption>
-            <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
-          </compilerEndOptions>
-          <linkerStartOptions>
-            <linkerStartOption>-shared</linkerStartOption>
-          </linkerStartOptions>
-          <linkerMiddleOptions>
-            <linkerMiddleOption>-framework JavaVM</linkerMiddleOption>
-            <linkerMiddleOption>-Wl,-exported_symbol,_Java_*</linkerMiddleOption>
-            <linkerMiddleOption>-Wl,-x</linkerMiddleOption>
-          </linkerMiddleOptions>
-          <linkerEndOptions>
-            <linkerEndOption>-Wl,-install_name,libmxnet-scala.jnilib -lmxnet -L${MXNET_DIR}/lib</linkerEndOption>
-          </linkerEndOptions>
-        </configuration>
-
-        <executions>
-          <execution>
-            <id>javah</id>
-            <phase>generate-sources</phase>
-            <configuration>
-              <javahProvider>default</javahProvider>
-              <javahOutputDirectory>${project.build.directory}/custom-javah</javahOutputDirectory>
-              <workingDirectory>${basedir}</workingDirectory>
-              <javahOutputFileName>org_apache_mxnet_native_c_api.h</javahOutputFileName>
-              <javahClassNames>
-                <javahClassName>org.apache.mxnet.LibInfo</javahClassName>
-              </javahClassNames>
-            </configuration>
-            <goals>
-              <goal>javah</goal>
-            </goals>
-          </execution>
-        </executions>
-      </plugin>
-
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>exec-maven-plugin</artifactId>
-        <version>1.6.0</version>
-        <executions>
-          <execution>
-            <id>post-native-build</id>
-            <phase>package</phase>
-            <goals>
-              <goal>exec</goal>
-            </goals>
-            <configuration>
-              <executable>install_name_tool</executable>
-              <commandlineArgs>-change lib/libmxnet.so @loader_path/libmxnet.so ${project.build.directory}/${artifactId}.jnilib</commandlineArgs>
-            </configuration>
-          </execution>
-          <execution>
-            <id>link-native-lib</id>
-            <phase>generate-resources</phase>
-            <goals>
-                <goal>exec</goal>
-            </goals>
-            <configuration>
-                <executable>ln</executable>
-                <commandlineArgs>-sf ${MXNET_DIR}/lib/libmxnet.so ${project.build.directory}/libmxnet.so</commandlineArgs>
-            </configuration>
-          </execution>
-        </executions>
-      </plugin>
-    </plugins>
-  </build>
-</project>
diff --git a/scala-package/native/pom.xml b/scala-package/native/pom.xml
index 2f6425d21104..7b776d5b5171 100644
--- a/scala-package/native/pom.xml
+++ b/scala-package/native/pom.xml
@@ -5,46 +5,165 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-scala-native-parent</artifactId>
-  <name>MXNet Scala Package - Native Parent</name>
-  <packaging>pom</packaging>
+  <artifactId>libmxnet-scala</artifactId>
+  <name>MXNet Scala Package - Native</name>
+  <packaging>${libtype}</packaging>
+
+  <properties>
+    <MXNET_DIR>${project.parent.basedir}/..</MXNET_DIR>
+  </properties>
 
   <profiles>
     <profile>
-      <id>osx-x86_64-cpu</id>
-      <modules>
-        <module>osx-x86_64-cpu</module>
-      </modules>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <modules>
-        <module>linux-x86_64-cpu</module>
-      </modules>
+      <id>osx-x86_64</id>
+      <activation>
+        <os><family>mac</family></os>
+      </activation>
+      <build>
+        <plugins>
+          <plugin> <!-- osx-x86_64 profile: compiles and links the JNI wrapper source listed below -->
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>native-maven-plugin</artifactId>
+            <extensions>true</extensions>
+            <configuration>
+              <javahOS>darwin</javahOS>
+              <compilerProvider>generic-classic</compilerProvider>
+              <compilerExecutable>${cxx}</compilerExecutable> <!-- cxx is not set in this POM's properties; presumably supplied by the parent POM or on the command line (confirm) -->
+              <linkerExecutable>${cxx}</linkerExecutable> <!-- the same executable is used for compiling and linking -->
+              <sources>
+                <source>
+                  <directory>src/main/native</directory>
+                  <fileNames>
+                    <fileName>org_apache_mxnet_native_c_api.cc</fileName> <!-- the only translation unit built by this plugin -->
+                  </fileNames>
+                </source>
+              </sources>
+              <compilerStartOptions>
+                <compilerStartOption>-std=c++0x</compilerStartOption>
+              </compilerStartOptions>
+              <compilerEndOptions>
+                <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption> <!-- include paths are all rooted at MXNET_DIR, defined in this POM's properties -->
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
+                <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
+                <compilerEndOption>-g -O0 -fPIC -msse3 -mf16c</compilerEndOption> <!-- debug info and no optimisation here; the linux-x86_64 profile uses -O3 -DNDEBUG=1 instead -->
+                <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
+              </compilerEndOptions>
+              <linkerStartOptions>
+                <linkerStartOption>-shared</linkerStartOption>
+              </linkerStartOptions>
+              <linkerMiddleOptions>
+                <linkerMiddleOption>-framework JavaVM</linkerMiddleOption>
+                <linkerMiddleOption>-Wl,-exported_symbol,_Java_*</linkerMiddleOption> <!-- restrict exported symbols to those matching _Java_* -->
+                <linkerMiddleOption>-Wl,-x</linkerMiddleOption> <!-- linker option x: leave non-global symbols out of the symbol table -->
+              </linkerMiddleOptions>
+              <linkerEndOptions>
+                <linkerEndOption>-Wl,-install_name,libmxnet-scala.jnilib -lmxnet -L${MXNET_DIR}/lib</linkerEndOption> <!-- sets the install name and links libmxnet from MXNET_DIR/lib -->
+              </linkerEndOptions>
+            </configuration>
+          </plugin>
+
+          <plugin> <!-- osx-x86_64 profile: post-processes the built .jnilib with install_name_tool -->
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <version>1.6.0</version>
+            <executions>
+              <execution>
+                <id>post-native-build</id>
+                <phase>package</phase> <!-- bound to the package phase; the command below expects the .jnilib to exist by then -->
+                <goals>
+                  <goal>exec</goal>
+                </goals>
+                <configuration>
+                  <executable>install_name_tool</executable>
+                  <commandlineArgs>-add_rpath @loader_path ${project.build.directory}/${project.artifactId}.jnilib</commandlineArgs> <!-- adds @loader_path as an rpath entry of the .jnilib; NOTE(review): presumably so libmxnet is found next to it (confirm) -->
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
     </profile>
     <profile>
-      <id>linux-x86_64-gpu</id>
-      <modules>
-        <module>linux-x86_64-gpu</module>
-      </modules>
+      <id>linux-x86_64</id>
+      <activation>
+        <os>
+          <family>unix</family>
+          <name>Linux</name>
+        </os>
+      </activation>
+      <build>
+        <plugins>
+          <plugin> <!-- linux-x86_64 profile: compiles and links the JNI wrapper source listed below -->
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>native-maven-plugin</artifactId>
+            <extensions>true</extensions>
+            <configuration>
+              <javahOS>linux</javahOS>
+              <compilerProvider>generic-classic</compilerProvider>
+              <compilerExecutable>${cxx}</compilerExecutable> <!-- cxx is not set in this POM's properties; presumably supplied by the parent POM or on the command line (confirm) -->
+              <linkerExecutable>${cxx}</linkerExecutable> <!-- the same executable is used for compiling and linking -->
+              <sources>
+                <source>
+                  <directory>src/main/native</directory>
+                  <fileNames>
+                    <fileName>org_apache_mxnet_native_c_api.cc</fileName> <!-- the only translation unit built by this plugin -->
+                  </fileNames>
+                </source>
+              </sources>
+              <compilerStartOptions>
+                <compilerStartOption>-std=c++0x</compilerStartOption>
+              </compilerStartOptions>
+              <compilerEndOptions>
+                <compilerEndOption>-I${MXNET_DIR}/include</compilerEndOption> <!-- include paths are all rooted at MXNET_DIR, defined in this POM's properties -->
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dmlc-core/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/mshadow</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/dlpack/include</compilerEndOption>
+                <compilerEndOption>-I${MXNET_DIR}/3rdparty/tvm/nnvm/include</compilerEndOption>
+                <compilerEndOption>-DMSHADOW_USE_MKL=0 -DMSHADOW_USE_CUDA=0</compilerEndOption>
+                <compilerEndOption>-O3 -DNDEBUG=1 -fPIC -msse3 -mf16c</compilerEndOption> <!-- optimised build with NDEBUG defined; the osx-x86_64 profile uses -g -O0 instead -->
+                <compilerEndOption>-Wall -Wsign-compare -Wno-unused-parameter -Wno-unknown-pragmas -Wno-unused-local-typedefs</compilerEndOption>
+              </compilerEndOptions>
+              <linkerStartOptions>
+                <linkerStartOption>-shared</linkerStartOption>
+              </linkerStartOptions>
+              <linkerEndOptions>
+                <linkerEndOption>-Wl,-rpath=${dollar}ORIGIN -lmxnet -L${MXNET_DIR}/lib</linkerEndOption> <!-- NOTE(review): the dollar property is not defined in this POM; presumably it expands to a literal dollar sign so the rpath reads as ORIGIN-relative (confirm) -->
+              </linkerEndOptions>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
     </profile>
   </profiles>
 
   <build>
     <plugins>
       <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>exec-maven-plugin</artifactId>
+        <version>1.6.0</version>
+        <executions>
+          <execution>
+            <id>link-native-lib</id> <!-- symlinks everything under MXNET_DIR/lib into this module's build directory -->
+            <phase>generate-resources</phase>
+            <goals>
+              <goal>exec</goal>
+            </goals>
+            <configuration>
+              <executable>bash</executable> <!-- run through bash so the lib/* glob is expanded -->
+              <commandlineArgs>-c 'mkdir -p "${project.build.directory}" &amp;&amp; ln -sf "${MXNET_DIR}"/lib/* "${project.build.directory}/"'</commandlineArgs> <!-- create the build directory first (it may not exist yet on a clean tree); paths are quoted so directories containing spaces work -->
+            </configuration>
+          </execution>
+        </executions>
       </plugin>
     </plugins>
   </build>
-
 </project>
diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc
index 17d166eac345..ea6e9c8f5ba4 100644
--- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc
+++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc
@@ -33,6 +33,7 @@
 #include <functional>
 #include <string>
 #include <unordered_map>
+#include <vector>
 #include "jni_helper_func.h"
 
 JavaVM *_jvm;
@@ -423,6 +424,15 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyFromCPU
   return ret;
 }
 
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFloat64NDArraySyncCopyFromCPU
+  (JNIEnv *env, jobject obj, jlong arrayPtr, jdoubleArray sourceArr, jint arrSize) {
+  jdouble *sourcePtr = env->GetDoubleArrayElements(sourceArr, NULL);  // may be a JVM-made copy
+  int ret = MXNDArraySyncCopyFromCPU(reinterpret_cast<NDArrayHandle>(arrayPtr),
+                                     static_cast<const double *>(sourcePtr), arrSize);
+  env->ReleaseDoubleArrayElements(sourceArr, sourcePtr, JNI_ABORT);  // read-only: no copy-back
+  return ret;  // status code from MXNDArraySyncCopyFromCPU, passed through unchanged
+}
+
 JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetContext
   (JNIEnv *env, jobject obj, jlong arrayPtr, jobject devTypeId, jobject devId) {
   int outDevType;
diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h
new file mode 100644
index 000000000000..7e8e03de9124
--- /dev/null
+++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h
@@ -0,0 +1,861 @@
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class org_apache_mxnet_LibInfo */
+
+#ifndef _Included_org_apache_mxnet_LibInfo
+#define _Included_org_apache_mxnet_LibInfo
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    nativeLibInit
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_nativeLibInit
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxGetLastError
+ * Signature: ()Ljava/lang/String;
+ */
+JNIEXPORT jstring JNICALL Java_org_apache_mxnet_LibInfo_mxGetLastError
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxListAllOpNames
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxListAllOpNames
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    nnGetOpHandle
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_nnGetOpHandle
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxImperativeInvoke
+ * Signature: (J[J[JLscala/collection/mutable/ArrayBuffer;I[Ljava/lang/String;[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxImperativeInvoke
+  (JNIEnv *, jobject, jlong, jlongArray, jlongArray, jobject, jint, jobjectArray, jobjectArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayCreateNone
+ * Signature: (Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateNone
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayCreateEx
+ * Signature: ([IIIIIILorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayCreateEx
+  (JNIEnv *, jobject, jintArray, jint, jint, jint, jint, jint, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayWaitAll
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayWaitAll
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayWaitToRead
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayWaitToRead
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxListFunctions
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxListFunctions
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxFuncDescribe
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;Lorg/apache/mxnet/Base/RefInt;Lorg/apache/mxnet/Base/RefInt;Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFuncDescribe
+  (JNIEnv *, jobject, jlong, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxFuncGetInfo
+ * Signature: (JLorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFuncGetInfo
+  (JNIEnv *, jobject, jlong, jobject, jobject, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxFuncInvoke
+ * Signature: (J[J[F[J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFuncInvoke
+  (JNIEnv *, jobject, jlong, jlongArray, jfloatArray, jlongArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxFuncInvokeEx
+ * Signature: (J[J[F[JI[[B[[B)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFuncInvokeEx
+  (JNIEnv *, jobject, jlong, jlongArray, jfloatArray, jlongArray, jint, jobjectArray, jobjectArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayGetShape
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape
+  (JNIEnv *, jobject, jlong, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArraySyncCopyToCPU
+ * Signature: (J[BI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyToCPU
+  (JNIEnv *, jobject, jlong, jbyteArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArraySlice
+ * Signature: (JIILorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySlice
+  (JNIEnv *, jobject, jlong, jint, jint, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayAt
+ * Signature: (JILorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayAt
+  (JNIEnv *, jobject, jlong, jint, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayReshape
+ * Signature: (JI[ILorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayReshape
+  (JNIEnv *, jobject, jlong, jint, jintArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArraySyncCopyFromCPU
+ * Signature: (J[FI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySyncCopyFromCPU
+  (JNIEnv *, jobject, jlong, jfloatArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxFloat64NDArraySyncCopyFromCPU
+ * Signature: (J[DI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxFloat64NDArraySyncCopyFromCPU
+  (JNIEnv *, jobject, jlong, jdoubleArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayLoad
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ArrayBuffer;Lorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayLoad
+  (JNIEnv *, jobject, jstring, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArraySave
+ * Signature: (Ljava/lang/String;[J[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySave
+  (JNIEnv *, jobject, jstring, jlongArray, jobjectArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayGetContext
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetContext
+  (JNIEnv *, jobject, jlong, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArraySaveRawBytes
+ * Signature: (JLscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArraySaveRawBytes
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayLoadFromRawBytes
+ * Signature: ([BLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayLoadFromRawBytes
+  (JNIEnv *, jobject, jbyteArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNDArrayGetDType
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetDType
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxInitPSEnv
+ * Signature: ([Ljava/lang/String;[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxInitPSEnv
+  (JNIEnv *, jobject, jobjectArray, jobjectArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreRunServer
+ * Signature: (JLorg/apache/mxnet/KVServerControllerCallback;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreRunServer
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreGetNumDeadNode
+ * Signature: (JILorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreGetNumDeadNode
+  (JNIEnv *, jobject, jlong, jint, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreCreate
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreCreate
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreInit
+ * Signature: (JI[I[J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreInit
+  (JNIEnv *, jobject, jlong, jint, jintArray, jlongArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreInitEx
+ * Signature: (JI[Ljava/lang/String;[J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreInitEx
+  (JNIEnv *, jobject, jlong, jint, jobjectArray, jlongArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStorePush
+ * Signature: (JI[I[JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStorePush
+  (JNIEnv *, jobject, jlong, jint, jintArray, jlongArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStorePushEx
+ * Signature: (JI[Ljava/lang/String;[JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStorePushEx
+  (JNIEnv *, jobject, jlong, jint, jobjectArray, jlongArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStorePull
+ * Signature: (JI[I[JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStorePull
+  (JNIEnv *, jobject, jlong, jint, jintArray, jlongArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStorePullEx
+ * Signature: (JI[Ljava/lang/String;[JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStorePullEx
+  (JNIEnv *, jobject, jlong, jint, jobjectArray, jlongArray, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreSetUpdater
+ * Signature: (JLorg/apache/mxnet/MXKVStoreUpdater;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreSetUpdater
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreIsWorkerNode
+ * Signature: (Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreIsWorkerNode
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreGetType
+ * Signature: (JLorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreGetType
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreSendCommmandToServers
+ * Signature: (JILjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreSendCommmandToServers
+  (JNIEnv *, jobject, jlong, jint, jstring);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreBarrier
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreBarrier
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreGetGroupSize
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreGetGroupSize
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreGetRank
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreGetRank
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreSetBarrierBeforeExit
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreSetBarrierBeforeExit
+  (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxKVStoreFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxKVStoreFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxListDataIters
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxListDataIters
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterCreateIter
+ * Signature: (J[Ljava/lang/String;[Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterCreateIter
+  (JNIEnv *, jobject, jlong, jobjectArray, jobjectArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterGetIterInfo
+ * Signature: (JLorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefString;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterGetIterInfo
+  (JNIEnv *, jobject, jlong, jobject, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterBeforeFirst
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterBeforeFirst
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterNext
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterNext
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterGetLabel
+ * Signature: (JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterGetLabel
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterGetData
+ * Signature: (JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterGetData
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterGetIndex
+ * Signature: (JLscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterGetIndex
+  (JNIEnv *, jobject, jlong, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDataIterGetPadNum
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDataIterGetPadNum
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorOutputs
+ * Signature: (JLscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorOutputs
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorForward
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorForward
+  (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorBackward
+ * Signature: (J[J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorBackward
+  (JNIEnv *, jobject, jlong, jlongArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorPrint
+ * Signature: (JLorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorPrint
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorSetMonitorCallback
+ * Signature: (JLorg/apache/mxnet/MXMonitorCallback;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorSetMonitorCallback
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListAtomicSymbolCreators
+ * Signature: (Lscala/collection/mutable/ListBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListAtomicSymbolCreators
+  (JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolGetAtomicSymbolInfo
+ * Signature: (JLorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolGetAtomicSymbolInfo
+  (JNIEnv *, jobject, jlong, jobject, jobject, jobject, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCreateAtomicSymbol
+ * Signature: (J[Ljava/lang/String;[Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateAtomicSymbol
+  (JNIEnv *, jobject, jlong, jobjectArray, jobjectArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolSetAttr
+ * Signature: (JLjava/lang/String;Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolSetAttr
+  (JNIEnv *, jobject, jlong, jstring, jstring);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListAttrShallow
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListAttrShallow
+  (JNIEnv *, jobject, jlong, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListAttr
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;Lscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListAttr
+  (JNIEnv *, jobject, jlong, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCompose
+ * Signature: (JLjava/lang/String;[Ljava/lang/String;[J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCompose
+  (JNIEnv *, jobject, jlong, jstring, jobjectArray, jlongArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCreateVariable
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateVariable
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolGetAttr
+ * Signature: (JLjava/lang/String;Lorg/apache/mxnet/Base/RefString;Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolGetAttr
+  (JNIEnv *, jobject, jlong, jstring, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListArguments
+ * Signature: (JLscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListArguments
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCopy
+ * Signature: (JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCopy
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListAuxiliaryStates
+ * Signature: (JLscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListAuxiliaryStates
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolListOutputs
+ * Signature: (JLscala/collection/mutable/ArrayBuffer;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolListOutputs
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCreateGroup
+ * Signature: ([JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateGroup
+  (JNIEnv *, jobject, jlongArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolPrint
+ * Signature: (JLorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolPrint
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolGetInternals
+ * Signature: (JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolGetInternals
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolInferType
+ * Signature: (J[Ljava/lang/String;[ILscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferType
+  (JNIEnv *, jobject, jlong, jobjectArray, jintArray, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolInferShape
+ * Signature: (JI[Ljava/lang/String;[I[ILscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape
+  (JNIEnv *, jobject, jlong, jint, jobjectArray, jintArray, jintArray, jobject, jobject, jobject, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolGetOutput
+ * Signature: (JILorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolGetOutput
+  (JNIEnv *, jobject, jlong, jint, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolSaveToJSON
+ * Signature: (JLorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolSaveToJSON
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCreateFromJSON
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateFromJSON
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorBindX
+ * Signature: (JIII[Ljava/lang/String;[I[II[J[J[I[JLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorBindX
+  (JNIEnv *, jobject, jlong, jint, jint, jint, jobjectArray, jintArray, jintArray, jint, jlongArray, jlongArray, jintArray, jlongArray, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxExecutorBindEX
+ * Signature: (JIII[Ljava/lang/String;[I[II[J[J[I[JJLorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorBindEX
+  (JNIEnv *, jobject, jlong, jint, jint, jint, jobjectArray, jintArray, jintArray, jint, jlongArray, jlongArray, jintArray, jlongArray, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolSaveToFile
+ * Signature: (JLjava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolSaveToFile
+  (JNIEnv *, jobject, jlong, jstring);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolCreateFromFile
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateFromFile
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSymbolFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRandomSeed
+ * Signature: (I)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRandomSeed
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxNotifyShutdown
+ * Signature: ()I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNotifyShutdown
+  (JNIEnv *, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOWriterCreate
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOWriterCreate
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOReaderCreate
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOReaderCreate
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOWriterFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOWriterFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOReaderFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOReaderFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOWriterWriteRecord
+ * Signature: (JLjava/lang/String;I)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOWriterWriteRecord
+  (JNIEnv *, jobject, jlong, jstring, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOReaderReadRecord
+ * Signature: (JLorg/apache/mxnet/Base/RefString;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOReaderReadRecord
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOWriterTell
+ * Signature: (JLorg/apache/mxnet/Base/RefInt;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOWriterTell
+  (JNIEnv *, jobject, jlong, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRecordIOReaderSeek
+ * Signature: (JI)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRecordIOReaderSeek
+  (JNIEnv *, jobject, jlong, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRtcCreate
+ * Signature: (Ljava/lang/String;[Ljava/lang/String;[Ljava/lang/String;[J[JLjava/lang/String;Lorg/apache/mxnet/Base/RefLong;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRtcCreate
+  (JNIEnv *, jobject, jstring, jobjectArray, jobjectArray, jlongArray, jlongArray, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRtcPush
+ * Signature: (J[J[JIIIIII)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRtcPush
+  (JNIEnv *, jobject, jlong, jlongArray, jlongArray, jint, jint, jint, jint, jint, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxRtcFree
+ * Signature: (J)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxRtcFree
+  (JNIEnv *, jobject, jlong);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxCustomOpRegister
+ * Signature: (Ljava/lang/String;Lorg/apache/mxnet/CustomOpProp;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxCustomOpRegister
+  (JNIEnv *, jobject, jstring, jobject);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSetProfilerConfig
+ * Signature: ([Ljava/lang/String;[Ljava/lang/String;)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSetProfilerConfig
+  (JNIEnv *, jobject, jobjectArray, jobjectArray);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxSetProfilerState
+ * Signature: (I)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSetProfilerState
+  (JNIEnv *, jobject, jint);
+
+/*
+ * Class:     org_apache_mxnet_LibInfo
+ * Method:    mxDumpProfile
+ * Signature: (I)I
+ */
+JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile
+  (JNIEnv *, jobject, jint);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/scala-package/packageTest/Makefile b/scala-package/packageTest/Makefile
index 6073ff8a722f..8c12c1d04189 100644
--- a/scala-package/packageTest/Makefile
+++ b/scala-package/packageTest/Makefile
@@ -43,6 +43,10 @@ else
 	endif
 endif
 
+ifeq ($(CI), 1)
+	MAVEN_ARGS := -B
+endif
+
 PROFILES := -Ptest
 ifeq ($(UNIT), 1)
   PROFILES := "$(PROFILES),unittest"
@@ -59,27 +63,27 @@ endif
 
 
 clean:
-	(mvn clean -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
+	(mvn $(MAVEN_ARGS) clean -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
 		-Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \
 		-Dmxnet.version=$(MXNET_VERSION) \
 		-Dscala.version=$(SCALA_VERSION))
 
 testinstall:
-	(mvn integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
+	(mvn $(MAVEN_ARGS) integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
     $(PROFILES) \
 		-Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \
 		-Dmxnet.version=$(MXNET_VERSION) \
 		-Dscala.version=$(SCALA_VERSION))
 
 testlocal:
-	(mvn integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
+	(mvn $(MAVEN_ARGS) integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
     $(PROFILES),fromLocal \
 		-Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \
 		-Dmxnet.version=$(MXNET_VERSION) \
 		-Dscala.version=$(SCALA_VERSION))
 
 testsnapshot:
-	(mvn integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
+	(mvn $(MAVEN_ARGS) integration-test -Dmxnet.profile=$(SCALA_PKG_PROFILE) \
     $(PROFILES),fromSnapshots \
 		-Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \
 		-Dmxnet.repo=$(MXNET_REPO) \
diff --git a/scala-package/packageTest/pom.xml b/scala-package/packageTest/pom.xml
index 9c5c11cf2779..f7d9e3b180bc 100644
--- a/scala-package/packageTest/pom.xml
+++ b/scala-package/packageTest/pom.xml
@@ -42,7 +42,7 @@
         <repositories>
           <repository>
             <id>local-snapshot</id>
-            <url>file://${basedir}/../local-snapshot</url>
+            <url>file://${basedir}/../deploy/target/repo</url>
             <snapshots>
               <enabled>true</enabled>
             </snapshots>
diff --git a/scala-package/pom.xml b/scala-package/pom.xml
index 6eb573bf3e23..6665e953dcd1 100644
--- a/scala-package/pom.xml
+++ b/scala-package/pom.xml
@@ -6,11 +6,12 @@
   <parent>
     <groupId>org.apache</groupId>
     <artifactId>apache</artifactId>
-     <version>19</version>
+    <version>19</version>
   </parent>
+
   <groupId>org.apache.mxnet</groupId>
-  <artifactId>mxnet-parent_2.11</artifactId>
-  <version>1.5.0-SNAPSHOT</version>
+  <artifactId>mxnet-parent</artifactId>
+  <version>INTERNAL</version>
   <name>MXNet Scala Package - Parent</name>
   <url>/~https://github.com/apache/incubator-mxnet/tree/master/scala-package</url>
   <description>
@@ -37,10 +38,11 @@
 
   <properties>
     <scala.version>2.11.8</scala.version>
-    <scala.binary.version>2.11</scala.binary.version>
-    <build.platform />
+    <build.platform/>
     <cxx>g++</cxx>
     <dollar>$</dollar>
+    <MXNET_DIR>${project.basedir}/..</MXNET_DIR>
+    <skipJavaTests>true</skipJavaTests>
   </properties>
 
   <packaging>pom</packaging>
@@ -48,46 +50,18 @@
     <module>init</module>
     <module>init-native</module>
     <module>macros</module>
-    <module>core</module>
     <module>native</module>
+    <module>core</module>
     <module>infer</module>
     <module>examples</module>
     <module>spark</module>
     <module>assembly</module>
+    <module>deploy</module>
   </modules>
 
   <profiles>
-    <profile>
-      <id>release</id>
-      <build>
-        <plugins>
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>build-helper-maven-plugin</artifactId>
-            <executions>
-              <execution>
-                <phase>generate-sources</phase>
-                <goals>
-                  <goal>add-source</goal>
-                </goals>
-                <configuration>
-                  <sources>
-                    <source>${project.build.directory}/genjavadoc</source>
-                  </sources>
-                </configuration>
-              </execution>
-            </executions>
-          </plugin>
-        </plugins>
-      </build>
-    </profile>
-
     <profile>
       <id>scala-2.11</id>
-      <properties>
-        <scala.version>2.11.8</scala.version>
-        <scala.binary.version>2.11</scala.binary.version>
-      </properties>
       <build>
         <plugins>
           <plugin>
@@ -117,31 +91,66 @@
     </profile>
 
     <profile>
-      <id>scala-2.12</id>
+      <id>osx-x86_64</id>
+      <activation>
+        <os>
+          <family>mac</family>
+        </os>
+      </activation>
+      <properties>
+        <platform>osx-x86_64</platform>
+        <libtype>jnilib</libtype>
+        <flavor>cpu</flavor>
+      </properties>
+    </profile>
+    <profile>
+      <id>linux-x86_64</id>
+      <activation>
+        <os>
+          <family>unix</family>
+          <name>Linux</name>
+        </os>
+      </activation>
       <properties>
-        <scala.version>2.12.4</scala.version>
-        <scala.binary.version>2.12</scala.binary.version>
+        <platform>linux-x86_64</platform>
+        <libtype>so</libtype>
       </properties>
+
       <build>
         <plugins>
           <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-enforcer-plugin</artifactId>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>exec-maven-plugin</artifactId>
+            <version>1.6.0</version>
             <executions>
               <execution>
-                <id>enforce-versions</id>
+                <id>init-build-flavor</id>
+                <phase>initialize</phase>
                 <goals>
-                  <goal>enforce</goal>
+                  <goal>exec</goal>
                 </goals>
                 <configuration>
-                  <rules>
-                    <bannedDependencies>
-                      <excludes combine.children="append">
-                        <exclude>*:*_2.11</exclude>
-                        <exclude>*:*_2.10</exclude>
-                      </excludes>
-                    </bannedDependencies>
-                  </rules>
+                  <executable>bash</executable> <!-- bash is required: the script below uses [[ ]] -->
+                  <commandlineArgs>-c 'mkdir -p "${project.build.directory}"; if [[ $(ldd "${MXNET_DIR}/lib/libmxnet.so" | grep -c libcuda.so) == "0" ]]; then echo flavor=cpu &gt; "${project.build.directory}/flavor.properties"; else echo flavor=gpu &gt; "${project.build.directory}/flavor.properties"; fi'</commandlineArgs>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+          <plugin>
+            <groupId>org.codehaus.mojo</groupId>
+            <artifactId>properties-maven-plugin</artifactId>
+            <version>1.0.0</version>
+            <executions>
+              <execution>
+                <id>read-properties</id>
+                <phase>initialize</phase>
+                <goals>
+                  <goal>read-project-properties</goal>
+                </goals>
+                <configuration>
+                  <files>
+                    <file>${project.build.directory}/flavor.properties</file>
+                  </files>
                 </configuration>
               </execution>
             </executions>
@@ -154,19 +163,25 @@
   <build>
     <plugins>
       <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-release-plugin</artifactId>
-        <configuration>
-          <localCheckout>true</localCheckout>
-          <pushChanges>false</pushChanges>
-        </configuration>
-      </plugin>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-deploy-plugin</artifactId>
-        <configuration>
-          <skip>true</skip>
-        </configuration>
+        <groupId>org.commonjava.maven.plugins</groupId>
+        <artifactId>directory-maven-plugin</artifactId>
+        <version>0.1</version>
+        <executions>
+          <execution>
+            <id>directories</id>
+            <goals>
+              <goal>directory-of</goal>
+            </goals>
+            <phase>initialize</phase>
+            <configuration>
+              <property>rootdir</property>
+              <project>
+                <groupId>org.apache.mxnet</groupId>
+                <artifactId>mxnet-parent</artifactId>
+              </project>
+            </configuration>
+          </execution>
+        </executions>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
@@ -209,14 +224,17 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-assembly-plugin</artifactId>
-        <version>2.5.5</version>
+        <version>3.1.0</version>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
-        <version>2.19</version>
+        <version>2.22.0</version>
         <configuration>
-          <skipTests>true</skipTests>
+          <skipTests>${skipJavaTests}</skipTests>
+          <argLine>
+            -Djava.library.path=${project.parent.basedir}/native/target
+          </argLine>
           <useSystemClassLoader>false</useSystemClassLoader>
         </configuration>
       </plugin>
@@ -231,7 +249,6 @@
         <artifactId>scalatest-maven-plugin</artifactId>
         <version>1.0</version>
         <configuration>
-          <skipTests>${skipTests}</skipTests>
           <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
           <junitxml>.</junitxml>
           <stdout>F</stdout>
@@ -256,7 +273,7 @@
       <plugin>
         <groupId>org.scalastyle</groupId>
         <artifactId>scalastyle-maven-plugin</artifactId>
-        <version>0.8.0</version>
+        <version>1.0.0</version>
         <configuration>
           <verbose>false</verbose>
           <failOnViolation>true</failOnViolation>
@@ -264,7 +281,7 @@
           <failOnWarning>false</failOnWarning>
           <sourceDirectory>${basedir}/src/main/scala</sourceDirectory>
           <testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory>
-          <configLocation>scalastyle-config.xml</configLocation>
+          <configLocation>${rootdir}/scalastyle-config.xml</configLocation>
           <outputFile>${basedir}/target/scalastyle-output.xml</outputFile>
           <outputEncoding>UTF-8</outputEncoding>
         </configuration>
@@ -315,19 +332,17 @@
           </execution>
         </executions>
       </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-deploy-plugin</artifactId>
+        <configuration>
+          <skip>true</skip>
+        </configuration>
+      </plugin>
     </plugins>
   </build>
   <dependencies>
-    <dependency>
-      <groupId>org.scala-lang</groupId>
-      <artifactId>scala-library</artifactId>
-      <version>${scala.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.scala-lang</groupId>
-      <artifactId>scala-reflect</artifactId>
-      <version>${scala.version}</version>
-    </dependency>
     <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
@@ -352,7 +367,7 @@
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
-      <artifactId>scalatest_${scala.binary.version}</artifactId>
+      <artifactId>scalatest_2.11</artifactId>
       <version>3.0.4</version>
       <scope>test</scope>
     </dependency>
@@ -363,13 +378,25 @@
     </dependency>
     <dependency>
       <groupId>org.scalacheck</groupId>
-      <artifactId>scalacheck_${scala.binary.version}</artifactId>
+      <artifactId>scalacheck_2.11</artifactId>
       <version>1.13.5</version>
       <scope>test</scope>
     </dependency>
+
+    <!-- The following libraries are required to run javah; they should be excluded from the .jar -->
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-reflect</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.scala-lang.modules</groupId>
-      <artifactId>scala-parser-combinators_${scala.binary.version}</artifactId>
+      <artifactId>scala-parser-combinators_2.11</artifactId>
       <version>1.0.4</version>
     </dependency>
     <dependency>
diff --git a/scala-package/spark/README.md b/scala-package/spark/README.md
index 06106648c059..503c279038a5 100644
--- a/scala-package/spark/README.md
+++ b/scala-package/spark/README.md
@@ -16,7 +16,8 @@ Checkout the [Installation Guide](http://mxnet.io/get_started/setup.html) contai
 Compile the Scala Package by
 
 ```bash
-make scalapkg
+cd scala-package
+mvn package
 ```
 
 This will automatically build the `spark` submodule. Now you can submit Spark job with these built jars.
diff --git a/scala-package/spark/bin/run-mnist-example.sh b/scala-package/spark/bin/run-mnist-example.sh
index 4ebd6c61d56b..4f747f2c91a1 100755
--- a/scala-package/spark/bin/run-mnist-example.sh
+++ b/scala-package/spark/bin/run-mnist-example.sh
@@ -27,9 +27,9 @@ OS=""
 
 if [ "$(uname)" == "Darwin" ]; then
 	# Do something under Mac OS X platform
-  OS='osx-x86_64-cpu'
+  OS='osx-x86_64'
 elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then
-  OS='linux-x86_64-cpu'
+  OS='linux-x86_64'
 fi
 
 LIB_DIR=${SPARK_MODULE_DIR}/target/classes/lib
diff --git a/scala-package/spark/pom.xml b/scala-package/spark/pom.xml
index 2db3bee8c78d..f2737e9334f4 100644
--- a/scala-package/spark/pom.xml
+++ b/scala-package/spark/pom.xml
@@ -5,47 +5,28 @@
   <modelVersion>4.0.0</modelVersion>
   <parent>
     <groupId>org.apache.mxnet</groupId>
-    <artifactId>mxnet-parent_2.11</artifactId>
-    <version>1.5.0-SNAPSHOT</version>
+    <artifactId>mxnet-parent</artifactId>
+    <version>INTERNAL</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
-  <artifactId>mxnet-spark_2.11</artifactId>
+  <artifactId>mxnet-spark</artifactId>
   <name>MXNet Scala Package - Spark ML</name>
 
   <properties>
     <spark.version>1.6.3</spark.version>
   </properties>
-  <profiles>
-    <profile>
-      <id>osx-x86_64-cpu</id>
-      <properties>
-        <platform>osx-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-cpu</id>
-      <properties>
-        <platform>linux-x86_64-cpu</platform>
-      </properties>
-    </profile>
-    <profile>
-      <id>linux-x86_64-gpu</id>
-      <properties>
-        <platform>linux-x86_64-gpu</platform>
-      </properties>
-    </profile>
-  </profiles>
+
   <dependencies>
     <dependency>
       <groupId>org.apache.mxnet</groupId>
-      <artifactId>mxnet-core_${scala.binary.version}</artifactId>
-      <version>1.5.0-SNAPSHOT</version>
+      <artifactId>mxnet-core</artifactId>
+      <version>INTERNAL</version>
       <scope>provided</scope>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
-      <artifactId>spark-mllib_${scala.binary.version}</artifactId>
+      <artifactId>spark-mllib_2.11</artifactId>
       <version>${spark.version}</version>
     </dependency>
     <dependency>
diff --git a/snapcraft.yaml b/snapcraft.yaml
index d8d0e301e6b1..9791cd86fc0e 100644
--- a/snapcraft.yaml
+++ b/snapcraft.yaml
@@ -1,3 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
 name: mxnet
 version: '1.5.0'
 summary: MXNet is a deep learning framework designed for efficiency and flexibility.
diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h
index 079b587e9965..ecb05bc78ca4 100644
--- a/src/c_api/c_api_common.h
+++ b/src/c_api/c_api_common.h
@@ -29,37 +29,29 @@
 #include <dmlc/logging.h>
 #include <dmlc/thread_local.h>
 #include <mxnet/c_api.h>
+#include <mxnet/c_api_error.h>
 #include <mxnet/base.h>
 #include <nnvm/graph.h>
 #include <vector>
 #include <string>
 
-/*! \brief  macro to guard beginning and end section of all functions */
-#define API_BEGIN() try { on_enter_api(__FUNCTION__);
-/*! \brief every function starts with API_BEGIN();
-     and finishes with API_END() or API_END_HANDLE_ERROR */
-#define API_END() } catch(dmlc::Error &_except_) { on_exit_api(); return MXAPIHandleException(_except_); } on_exit_api(); return 0;  // NOLINT(*)
 /*!
- * \brief every function starts with API_BEGIN();
- *   and finishes with API_END() or API_END_HANDLE_ERROR
- *   The finally clause contains procedure to cleanup states when an error happens.
+ * \brief Macros guarding the beginning and end of every C API function.
+ * Every function starts with API_BEGIN()
+ * and finishes with API_END() or API_END_HANDLE_ERROR(Finalize).
+ * The Finalize argument contains the procedure to clean up state when an error happens.
  */
-#define API_END_HANDLE_ERROR(Finalize) } catch(dmlc::Error &_except_) { Finalize; on_exit_api(); return MXAPIHandleException(_except_); } on_exit_api(); return 0; // NOLINT(*)
+#ifndef API_BEGIN
+#define API_BEGIN MX_API_BEGIN
+#endif
 
-/*!
- * \brief Set the last error message needed by C API
- * \param msg The error message to set.
- */
-void MXAPISetLastError(const char* msg);
-/*!
- * \brief handle exception throwed out
- * \param e the exception
- * \return the return value of API after exception is handled
- */
-inline int MXAPIHandleException(const dmlc::Error &e) {
-  MXAPISetLastError(e.what());
-  return -1;
-}
+#ifndef API_END
+#define API_END MX_API_END
+#endif
+
+#ifndef API_END_HANDLE_ERROR
+#define API_END_HANDLE_ERROR MX_API_END_HANDLE_ERROR
+#endif
 
 using namespace mxnet;
 
@@ -137,10 +129,6 @@ inline void CopyAttr(const nnvm::IndexedGraph& idx,
 
 // stores keys that will be converted to __key__
 extern const std::vector<std::string> kHiddenKeys;
-
-extern void on_enter_api(const char *function);
-extern void on_exit_api();
-
 }  // namespace mxnet
 
 #endif  // MXNET_C_API_C_API_COMMON_H_
diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc
index 1f936b164326..e2e53c7261fa 100644
--- a/src/c_api/c_api_executor.cc
+++ b/src/c_api/c_api_executor.cc
@@ -148,8 +148,6 @@ int MXExecutorBindEX(SymbolHandle symbol_handle,
                      NDArrayHandle *aux_states,
                      ExecutorHandle shared_exec,
                      ExecutorHandle *out) {
-  Executor* exec = nullptr;
-
   API_BEGIN();
   nnvm::Symbol *symb = static_cast<nnvm::Symbol*>(symbol_handle);
   Context ctx = Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id);
@@ -181,7 +179,7 @@ int MXExecutorBindEX(SymbolHandle symbol_handle,
   *out = Executor::Bind(*symb, ctx, ctx_map, in_args_vec,
                         arg_grad_vec, grad_req_vec, aux_states_vec,
                         reinterpret_cast<Executor*>(shared_exec));
-  API_END_HANDLE_ERROR(delete exec);
+  API_END();
 }
 
 /*!
@@ -558,8 +556,11 @@ int MXExecutorReshape(int partial_shaping,
                       NDArrayHandle** aux_states,
                       ExecutorHandle shared_exec,
                       ExecutorHandle *out) {
+  Executor* new_exec = nullptr;
+
   MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get();
   API_BEGIN();
+  *out = nullptr;  // ensure we can know whether to free executor on early abort
   // create shape map for in_args and aux_states
   std::unordered_map<std::string, TShape> kwargs(num_provided_arg_shapes);
   for (mx_uint i = 0; i < num_provided_arg_shapes; ++i) {
@@ -581,8 +582,9 @@ int MXExecutorReshape(int partial_shaping,
   std::vector<NDArray> aux_state_vec;
 
   Executor* exec = static_cast<Executor*>(shared_exec);
-  *out = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs,
+  new_exec = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs,
                        &in_arg_vec, &arg_grad_vec, &aux_state_vec);
+  *out = new_exec;
 
   ret->ret_handles.clear();
   ret->ret_handles.reserve(in_arg_vec.size()+arg_grad_vec.size()+aux_state_vec.size());
@@ -623,7 +625,7 @@ int MXExecutorReshape(int partial_shaping,
     *aux_states = &(ret->ret_handles[nd_idx]);
     nd_idx = ret->ret_handles.size();
   }
-  API_END_HANDLE_ERROR(delete out);
+  API_END_HANDLE_ERROR(delete new_exec);
 }
 
 int MXExecutorGetOptimizedSymbol(ExecutorHandle handle,
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 047edde88a53..0dd9d2db3722 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
 class DeviceStore {
  public:
   /*! \brief default constructor- only optionally restores previous device */
-  explicit DeviceStore(bool restore = true) : restore_(restore) {
+  explicit DeviceStore(int requested_device = -1, bool restore = true) :
+    restore_device_(-1),
+    current_device_(requested_device),
+    restore_(restore) {
     if (restore_)
       CUDA_CALL(cudaGetDevice(&restore_device_));
+    if (requested_device != restore_device_) {
+      SetDevice(requested_device);
+    }
   }
 
   ~DeviceStore() {
-    if (restore_)
+    if (restore_ &&
+        current_device_ != restore_device_ &&
+        current_device_ != -1 &&
+        restore_device_ != -1)
       CUDA_CALL(cudaSetDevice(restore_device_));
   }
 
   void SetDevice(int device) {
-    CUDA_CALL(cudaSetDevice(device));
+    if (device != -1) {
+      CUDA_CALL(cudaSetDevice(device));
+      current_device_ = device;
+    }
   }
 
  private:
   int restore_device_;
+  int current_device_;
   bool restore_;
 };
 
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index d4ac042ff401..516e04bf5e82 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -65,9 +65,6 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
@@ -75,11 +72,11 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         auto&& counter = gpu_cnt_.at(ctx.dev_id);
         if (counter == -1) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
             i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
           }
@@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
       break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream<gpu>(false, false, ctx.dev_id);
         }
       }
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 1abb82fd6a67..c6eb99508e09 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -31,6 +31,9 @@
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "./stream_manager.h"
+#if MXNET_USE_CUDA
+#include "../common/cuda_utils.h"
+#endif
 
 namespace mxnet {
 namespace engine {
@@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
    * \param opr_block The operator block.
    */
   void DoExecute(OprBlock* opr_block) {
+#if MXNET_USE_CUDA
+    mxnet::common::cuda::DeviceStore device_store(-1, false);
+#endif
     assert(opr_block->wait.load() == 0);
     if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
       #if MXNET_USE_CUDA
-      CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
+      device_store.SetDevice(opr_block->ctx.dev_id);
       #else   // MXNET_USE_CUDA
       LOG(FATAL) << "Please compile with CUDA enabled";
       #endif  // MXNET_USE_CUDA
diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc
index b5fc8d15f7ac..d26704c35cf5 100644
--- a/src/executor/tensorrt_pass.cc
+++ b/src/executor/tensorrt_pass.cc
@@ -324,10 +324,10 @@ nnvm::NodePtr ConvertNnvmGraphToOnnx(const nnvm::Graph &g,
                                      std::unordered_map<std::string, NDArray>* const params_map) {
   auto p = nnvm::Node::Create();
   p->attrs.op = nnvm::Op::Get("_trt_op");
-  op::TRTParam trt_param = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(g, params_map);
-  p->attrs.dict["serialized_output_map"] = trt_param.serialized_output_map;
-  p->attrs.dict["serialized_input_map"]  = trt_param.serialized_input_map;
-  p->attrs.dict["serialized_onnx_graph"] = trt_param.serialized_onnx_graph;
+  op::ONNXParam onnx_param = op::nnvm_to_onnx::ConvertNnvmGraphToOnnx(g, params_map);
+  p->attrs.dict["serialized_output_map"] = onnx_param.serialized_output_map;
+  p->attrs.dict["serialized_input_map"]  = onnx_param.serialized_input_map;
+  p->attrs.dict["serialized_onnx_graph"] = onnx_param.serialized_onnx_graph;
   if (p->op()->attr_parser != nullptr) {
     p->op()->attr_parser(&(p->attrs));
   }
diff --git a/src/executor/trt_graph_executor.cc b/src/executor/trt_graph_executor.cc
index 65dbb29792e0..ec35fee98a96 100644
--- a/src/executor/trt_graph_executor.cc
+++ b/src/executor/trt_graph_executor.cc
@@ -407,14 +407,7 @@ nnvm::Symbol TrtGraphExecutor::GetOptimizedSymbol() {
   Symbol ret;
   ret.outputs = std::vector<nnvm::NodeEntry>(graph_.outputs.begin(),
                                              graph_.outputs.begin() + num_forward_outputs_);
-  ret = ret.Copy();
-  static const Op* trt_op = Op::Get("_trt_op");
-  DFSVisit(ret.outputs, [](const nnvm::NodePtr n) {
-    if (n->op() == trt_op) {
-      n->attrs.dict.clear();
-    }
-  });
-  return ret;
+  return ret.Copy();
 }
 
 Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol,
diff --git a/src/initialize.cc b/src/initialize.cc
index ddda3f18a3ae..de7edd1b1455 100644
--- a/src/initialize.cc
+++ b/src/initialize.cc
@@ -57,11 +57,13 @@ class LibraryInitializer {
         Engine::Get()->Start();
       },
       []() {
-        // Make children single threaded since they are typically workers
-        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", 1);
+        // Conservative thread management for multiprocess workers
+        const size_t mp_worker_threads = dmlc::GetEnv("MXNET_MP_WORKER_NTHREADS", 1);
+        dmlc::SetEnv("MXNET_CPU_WORKER_NTHREADS", mp_worker_threads);
         dmlc::SetEnv("OMP_NUM_THREADS", 1);
 #if MXNET_USE_OPENCV && !__APPLE__
-        cv::setNumThreads(0);  // disable opencv threading
+        const size_t mp_cv_num_threads = dmlc::GetEnv("MXNET_MP_OPENCV_NUM_THREADS", 0);
+        cv::setNumThreads(mp_cv_num_threads);  // 0 (default) disables opencv threading
 #endif  // MXNET_USE_OPENCV
         engine::OpenMP::Get()->set_enabled(false);
         Engine::Get()->Start();
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 7090aaf46d8f..08f6155cb5b4 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -724,10 +724,9 @@ class CommDevice : public Comm {
     int enabled = 0;
     std::vector<int> p2p(n*n);
 
-    // Restores active device to what it was before EnableP2P
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-     device_store.SetDevice(gpus[i]);
+      // Selects gpus[i]; the previous device is restored at the end of each iteration
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index e3b2ad7f57d3..b62228cd2885 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
     std::vector<int> p2p(n*n);
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 081d4e759323..251bfb3f0e1f 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -330,11 +330,10 @@ struct NDArrayDLManager {
 };
 
 DLManagedTensor* NDArray::ToDLPack() const {
+  CHECK(!is_none()) << "NDArray is not initialized";
   NDArrayDLManager* dlmanager(new NDArrayDLManager);
   dlmanager->handle = *this;
-  if (!is_none()) {
-    dlmanager->tensor.dl_tensor = data().dltensor();
-  }
+  dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
   dlmanager->tensor.manager_ctx = dlmanager;
   dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager){
     delete static_cast<NDArrayDLManager*>(dlmanager->manager_ctx);
@@ -454,17 +453,10 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
 
   mkldnn::memory::dims dims;
   // These are shapes supprted by MKLDNN.
-  if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4
-      || shape.ndim() == 5) {
+  if (shape.ndim() >= 1 && shape.ndim() <= 5) {
     dims.resize(shape.ndim());
     for (size_t i = 0; i < dims.size(); i++)
       dims[i] = shape[i];
-  } else if (shape.ndim() == 3) {
-    // If there are 3 dimensions, we'll force it to 4 dimensions.
-    dims.resize(shape.ndim() + 1);
-    dims[0] = 1;
-    for (size_t i = 0; i < shape.ndim(); i++)
-      dims[i + 1] = shape[i];
   } else {
     LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions";
   }
@@ -472,6 +464,7 @@ void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) {
   switch (dims.size()) {
     case 1: layout = mkldnn::memory::format::x; break;
     case 2: layout = mkldnn::memory::format::nc; break;
+    case 3: layout = mkldnn::memory::format::ncw; break;
     case 4: layout = mkldnn::memory::format::nchw; break;
     // This isn't the right layout when the data has 5 dimensions in MXNet.
     // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have
diff --git a/src/operator/c_lapack_api.cc b/src/operator/c_lapack_api.cc
new file mode 100644
index 000000000000..c6293bf8f684
--- /dev/null
+++ b/src/operator/c_lapack_api.cc
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "c_lapack_api.h"
+
+#if (MSHADOW_USE_MKL && MXNET_USE_LAPACK)
+#elif MXNET_USE_LAPACK
+#else
+  // use pragma message instead of warning
+  #pragma message("Warning: lapack usage not enabled, linalg-operators will not be available." \
+     " Ensure that lapack library is installed and build with USE_LAPACK=1 to get lapack" \
+     " functionalities.")
+
+  // Define compilable stubs.
+  #define MXNET_LAPACK_CWRAPPER1(func, dtype) \
+  int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda) { \
+    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
+    return 1; \
+  }
+
+  #define MXNET_LAPACK_CWRAPPER2(func, dtype) \
+  int MXNET_LAPACK_##func(int matrix_layout, int m, int n, dtype* a, \
+                                 int lda, dtype* tau, dtype* work, int lwork) { \
+    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
+    return 1; \
+  }
+
+  #define MXNET_LAPACK_CWRAPPER3(func, dtype) \
+  int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype *a, \
+                                 int lda, dtype *w, dtype *work, int lwork, \
+                                 int *iwork, int liwork) { \
+    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
+    return 1; \
+  }
+
+  #define MXNET_LAPACK_UNAVAILABLE(func) \
+  int mxnet_lapack_##func(...) { \
+    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
+    return 1; \
+  }
+  MXNET_LAPACK_CWRAPPER1(spotrf, float)
+  MXNET_LAPACK_CWRAPPER1(dpotrf, double)
+  MXNET_LAPACK_CWRAPPER1(spotri, float)
+  MXNET_LAPACK_CWRAPPER1(dpotri, double)
+
+  MXNET_LAPACK_UNAVAILABLE(sposv)
+  MXNET_LAPACK_UNAVAILABLE(dposv)
+
+  MXNET_LAPACK_CWRAPPER2(sgelqf, float)
+  MXNET_LAPACK_CWRAPPER2(dgelqf, double)
+  MXNET_LAPACK_CWRAPPER2(sorglq, float)
+  MXNET_LAPACK_CWRAPPER2(dorglq, double)
+
+  MXNET_LAPACK_CWRAPPER3(ssyevd, float)
+  MXNET_LAPACK_CWRAPPER3(dsyevd, double)
+#endif  // (MSHADOW_USE_MKL && MXNET_USE_LAPACK)
diff --git a/src/operator/c_lapack_api.h b/src/operator/c_lapack_api.h
index 46c8b963f429..cd69775547b4 100644
--- a/src/operator/c_lapack_api.h
+++ b/src/operator/c_lapack_api.h
@@ -324,42 +324,26 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
 
 #else
 
-  // use pragma message instead of warning
-  #pragma message("Warning: lapack usage not enabled, linalg-operators will not be available." \
-     " Ensure that lapack library is installed and build with USE_LAPACK=1 to get lapack" \
-     " functionalities.")
+
 
   #define MXNET_LAPACK_ROW_MAJOR 101
   #define MXNET_LAPACK_COL_MAJOR 102
 
   // Define compilable stubs.
   #define MXNET_LAPACK_CWRAPPER1(func, dtype) \
-  inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda) { \
-    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
-    return 1; \
-  }
+  int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype* a, int lda);
 
   #define MXNET_LAPACK_CWRAPPER2(func, dtype) \
-  inline int MXNET_LAPACK_##func(int matrix_layout, int m, int n, dtype* a, \
-                                 int lda, dtype* tau, dtype* work, int lwork) { \
-    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
-    return 1; \
-  }
+  int MXNET_LAPACK_##func(int matrix_layout, int m, int n, dtype* a, \
+                                 int lda, dtype* tau, dtype* work, int lwork);
 
   #define MXNET_LAPACK_CWRAPPER3(func, dtype) \
-  inline int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype *a, \
+  int MXNET_LAPACK_##func(int matrix_layout, char uplo, int n, dtype *a, \
                                  int lda, dtype *w, dtype *work, int lwork, \
-                                 int *iwork, int liwork) { \
-    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
-    return 1; \
-  }
+                                 int *iwork, int liwork);
 
   #define MXNET_LAPACK_UNAVAILABLE(func) \
-  inline int mxnet_lapack_##func(...) { \
-    LOG(FATAL) << "MXNet build without lapack. Function " << #func << " is not available."; \
-    return 1; \
-  }
-
+  int mxnet_lapack_##func(...);
   MXNET_LAPACK_CWRAPPER1(spotrf, float)
   MXNET_LAPACK_CWRAPPER1(dpotrf, double)
   MXNET_LAPACK_CWRAPPER1(spotri, float)
@@ -375,7 +359,10 @@ inline void flip(int m, int n, DType *b, int ldb, DType *a, int lda) {
 
   MXNET_LAPACK_CWRAPPER3(ssyevd, float)
   MXNET_LAPACK_CWRAPPER3(dsyevd, double)
-
+  #undef MXNET_LAPACK_CWRAPPER1
+  #undef MXNET_LAPACK_CWRAPPER2
+  #undef MXNET_LAPACK_CWRAPPER3
+  #undef MXNET_LAPACK_UNAVAILABLE
 #endif
 
 template <typename DType>
diff --git a/src/operator/contrib/adamw-inl.h b/src/operator/contrib/adamw-inl.h
new file mode 100644
index 000000000000..3d76b33ae765
--- /dev/null
+++ b/src/operator/contrib/adamw-inl.h
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file adamw-inl.h
+ * \brief Optimizer operators
+ * \author Haibin Lin
+ */
+#ifndef MXNET_OPERATOR_CONTRIB_ADAMW_INL_H_
+#define MXNET_OPERATOR_CONTRIB_ADAMW_INL_H_
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <mxnet/operator_util.h>
+#include <mxnet/op_attr_types.h>
+#include <mshadow/base.h>
+#include <nnvm/op.h>
+#include <nnvm/op_attr_types.h>
+#include <vector>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+#include "../elemwise_op_common.h"
+#include "../mxnet_op.h"
+
+namespace mxnet {
+namespace op {
+
+struct AdamWParam : public dmlc::Parameter<AdamWParam> {
+  float lr;
+  float beta1;
+  float beta2;
+  float epsilon;
+  float wd;
+  float eta;
+  float rescale_grad;
+  float clip_gradient;
+  DMLC_DECLARE_PARAMETER(AdamWParam) {
+    DMLC_DECLARE_FIELD(lr)
+    .describe("Learning rate");
+    DMLC_DECLARE_FIELD(beta1)
+    .set_default(0.9f)
+    .describe("The decay rate for the 1st moment estimates.");
+    DMLC_DECLARE_FIELD(beta2)
+    .set_default(0.999f)
+    .describe("The decay rate for the 2nd moment estimates.");
+    DMLC_DECLARE_FIELD(epsilon)
+    .set_default(1e-8f)
+    .describe("A small constant for numerical stability.");
+    DMLC_DECLARE_FIELD(wd)
+    .set_default(0.0f)
+    .describe("Weight decay augments the objective function with a "
+              "regularization term that penalizes large weights. "
+              "The penalty scales with the square of the magnitude of each weight.");
+    DMLC_DECLARE_FIELD(eta)
+    .describe("Learning rate schedule multiplier");
+    DMLC_DECLARE_FIELD(rescale_grad)
+    .set_default(1.0f)
+    .describe("Rescale gradient to grad = rescale_grad*grad.");
+    DMLC_DECLARE_FIELD(clip_gradient)
+    .set_default(-1.0f)
+    .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
+              "If clip_gradient <= 0, gradient clipping is turned off. "
+              "grad = max(min(grad, clip_gradient), -clip_gradient).");
+  }
+};
+
+/*
+ * \brief adam_w update.
+ */
+template<typename xpu>
+inline void AdamWUpdate(const nnvm::NodeAttrs& attrs,
+                        const OpContext &ctx,
+                        const std::vector<TBlob> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mshadow_op;
+  const AdamWParam& param = nnvm::get<AdamWParam>(attrs.parsed);
+  Stream<xpu>* s = ctx.get_stream<xpu>();
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+
+    grad = scalar<DType>(param.rescale_grad) * grad;
+    if (param.clip_gradient >= 0.0f) {
+      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
+          F<clip>(grad, DType(param.clip_gradient));
+      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2)*F<square>(
+          F<clip>(grad, DType(param.clip_gradient)));
+    } else {
+      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
+      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
+    }
+    Assign(out, req[0],
+           weight -
+           scalar<DType>(param.eta) * (scalar<DType>(param.lr) *
+           mean / (F<square_root>(var) + scalar<DType>(param.epsilon)) +
+           (scalar<DType>(param.wd) * weight)));
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_ADAMW_INL_H_
diff --git a/src/operator/contrib/adamw.cc b/src/operator/contrib/adamw.cc
new file mode 100644
index 000000000000..94623fe08a9e
--- /dev/null
+++ b/src/operator/contrib/adamw.cc
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2016 by Contributors
+ * \file adamw.cc
+ * \brief Optimizer operators
+ * \author Haibin Lin
+ */
+#include "./adamw-inl.h"
+
+namespace mxnet {
+namespace op {
+
+DMLC_REGISTER_PARAMETER(AdamWParam);
+
+NNVM_REGISTER_OP(_contrib_adamw_update)
+.describe(R"code(Update function for AdamW optimizer. AdamW is seen as a modification of
+Adam by decoupling the weight decay from the optimization steps taken w.r.t. the loss function.
+
+Adam update consists of the following steps, where g represents gradient and m, v
+are 1st and 2nd order moment estimates (mean and variance).
+
+.. math::
+
+ g_t = \nabla J(W_{t-1})\\
+ m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
+ v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
+ W_t = W_{t-1} - \eta_t (\alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon } + wd W_{t-1})
+
+It updates the weights using::
+
+ m = beta1*m + (1-beta1)*grad
+ v = beta2*v + (1-beta2)*(grad**2)
+ w -= eta * (learning_rate * m / (sqrt(v) + epsilon) + w * wd)
+
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<AdamWParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    return std::vector<uint32_t>{2, 3};
+  })
+.set_attr<FCompute>("FCompute<cpu>", AdamWUpdate<cpu>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_arguments(AdamWParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu
new file mode 100644
index 000000000000..b7452f861e2d
--- /dev/null
+++ b/src/operator/contrib/adamw.cu
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file adamw.cu
+ * \brief Optimizer operators
+ * \author Haibin Lin
+ */
+#include "./adamw-inl.h"
+
+namespace mxnet {
+namespace op {
+
+NNVM_REGISTER_OP(_contrib_adamw_update)
+.set_attr<FCompute>("FCompute<gpu>", AdamWUpdate<gpu>);
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/bounding_box-inl.h b/src/operator/contrib/bounding_box-inl.h
index 8e963461ec06..031dd952d386 100644
--- a/src/operator/contrib/bounding_box-inl.h
+++ b/src/operator/contrib/bounding_box-inl.h
@@ -785,7 +785,7 @@ void BipartiteMatchingForward(const nnvm::NodeAttrs& attrs,
      .get_with_shape<xpu, 2, DType>(Shape2(batch_size, col), s);
     Shape<1> sort_index_shape = Shape1(dshape.Size());
     index_t workspace_size = sort_index_shape.Size();
-    workspace_size += ((sort_index_shape.Size() * sizeof(int32_t) - 1) / sizeof(DType)) * 2;
+    workspace_size += (sort_index_shape.Size() * 2 * sizeof(int32_t) - 1) / sizeof(DType) + 1;
     Tensor<xpu, 1, DType> workspace = ctx.requested[0]
       .get_space_typed<xpu, 1, DType>(Shape1(workspace_size), s);
     Tensor<xpu, 1, DType> scores_copy(workspace.dptr_,
diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc
index 6d586755c957..a03cbef0b5ca 100644
--- a/src/operator/contrib/dgl_graph.cc
+++ b/src/operator/contrib/dgl_graph.cc
@@ -413,21 +413,6 @@ static bool CSRNeighborNonUniformSampleType(const nnvm::NodeAttrs& attrs,
   return success;
 }
 
-/*
- * Get src vertex and edge id for a destination vertex
- */
-static void GetSrcList(const dgl_id_t* val_list,
-                       const dgl_id_t* col_list,
-                       const dgl_id_t* indptr,
-                       const dgl_id_t dst_id,
-                       std::vector<dgl_id_t>* src_list,
-                       std::vector<dgl_id_t>* edge_list) {
-  for (dgl_id_t i = *(indptr+dst_id); i < *(indptr+dst_id+1); ++i) {
-    src_list->push_back(col_list[i]);
-    edge_list->push_back(val_list[i]);
-  }
-}
-
 static void RandomSample(size_t set_size,
                          size_t num,
                          std::vector<size_t>* out,
@@ -464,34 +449,34 @@ static void NegateSet(const std::vector<size_t> &idxs,
 /*
  * Uniform sample
  */
-static void GetUniformSample(const std::vector<dgl_id_t>& ver_list,
-                             const std::vector<dgl_id_t>& edge_list,
+static void GetUniformSample(const dgl_id_t* val_list,
+                             const dgl_id_t* col_list,
+                             const size_t ver_len,
                              const size_t max_num_neighbor,
                              std::vector<dgl_id_t>* out_ver,
                              std::vector<dgl_id_t>* out_edge,
                              unsigned int* seed) {
-  CHECK_EQ(ver_list.size(), edge_list.size());
   // Copy ver_list to output
-  if (ver_list.size() <= max_num_neighbor) {
-    for (size_t i = 0; i < ver_list.size(); ++i) {
-      out_ver->push_back(ver_list[i]);
-      out_edge->push_back(edge_list[i]);
+  if (ver_len <= max_num_neighbor) {
+    for (size_t i = 0; i < ver_len; ++i) {
+      out_ver->push_back(col_list[i]);
+      out_edge->push_back(val_list[i]);
     }
     return;
   }
   // If we just sample a small number of elements from a large neighbor list.
   std::vector<size_t> sorted_idxs;
-  if (ver_list.size() > max_num_neighbor * 2) {
+  if (ver_len > max_num_neighbor * 2) {
     sorted_idxs.reserve(max_num_neighbor);
-    RandomSample(ver_list.size(), max_num_neighbor, &sorted_idxs, seed);
+    RandomSample(ver_len, max_num_neighbor, &sorted_idxs, seed);
     std::sort(sorted_idxs.begin(), sorted_idxs.end());
   } else {
     std::vector<size_t> negate;
-    negate.reserve(ver_list.size() - max_num_neighbor);
-    RandomSample(ver_list.size(), ver_list.size() - max_num_neighbor,
+    negate.reserve(ver_len - max_num_neighbor);
+    RandomSample(ver_len, ver_len - max_num_neighbor,
                  &negate, seed);
     std::sort(negate.begin(), negate.end());
-    NegateSet(negate, ver_list.size(), &sorted_idxs);
+    NegateSet(negate, ver_len, &sorted_idxs);
   }
   // verify the result.
   CHECK_EQ(sorted_idxs.size(), max_num_neighbor);
@@ -499,8 +484,8 @@ static void GetUniformSample(const std::vector<dgl_id_t>& ver_list,
     CHECK_GT(sorted_idxs[i], sorted_idxs[i - 1]);
   }
   for (auto idx : sorted_idxs) {
-    out_ver->push_back(ver_list[idx]);
-    out_edge->push_back(edge_list[idx]);
+    out_ver->push_back(col_list[idx]);
+    out_edge->push_back(val_list[idx]);
   }
 }
 
@@ -508,26 +493,26 @@ static void GetUniformSample(const std::vector<dgl_id_t>& ver_list,
  * Non-uniform sample via ArrayHeap
  */
 static void GetNonUniformSample(const float* probability,
-                                const std::vector<dgl_id_t>& ver_list,
-                                const std::vector<dgl_id_t>& edge_list,
+                                const dgl_id_t* val_list,
+                                const dgl_id_t* col_list,
+                                const size_t ver_len,
                                 const size_t max_num_neighbor,
                                 std::vector<dgl_id_t>* out_ver,
                                 std::vector<dgl_id_t>* out_edge,
                                 unsigned int* seed) {
-  CHECK_EQ(ver_list.size(), edge_list.size());
   // Copy ver_list to output
-  if (ver_list.size() <= max_num_neighbor) {
-    for (size_t i = 0; i < ver_list.size(); ++i) {
-      out_ver->push_back(ver_list[i]);
-      out_edge->push_back(edge_list[i]);
+  if (ver_len <= max_num_neighbor) {
+    for (size_t i = 0; i < ver_len; ++i) {
+      out_ver->push_back(col_list[i]);
+      out_edge->push_back(val_list[i]);
     }
     return;
   }
   // Make sample
   std::vector<size_t> sp_index(max_num_neighbor);
-  std::vector<float> sp_prob(ver_list.size());
-  for (size_t i = 0; i < ver_list.size(); ++i) {
-    sp_prob[i] = probability[ver_list[i]];
+  std::vector<float> sp_prob(ver_len);
+  for (size_t i = 0; i < ver_len; ++i) {
+    sp_prob[i] = probability[col_list[i]];
   }
   ArrayHeap arrayHeap(sp_prob);
   arrayHeap.SampleWithoutReplacement(max_num_neighbor, &sp_index, seed);
@@ -535,21 +520,13 @@ static void GetNonUniformSample(const float* probability,
   out_edge->resize(max_num_neighbor);
   for (size_t i = 0; i < max_num_neighbor; ++i) {
     size_t idx = sp_index[i];
-    out_ver->at(i) = ver_list[idx];
-    out_edge->at(i) = edge_list[idx];
+    out_ver->at(i) = col_list[idx];
+    out_edge->at(i) = val_list[idx];
   }
   sort(out_ver->begin(), out_ver->end());
   sort(out_edge->begin(), out_edge->end());
 }
 
-/*
- * This is used for BFS traversal
- */
-struct ver_node {
-  dgl_id_t vertex_id;
-  int level;
-};
-
 /*
  * Used for subgraph sampling
  */
@@ -571,9 +548,9 @@ static void SampleSubgraph(const NDArray &csr,
                            float* sub_prob,
                            const NDArray &sub_layer,
                            const float* probability,
-                           dgl_id_t num_hops,
-                           dgl_id_t num_neighbor,
-                           dgl_id_t max_num_vertices) {
+                           int num_hops,
+                           size_t num_neighbor,
+                           size_t max_num_vertices) {
   unsigned int time_seed = time(nullptr);
   size_t num_seeds = seed_arr.shape().Size();
   CHECK_GE(max_num_vertices, num_seeds);
@@ -586,123 +563,119 @@ static void SampleSubgraph(const NDArray &csr,
   dgl_id_t* out_layer = sub_layer.data().dptr<dgl_id_t>();
 
   // BFS traverse the graph and sample vertices
-  dgl_id_t sub_vertices_count = 0;
   // <vertex_id, layer_id>
-  std::unordered_map<dgl_id_t, int> sub_ver_mp;
-  std::queue<ver_node> node_queue;
+  std::unordered_set<dgl_id_t> sub_ver_mp;
+  std::vector<std::pair<dgl_id_t, dgl_id_t> > sub_vers;
+  sub_vers.reserve(num_seeds * 10);
   // add seed vertices
   for (size_t i = 0; i < num_seeds; ++i) {
-    ver_node node;
-    node.vertex_id = seed[i];
-    node.level = 0;
-    node_queue.push(node);
+    auto ret = sub_ver_mp.insert(seed[i]);
+    // If the vertex is inserted successfully.
+    if (ret.second) {
+      sub_vers.emplace_back(seed[i], 0);
+    }
   }
-  std::vector<dgl_id_t> tmp_src_list;
-  std::vector<dgl_id_t> tmp_edge_list;
   std::vector<dgl_id_t> tmp_sampled_src_list;
   std::vector<dgl_id_t> tmp_sampled_edge_list;
-  std::unordered_map<dgl_id_t, neigh_list> neigh_mp;
+  // ver_id, position
+  std::vector<std::pair<dgl_id_t, size_t> > neigh_pos;
+  neigh_pos.reserve(num_seeds);
+  std::vector<dgl_id_t> neighbor_list;
   size_t num_edges = 0;
-  while (!node_queue.empty() &&
-    sub_vertices_count <= max_num_vertices ) {
-    ver_node& cur_node = node_queue.front();
-    dgl_id_t dst_id = cur_node.vertex_id;
-    if (cur_node.level < num_hops) {
-      auto ret = sub_ver_mp.find(dst_id);
-      if (ret != sub_ver_mp.end()) {
-        node_queue.pop();
-        continue;
-      }
-      tmp_src_list.clear();
-      tmp_edge_list.clear();
-      tmp_sampled_src_list.clear();
-      tmp_sampled_edge_list.clear();
-      GetSrcList(val_list,
-                 col_list,
-                 indptr,
-                 dst_id,
-                 &tmp_src_list,
-                 &tmp_edge_list);
-      if (probability == nullptr) {  // uniform-sample
-        GetUniformSample(tmp_src_list,
-                       tmp_edge_list,
+
+  // sub_vers is used both as a node collection and a queue.
+  // In the while loop, we iterate over sub_vers and new nodes are added to the vector.
+  // A vertex in the vector only needs to be accessed once. If a vertex at or after idx
+  // isn't in the last level, we will sample its neighbors. If not, the while loop terminates.
+  size_t idx = 0;
+  while (idx < sub_vers.size() &&
+    sub_ver_mp.size() < max_num_vertices) {
+    dgl_id_t dst_id = sub_vers[idx].first;
+    int cur_node_level = sub_vers[idx].second;
+    idx++;
+    // If the node is in the last level, we don't need to sample neighbors
+    // from this node.
+    if (cur_node_level >= num_hops)
+      continue;
+
+    tmp_sampled_src_list.clear();
+    tmp_sampled_edge_list.clear();
+    dgl_id_t ver_len = *(indptr+dst_id+1) - *(indptr+dst_id);
+    if (probability == nullptr) {  // uniform-sample
+      GetUniformSample(val_list + *(indptr + dst_id),
+                       col_list + *(indptr + dst_id),
+                       ver_len,
                        num_neighbor,
                        &tmp_sampled_src_list,
                        &tmp_sampled_edge_list,
                        &time_seed);
-      } else {  // non-uniform-sample
-        GetNonUniformSample(probability,
-                       tmp_src_list,
-                       tmp_edge_list,
+    } else {  // non-uniform-sample
+      GetNonUniformSample(probability,
+                       val_list + *(indptr + dst_id),
+                       col_list + *(indptr + dst_id),
+                       ver_len,
                        num_neighbor,
                        &tmp_sampled_src_list,
                        &tmp_sampled_edge_list,
                        &time_seed);
-      }
-      neigh_mp.insert(std::pair<dgl_id_t, neigh_list>(dst_id,
-        neigh_list(tmp_sampled_src_list,
-                   tmp_sampled_edge_list)));
-      num_edges += tmp_sampled_src_list.size();
-      sub_ver_mp[cur_node.vertex_id] = cur_node.level;
-      for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
-        auto ret = sub_ver_mp.find(tmp_sampled_src_list[i]);
-        if (ret == sub_ver_mp.end()) {
-          ver_node new_node;
-          new_node.vertex_id = tmp_sampled_src_list[i];
-          new_node.level = cur_node.level + 1;
-          node_queue.push(new_node);
-        }
-      }
-    } else {  // vertex without any neighbor
-      auto ret = sub_ver_mp.find(dst_id);
-      if (ret != sub_ver_mp.end()) {
-        node_queue.pop();
-        continue;
-      }
-      tmp_sampled_src_list.clear();
-      tmp_sampled_edge_list.clear();
-      neigh_mp.insert(std::pair<dgl_id_t, neigh_list>(dst_id,
-        neigh_list(tmp_sampled_src_list,      // empty vector
-                   tmp_sampled_edge_list)));  // empty vector
-      sub_ver_mp[cur_node.vertex_id] = cur_node.level;
     }
-    sub_vertices_count++;
-    node_queue.pop();
+    CHECK_EQ(tmp_sampled_src_list.size(), tmp_sampled_edge_list.size());
+    size_t pos = neighbor_list.size();
+    neigh_pos.emplace_back(dst_id, pos);
+    // First we push the size of neighbor vector
+    neighbor_list.push_back(tmp_sampled_edge_list.size());
+    // Then push the vertices
+    for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
+      neighbor_list.push_back(tmp_sampled_src_list[i]);
+    }
+    // Finally we push the edge list
+    for (size_t i = 0; i < tmp_sampled_edge_list.size(); ++i) {
+      neighbor_list.push_back(tmp_sampled_edge_list[i]);
+    }
+    num_edges += tmp_sampled_src_list.size();
+    for (size_t i = 0; i < tmp_sampled_src_list.size(); ++i) {
+      // If we have sampled the max number of vertices, we have to stop.
+      if (sub_ver_mp.size() >= max_num_vertices)
+        break;
+      // We need to add the neighbor to the hashtable here. This ensures that
+      // the vertices in the queue are unique. If we have seen a vertex before, we don't
+      // need to add it to the queue again.
+      auto ret = sub_ver_mp.insert(tmp_sampled_src_list[i]);
+      // If the sampled neighbor is inserted to the map successfully.
+      if (ret.second)
+        sub_vers.emplace_back(tmp_sampled_src_list[i], cur_node_level + 1);
+    }
+  }
+  // Check whether there is a vertex whose neighbors we haven't sampled.
+  for (; idx < sub_vers.size(); idx++) {
+    if (sub_vers[idx].second < num_hops) {
+      LOG(WARNING)
+        << "The sampling is truncated because we have reached the max number of vertices\n"
+        << "Please use a smaller number of seeds or a small neighborhood";
+      break;
+    }
   }
 
   // Copy sub_ver_mp to output[0]
-  size_t idx = 0;
-  for (auto& data : sub_ver_mp) {
-    *(out+idx) = data.first;
-    idx++;
-  }
+  // Copy layer
   size_t num_vertices = sub_ver_mp.size();
-  std::sort(out, out + num_vertices);
-  // The rest data will be set to -1
-  for (dgl_id_t i = idx; i < max_num_vertices; ++i) {
-    *(out+i) = -1;
+  std::sort(sub_vers.begin(), sub_vers.end(),
+            [](const std::pair<dgl_id_t, dgl_id_t> &a1, const std::pair<dgl_id_t, dgl_id_t> &a2) {
+    return a1.first < a2.first;
+  });
+  for (size_t i = 0; i < sub_vers.size(); i++) {
+    out[i] = sub_vers[i].first;
+    out_layer[i] = sub_vers[i].second;
   }
   // The last element stores the actual
   // number of vertices in the subgraph.
   out[max_num_vertices] = sub_ver_mp.size();
+
   // Copy sub_probability
   if (sub_prob != nullptr) {
-    for (dgl_id_t i = 0; i < max_num_vertices; ++i) {
+    for (size_t i = 0; i < sub_ver_mp.size(); ++i) {
       dgl_id_t idx = out[i];
-      if (idx != -1) {
-        sub_prob[i] = probability[idx];
-      } else {
-        sub_prob[i] = -1;
-      }
-    }
-  }
-  // Copy layer
-  for (dgl_id_t i = 0; i < max_num_vertices; ++i) {
-    dgl_id_t idx = out[i];
-    if (idx != -1) {
-      out_layer[i] = sub_ver_mp[idx];
-    } else {
-      out_layer[i] = -1;
+      sub_prob[i] = probability[idx];
     }
   }
   // Construct sub_csr_graph
@@ -718,20 +691,37 @@ static void SampleSubgraph(const NDArray &csr,
   dgl_id_t* indptr_out = sub_csr.aux_data(0).dptr<dgl_id_t>();
   indptr_out[0] = 0;
   size_t collected_nedges = 0;
+
+  // Both the out array and neigh_pos are sorted. By scanning the two arrays, we can see
+  // which vertices have neighbors and which don't.
+  std::sort(neigh_pos.begin(), neigh_pos.end(),
+            [](const std::pair<dgl_id_t, size_t> &a1, const std::pair<dgl_id_t, size_t> &a2) {
+    return a1.first < a2.first;
+  });
+  size_t idx_with_neigh = 0;
   for (size_t i = 0; i < num_vertices; i++) {
     dgl_id_t dst_id = *(out + i);
-    auto it = neigh_mp.find(dst_id);
-    const auto &edges = it->second.edges;
-    const auto &neighs = it->second.neighs;
-    CHECK_EQ(edges.size(), neighs.size());
-    if (!edges.empty()) {
-      std::copy(edges.begin(), edges.end(), val_list_out + collected_nedges);
-      std::copy(neighs.begin(), neighs.end(), col_list_out + collected_nedges);
-      collected_nedges += edges.size();
+    // If a vertex is in sub_ver_mp but not in neigh_pos, this vertex must not
+    // have edges.
+    size_t edge_size = 0;
+    if (idx_with_neigh < neigh_pos.size() && dst_id == neigh_pos[idx_with_neigh].first) {
+      size_t pos = neigh_pos[idx_with_neigh].second;
+      CHECK_LT(pos, neighbor_list.size());
+      edge_size = neighbor_list[pos];
+      CHECK_LE(pos + edge_size * 2 + 1, neighbor_list.size());
+
+      std::copy_n(neighbor_list.begin() + pos + 1,
+                  edge_size,
+                  col_list_out + collected_nedges);
+      std::copy_n(neighbor_list.begin() + pos + edge_size + 1,
+                  edge_size,
+                  val_list_out + collected_nedges);
+      collected_nedges += edge_size;
+      idx_with_neigh++;
     }
-    indptr_out[i+1] = indptr_out[i] + edges.size();
+    indptr_out[i+1] = indptr_out[i] + edge_size;
   }
-  for (dgl_id_t i = num_vertices+1; i <= max_num_vertices; ++i) {
+  for (size_t i = num_vertices+1; i <= max_num_vertices; ++i) {
     indptr_out[i] = indptr_out[i-1];
   }
 }
@@ -766,8 +756,16 @@ static void CSRNeighborUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs,
 }
 
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_uniform_sample)
-.describe(R"code(This operator samples sub-graph from a csr graph via an
-uniform probability. 
+.describe(R"code(This operator samples sub-graphs from a csr graph via an
+uniform probability. The operator is designed for DGL.
+
+The operator outputs three sets of NDArrays to represent the sampled results
+(the number of NDArrays in each set is the same as the number of seed NDArrays):
+1) a set of 1D NDArrays containing the sampled vertices, 2) a set of CSRNDArrays representing
+the sampled edges, 3) a set of 1D NDArrays indicating the layer where a vertex is sampled.
+The first set of 1D NDArrays have a length of max_num_vertices+1. The last element in an NDArray
+indicates the actual number of vertices in a subgraph. The third set of NDArrays have a length
+of max_num_vertices, and the valid number of vertices is the same as the ones in the first set.
 
 Example:
 
@@ -853,7 +851,16 @@ static void CSRNeighborNonUniformSampleComputeExCPU(const nnvm::NodeAttrs& attrs
 
 NNVM_REGISTER_OP(_contrib_dgl_csr_neighbor_non_uniform_sample)
 .describe(R"code(This operator samples sub-graph from a csr graph via an
-uniform probability. 
+non-uniform probability. The operator is designed for DGL.
+
+The operator outputs four sets of NDArrays to represent the sampled results
+(the number of NDArrays in each set is the same as the number of seed NDArrays):
+1) a set of 1D NDArrays containing the sampled vertices, 2) a set of CSRNDArrays representing
+the sampled edges, 3) a set of 1D NDArrays with the probability that vertices are sampled,
+4) a set of 1D NDArrays indicating the layer where a vertex is sampled.
+The first set of 1D NDArrays have a length of max_num_vertices+1. The last element in an NDArray
+indicates the actual number of vertices in a subgraph. The third and fourth set of NDArrays have a length
+of max_num_vertices, and the valid number of vertices is the same as the ones in the first set.
 
 Example:
 
diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc
index c005dfa06590..b4f66d8fcf1d 100644
--- a/src/operator/contrib/multibox_detection.cc
+++ b/src/operator/contrib/multibox_detection.cc
@@ -174,7 +174,6 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
     }
 
     // apply nms
-#pragma omp parallel for num_threads(omp_threads)
     for (int i = 0; i < nkeep; ++i) {
       int offset_i = i * 6;
       if (p_out[offset_i] < 0) continue;  // skip eliminated
diff --git a/src/operator/contrib/nnvm_to_onnx-inl.h b/src/operator/contrib/nnvm_to_onnx-inl.h
index 58f88b051433..011ffe6b7ddb 100644
--- a/src/operator/contrib/nnvm_to_onnx-inl.h
+++ b/src/operator/contrib/nnvm_to_onnx-inl.h
@@ -37,7 +37,6 @@
 #include <nnvm/graph.h>
 #include <nnvm/pass_functions.h>
 
-#include <NvInfer.h>
 #include <onnx/onnx.pb.h>
 
 #include <algorithm>
@@ -49,13 +48,48 @@
 #include <utility>
 #include <string>
 
-#include "./tensorrt-inl.h"
 #include "../operator_common.h"
 #include "../../common/utils.h"
 #include "../../common/serialization.h"
 
 namespace mxnet {
 namespace op {
+
+namespace nnvm_to_onnx {
+    enum class TypeIO { Inputs = 0, Outputs = 1 };
+    using NameToIdx_t = std::map<std::string, int32_t>;
+    using InferenceTuple_t = std::tuple<uint32_t, TShape, int, int>;
+    using InferenceMap_t = std::map<std::string, InferenceTuple_t>;
+}  // namespace nnvm_to_onnx
+
+struct ONNXParam : public dmlc::Parameter<ONNXParam> {  // ONNX graph plus input/output maps
+  std::string serialized_onnx_graph;  // filled via ModelProto::SerializeToString
+  std::string serialized_input_map;   // filled via common::Serialize
+  std::string serialized_output_map;  // filled via common::Serialize
+  nnvm_to_onnx::NameToIdx_t input_map;      // not assigned by either constructor below
+  nnvm_to_onnx::InferenceMap_t output_map;  // not assigned by either constructor below
+  ::onnx::ModelProto onnx_pb_graph;         // not assigned by either constructor below
+
+  ONNXParam() {}
+
+  ONNXParam(const ::onnx::ModelProto& onnx_graph,  // fills only the serialized_* strings
+           const nnvm_to_onnx::InferenceMap_t& input_map,  // NOTE(review): not the member's type
+           const nnvm_to_onnx::NameToIdx_t& output_map) {  // NOTE(review): not the member's type
+    common::Serialize(input_map, &serialized_input_map);  // parameter shadows the member
+    common::Serialize(output_map, &serialized_output_map);  // parameter shadows the member
+    onnx_graph.SerializeToString(&serialized_onnx_graph);
+  }
+
+DMLC_DECLARE_PARAMETER(ONNXParam) {  // only the three serialized_* strings are declared as fields
+    DMLC_DECLARE_FIELD(serialized_onnx_graph)
+    .describe("Serialized ONNX graph");
+    DMLC_DECLARE_FIELD(serialized_input_map)
+    .describe("Map from inputs to topological order as input.");
+    DMLC_DECLARE_FIELD(serialized_output_map)
+    .describe("Map from outputs to order in g.outputs.");
+  }
+};
+
 namespace nnvm_to_onnx {
 
 using namespace nnvm;
@@ -76,7 +110,7 @@ void ConvertConstant(GraphProto* const graph_proto,
   const std::string& node_name,
   std::unordered_map<std::string, NDArray>* const shared_buffer);
 
-void ConvertOutput(op::tensorrt::InferenceMap_t* const trt_output_map,
+void ConvertOutput(op::nnvm_to_onnx::InferenceMap_t* const trt_output_map,
                    GraphProto* const graph_proto,
                    const std::unordered_map<std::string, uint32_t>::iterator& out_iter,
                    const std::string& node_name,
@@ -133,7 +167,7 @@ void ConvertElementwiseAdd(NodeProto *node_proto,
                     const nnvm::IndexedGraph &ig,
                     const array_view<IndexedGraph::NodeEntry> &inputs);
 
-TRTParam ConvertNnvmGraphToOnnx(
+ONNXParam ConvertNnvmGraphToOnnx(
     const nnvm::Graph &g,
     std::unordered_map<std::string, NDArray> *const shared_buffer);
 
diff --git a/src/operator/contrib/nnvm_to_onnx.cc b/src/operator/contrib/nnvm_to_onnx.cc
index 902466614c7c..784384e94e1e 100644
--- a/src/operator/contrib/nnvm_to_onnx.cc
+++ b/src/operator/contrib/nnvm_to_onnx.cc
@@ -47,7 +47,6 @@
 #include "../../operator/nn/fully_connected-inl.h"
 #include "../../operator/nn/pooling-inl.h"
 #include "../../operator/softmax_output-inl.h"
-#include "./tensorrt-inl.h"
 
 #if MXNET_USE_TENSORRT_ONNX_CHECKER
 #include <onnx/checker.h>
@@ -55,14 +54,17 @@
 
 namespace mxnet {
 namespace op {
+
+DMLC_REGISTER_PARAMETER(ONNXParam);
+
 namespace nnvm_to_onnx {
 
-op::TRTParam ConvertNnvmGraphToOnnx(
+op::ONNXParam ConvertNnvmGraphToOnnx(
     const nnvm::Graph& g,
     std::unordered_map<std::string, NDArray>* const shared_buffer) {
-    op::TRTParam trt_param;
-    op::tensorrt::NameToIdx_t trt_input_map;
-    op::tensorrt::InferenceMap_t trt_output_map;
+    op::ONNXParam onnx_param;
+    op::nnvm_to_onnx::NameToIdx_t onnx_input_map;
+    op::nnvm_to_onnx::InferenceMap_t onnx_output_map;
 
   const nnvm::IndexedGraph& ig = g.indexed_graph();
   const auto& storage_types = g.GetAttr<StorageTypeVector>("storage_type");
@@ -105,7 +107,7 @@ op::TRTParam ConvertNnvmGraphToOnnx(
           current_input++;
           continue;
         }
-        trt_input_map.emplace(node_name, current_input++);
+        onnx_input_map.emplace(node_name, current_input++);
         ConvertPlaceholder(node_name, placeholder_shapes, graph_proto);
       } else {
         // If it's not a placeholder, then by exclusion it's a constant.
@@ -140,23 +142,23 @@ op::TRTParam ConvertNnvmGraphToOnnx(
       auto out_iter = output_lookup.find(node_name);
       // We found an output
       if (out_iter != output_lookup.end()) {
-        ConvertOutput(&trt_output_map, graph_proto, out_iter, node_name, g,
+        ConvertOutput(&onnx_output_map, graph_proto, out_iter, node_name, g,
                       storage_types, dtypes);
       }  // output found
     }    // conversion function exists
   }      // loop over i from 0 to num_nodes
 
-  model_proto.SerializeToString(&trt_param.serialized_onnx_graph);
-  common::Serialize<op::tensorrt::NameToIdx_t>(trt_input_map,
-                                          &trt_param.serialized_input_map);
-  common::Serialize<op::tensorrt::InferenceMap_t>(trt_output_map,
-                                             &trt_param.serialized_output_map);
+  model_proto.SerializeToString(&onnx_param.serialized_onnx_graph);
+  common::Serialize<op::nnvm_to_onnx::NameToIdx_t>(onnx_input_map,
+                                          &onnx_param.serialized_input_map);
+  common::Serialize<op::nnvm_to_onnx::InferenceMap_t>(onnx_output_map,
+                                             &onnx_param.serialized_output_map);
 
 #if MXNET_USE_TENSORRT_ONNX_CHECKER
   onnx::checker::check_model(model_proto);
 #endif  // MXNET_USE_TENSORRT_ONNX_CHECKER
 
-  return trt_param;
+  return onnx_param;
 }
 
 void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs,
@@ -489,7 +491,7 @@ void ConvertConstant(
 }
 
 void ConvertOutput(
-    op::tensorrt::InferenceMap_t* const trt_output_map,
+    op::nnvm_to_onnx::InferenceMap_t* const output_map,
     GraphProto* const graph_proto,
     const std::unordered_map<std::string, uint32_t>::iterator& out_iter,
     const std::string& node_name, const nnvm::Graph& g,
@@ -501,10 +503,10 @@ void ConvertOutput(
   int dtype = dtypes[out_idx];
 
   // This should work with fp16 as well
-  op::tensorrt::InferenceTuple_t out_tuple{out_iter->second, out_shape, storage_type,
+  op::nnvm_to_onnx::InferenceTuple_t out_tuple{out_iter->second, out_shape, storage_type,
                                       dtype};
 
-  trt_output_map->emplace(node_name, out_tuple);
+  output_map->emplace(node_name, out_tuple);
 
   auto graph_out = graph_proto->add_output();
   auto tensor_type = graph_out->mutable_type()->mutable_tensor_type();
diff --git a/src/operator/contrib/roi_align-inl.h b/src/operator/contrib/roi_align-inl.h
index 263f72a6abc0..9f4d7ce48827 100644
--- a/src/operator/contrib/roi_align-inl.h
+++ b/src/operator/contrib/roi_align-inl.h
@@ -20,7 +20,7 @@
  * Copyright (c) 2018 by Contributors
  * \file roi_align-inl.h
  * \brief roi align operator and symbol
- * \author Hang Zhang
+ * \author Hang Zhang, Shesung
  * modified from Caffe2
 */
 #ifndef MXNET_OPERATOR_CONTRIB_ROI_ALIGN_INL_H_
@@ -35,7 +35,6 @@
 namespace mxnet {
 namespace op {
 
-
 // Declare enumeration of input order to make code more intuitive.
 // These enums are only visible within this header
 namespace roialign {
@@ -48,6 +47,7 @@ struct ROIAlignParam : public dmlc::Parameter<ROIAlignParam> {
   TShape pooled_size;
   float spatial_scale;
   int sample_ratio;
+  bool position_sensitive;
   DMLC_DECLARE_PARAMETER(ROIAlignParam) {
     DMLC_DECLARE_FIELD(pooled_size)
     .set_expect_ndim(2).enforce_nonzero()
@@ -57,6 +57,10 @@ struct ROIAlignParam : public dmlc::Parameter<ROIAlignParam> {
     "Equals the reciprocal of total stride in convolutional layers");
     DMLC_DECLARE_FIELD(sample_ratio).set_default(-1)
     .describe("Optional sampling ratio of ROI align, using adaptive size by default.");
+    DMLC_DECLARE_FIELD(position_sensitive).set_default(false)
+    .describe("Whether to perform position-sensitive RoI pooling. PSRoIPooling is "
+    "first proposed by R-FCN and it can reduce the input channels by ph*pw times, "
+    "where (ph, pw) is the pooled_size");
   }
 };
 
diff --git a/src/operator/contrib/roi_align.cc b/src/operator/contrib/roi_align.cc
index 76675677fa08..e584ea30325d 100644
--- a/src/operator/contrib/roi_align.cc
+++ b/src/operator/contrib/roi_align.cc
@@ -20,7 +20,7 @@
  * Copyright (c) 2018 by Contributors
  * \file roi_align.cc
  * \brief roi align operator
- * \author Hang Zhang
+ * \author Hang Zhang, Shesung
  * Adapted from Caffe2
 */
 #include "./roi_align-inl.h"
@@ -142,6 +142,7 @@ void ROIAlignForward(
     const int nthreads,
     const T* bottom_data,
     const T& spatial_scale,
+    const bool position_sensitive,
     const int channels,
     const int height,
     const int width,
@@ -156,6 +157,8 @@ void ROIAlignForward(
   int n_rois = nthreads / channels / pooled_width / pooled_height;
   // (n, c, ph, pw) is an element in the pooled output
   // can be parallelized using omp
+#pragma omp parallel for \
+num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
   for (int n = 0; n < n_rois; n++) {
     int index_n = n * channels * pooled_width * pooled_height;
 
@@ -208,19 +211,23 @@ void ROIAlignForward(
         roi_bin_grid_w,
         &pre_calc);
 
-    int c;
-#pragma omp parallel for private(c) \
-num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-    for (c = 0; c < channels; c++) {
+    for (int c = 0; c < channels; c++) {
       int index_n_c = index_n + c * pooled_width * pooled_height;
-      const T* offset_bottom_data =
-          bottom_data + (roi_batch_ind * channels + c) * height * width;
       int pre_calc_index = 0;
 
       for (int ph = 0; ph < pooled_height; ph++) {
         for (int pw = 0; pw < pooled_width; pw++) {
           int index = index_n_c + ph * pooled_width + pw;
 
+          int c_unpooled = c;
+          int channels_unpooled = channels;
+          if (position_sensitive) {
+            c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
+            channels_unpooled = channels * pooled_height * pooled_width;
+          }
+          const T* offset_bottom_data =
+              bottom_data + (roi_batch_ind * channels_unpooled + c_unpooled)
+              * height * width;
           T output_val = 0.;
           for (int iy = 0; iy < roi_bin_grid_h; iy++) {
             for (int ix = 0; ix < roi_bin_grid_w; ix++) {
@@ -310,6 +317,7 @@ void ROIAlignBackward(
     const T* top_diff,
     const int /*num_rois*/,
     const T& spatial_scale,
+    const bool position_sensitive,
     const int channels,
     const int height,
     const int width,
@@ -347,8 +355,15 @@ void ROIAlignBackward(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
+    int c_unpooled = c;
+    int channels_unpooled = channels;
+    if (position_sensitive) {
+      c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
+      channels_unpooled = channels * pooled_height * pooled_width;
+    }
     T* offset_bottom_diff =
-        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+        bottom_diff + (roi_batch_ind * channels_unpooled + c_unpooled)
+        * height * width;
 
     int top_offset = (n * channels + c) * pooled_height * pooled_width;
     const T* offset_top_diff = top_diff + top_offset;
@@ -426,7 +441,7 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
 
   const int count = out_data[roialign::kOut].Size();
   // const int num_rois = in_data[roialign::kBox].size(0);
-  const int channels = in_data[roialign::kData].size(1);
+  const int channels = out_data[roialign::kOut].size(1);  // channels of pooled output
   const int height = in_data[roialign::kData].size(2);
   const int width = in_data[roialign::kData].size(3);
   const int pooled_height = out_data[roialign::kOut].size(2);
@@ -439,9 +454,9 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
     const DType *bottom_rois = in_data[roialign::kBox].dptr<DType>();
     DType *top_data = out_data[roialign::kOut].dptr<DType>();
 
-    ROIAlignForward<DType>(count, bottom_data, param.spatial_scale, channels,
-                           height, width, pooled_height, pooled_width, param.sample_ratio,
-                           bottom_rois, rois_cols, top_data);
+    ROIAlignForward<DType>(count, bottom_data, param.spatial_scale, param.position_sensitive,
+                           channels, height, width, pooled_height, pooled_width,
+                           param.sample_ratio, bottom_rois, rois_cols, top_data);
   })
 }
 
@@ -470,7 +485,7 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
 
   const int count = out_grad[0].Size();
   const int num_rois = in_data[0].size(0);
-  const int channels = outputs[0].size(1);
+  const int channels = out_grad[0].size(1);  // channels of pooled output
   const int height = outputs[0].size(2);
   const int width = outputs[0].size(3);
   const int pooled_height = out_grad[0].size(2);
@@ -489,8 +504,9 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
         Fill<false>(s, outputs[0], kWriteTo, static_cast<DType>(0));
       }
       ROIAlignBackward<DType>(count, top_diff, num_rois, param.spatial_scale,
-                     channels, height, width, pooled_height, pooled_width,
-                     param.sample_ratio, grad_in, bottom_rois, rois_cols);
+                     param.position_sensitive, channels, height, width,
+                     pooled_height, pooled_width, param.sample_ratio, grad_in,
+                     bottom_rois, rois_cols);
     }
     if (kWriteTo == req[roialign::kBox]) {
       Fill<false>(s, outputs[1], kWriteTo, static_cast<DType>(0));
@@ -545,8 +561,17 @@ He, Kaiming, et al. "Mask R-CNN." ICCV, 2017
   CHECK_EQ(bshape[1], 5) << "bbox should be a 2D tensor of shape [batch, 5]";
   // out: [num_rois, c, pooled_h, pooled_w]
   out_shape->clear();
-  out_shape->push_back(
-       Shape4(bshape[0], dshape[1], param.pooled_size[0], param.pooled_size[1]));
+  // In position-sensitive mode the output channel count is the input channel
+  // count divided by pooled_size[0] * pooled_size[1]; otherwise it is unchanged.
+  auto out_channels = dshape[1];
+  if (param.position_sensitive) {
+    CHECK_EQ(dshape[1] % (param.pooled_size[0]*param.pooled_size[1]), 0) <<
+      "Input channels should be divided by pooled_size[0]*pooled_size[1] "
+      "when position_sensitive is true.";
+    out_channels = dshape[1]/param.pooled_size[0]/param.pooled_size[1];
+  }
+  out_shape->push_back(
+       Shape4(bshape[0], out_channels, param.pooled_size[0], param.pooled_size[1]));
   return true;
 })
 .set_attr<nnvm::FInferType>("FInferType", [](const nnvm::NodeAttrs& attrs,
diff --git a/src/operator/contrib/roi_align.cu b/src/operator/contrib/roi_align.cu
index d3db70b73b1a..38b461d5f58c 100644
--- a/src/operator/contrib/roi_align.cu
+++ b/src/operator/contrib/roi_align.cu
@@ -20,7 +20,7 @@
  * Copyright (c) 2018 by Contributors
  * \file roi_align.cu
  * \brief roi align operator
- * \author Hang Zhang
+ * \author Hang Zhang, Shesung
  * Adapted from Caffe2
 */
 #include "./roi_align-inl.h"
@@ -111,6 +111,7 @@ __global__ void RoIAlignForwardKernel(
     const int nthreads,
     const T* bottom_data,
     const T spatial_scale,
+    const bool position_sensitive,
     const int channels,
     const int height,
     const int width,
@@ -145,8 +146,15 @@ __global__ void RoIAlignForwardKernel(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
+    int c_unpooled = c;
+    int channels_unpooled = channels;
+    if (position_sensitive) {
+      c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
+      channels_unpooled = channels * pooled_height * pooled_width;
+    }
     const T* offset_bottom_data =
-        bottom_data + (roi_batch_ind * channels + c) * height * width;
+        bottom_data + (roi_batch_ind * channels_unpooled + c_unpooled)
+        * height * width;
 
     // We use roi_bin_grid to sample the grid and mimic integral
     int roi_bin_grid_h = (sampling_ratio > 0)
@@ -242,6 +250,7 @@ __global__ void RoIAlignBackwardKernel(
     const T* top_diff,
     const int num_rois,
     const T spatial_scale,
+    const bool position_sensitive,
     const int channels,
     const int height,
     const int width,
@@ -276,8 +285,15 @@ __global__ void RoIAlignBackwardKernel(
     T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
     T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
 
+    int c_unpooled = c;
+    int channels_unpooled = channels;
+    if (position_sensitive) {
+      c_unpooled = c * pooled_height * pooled_width + ph * pooled_width + pw;
+      channels_unpooled = channels * pooled_height * pooled_width;
+    }
     T* offset_bottom_diff =
-        bottom_diff + (roi_batch_ind * channels + c) * height * width;
+        bottom_diff + (roi_batch_ind * channels_unpooled + c_unpooled)
+        * height * width;
 
     int top_offset = (n * channels + c) * pooled_height * pooled_width;
     const T* offset_top_diff = top_diff + top_offset;
@@ -357,7 +373,7 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
 
   const int count = out_data[roialign::kOut].Size();
   const int num_rois = in_data[roialign::kBox].size(0);
-  const int channels = in_data[roialign::kData].size(1);
+  const int channels = out_data[roialign::kOut].size(1);  // channels of pooled output
   const int height = in_data[roialign::kData].size(2);
   const int width = in_data[roialign::kData].size(3);
   const int pooled_height = out_data[roialign::kOut].size(2);
@@ -377,6 +393,7 @@ void ROIAlignForwardCompute(const nnvm::NodeAttrs& attrs,
           count,
           bottom_data,
           param.spatial_scale,
+          param.position_sensitive,
           channels,
           height,
           width,
@@ -414,7 +431,7 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
 
   const int count = out_grad[0].Size();
   const int num_rois = in_data[0].size(0);
-  const int channels = outputs[0].size(1);
+  const int channels = out_grad[0].size(1);  // channels of pooled output
   const int height = outputs[0].size(2);
   const int width = outputs[0].size(3);
   const int pooled_height = out_grad[0].size(2);
@@ -445,6 +462,7 @@ void ROIAlignBackwardCompute(const nnvm::NodeAttrs& attrs,
         top_diff,
         num_rois,
         param.spatial_scale,
+        param.position_sensitive,
         channels,
         height,
         width,
diff --git a/src/operator/contrib/tensorrt-inl.h b/src/operator/contrib/tensorrt-inl.h
index be335ab1208f..062d22e35795 100644
--- a/src/operator/contrib/tensorrt-inl.h
+++ b/src/operator/contrib/tensorrt-inl.h
@@ -38,7 +38,6 @@
 #include <nnvm/pass_functions.h>
 
 #include <NvInfer.h>
-#include <onnx/onnx.pb.h>
 
 #include <algorithm>
 #include <iostream>
@@ -49,6 +48,7 @@
 #include <utility>
 #include <string>
 
+#include "nnvm_to_onnx-inl.h"
 #include "../operator_common.h"
 #include "../../common/utils.h"
 #include "../../common/serialization.h"
@@ -60,49 +60,15 @@ namespace mxnet {
 namespace op {
 
 using namespace nnvm;
-using namespace ::onnx;
 using int64 = ::google::protobuf::int64;
 
-namespace tensorrt {
-  enum class TypeIO { Inputs = 0, Outputs = 1 };
-  using NameToIdx_t = std::map<std::string, int32_t>;
-  using InferenceTuple_t = std::tuple<uint32_t, TShape, int, int>;
-  using InferenceMap_t = std::map<std::string, InferenceTuple_t>;
-}  // namespace tensorrt
 
 using trt_name_to_idx = std::map<std::string, uint32_t>;
 
-struct TRTParam : public dmlc::Parameter<TRTParam> {
-  std::string serialized_onnx_graph;
-  std::string serialized_input_map;
-  std::string serialized_output_map;
-  tensorrt::NameToIdx_t input_map;
-  tensorrt::InferenceMap_t output_map;
-  ::onnx::ModelProto onnx_pb_graph;
-
-  TRTParam() {}
-
-  TRTParam(const ::onnx::ModelProto& onnx_graph,
-           const tensorrt::InferenceMap_t& input_map,
-           const tensorrt::NameToIdx_t& output_map) {
-    common::Serialize(input_map, &serialized_input_map);
-    common::Serialize(output_map, &serialized_output_map);
-    onnx_graph.SerializeToString(&serialized_onnx_graph);
-  }
-
-DMLC_DECLARE_PARAMETER(TRTParam) {
-    DMLC_DECLARE_FIELD(serialized_onnx_graph)
-    .describe("Serialized ONNX graph");
-    DMLC_DECLARE_FIELD(serialized_input_map)
-    .describe("Map from inputs to topological order as input.");
-    DMLC_DECLARE_FIELD(serialized_output_map)
-    .describe("Map from outputs to order in g.outputs.");
-  }
-};
 
 struct TRTEngineParam {
   nvinfer1::IExecutionContext* trt_executor;
-  std::vector<std::pair<uint32_t, tensorrt::TypeIO> > binding_map;
+  std::vector<std::pair<uint32_t, nnvm_to_onnx::TypeIO> > binding_map;
 };
 
 }  // namespace op
diff --git a/src/operator/contrib/tensorrt.cc b/src/operator/contrib/tensorrt.cc
index 619fe1e2b8f4..88a65fba3ea3 100644
--- a/src/operator/contrib/tensorrt.cc
+++ b/src/operator/contrib/tensorrt.cc
@@ -44,20 +44,18 @@
 namespace mxnet {
 namespace op {
 
-DMLC_REGISTER_PARAMETER(TRTParam);
-
 OpStatePtr GetPtrMapping(nvinfer1::ICudaEngine* trt_engine,
-                         tensorrt::NameToIdx_t input_map,
-                         tensorrt::NameToIdx_t output_map) {
+                         nnvm_to_onnx::NameToIdx_t input_map,
+                         nnvm_to_onnx::NameToIdx_t output_map) {
   TRTEngineParam param;
   for (int b = 0; b < trt_engine->getNbBindings(); ++b) {
     const std::string& binding_name = trt_engine->getBindingName(b);
     if (trt_engine->bindingIsInput(b)) {
       param.binding_map.emplace_back(input_map[binding_name],
-                                     tensorrt::TypeIO::Inputs);
+                                     nnvm_to_onnx::TypeIO::Inputs);
     } else {
       param.binding_map.emplace_back(output_map[binding_name],
-                                     tensorrt::TypeIO::Outputs);
+                                     nnvm_to_onnx::TypeIO::Outputs);
     }
   }
   param.trt_executor = trt_engine->createExecutionContext();
@@ -67,7 +65,7 @@ OpStatePtr GetPtrMapping(nvinfer1::ICudaEngine* trt_engine,
 OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/,
                           const std::vector<TShape>& /*ishape*/,
                           const std::vector<int>& /*itype*/) {
-  const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+  const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
 
   ::onnx::ModelProto model_proto;
   bool success = model_proto.ParseFromString(node_param.serialized_onnx_graph);
@@ -82,7 +80,7 @@ OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/,
   nvinfer1::ICudaEngine* const trt_engine = ::onnx_to_tensorrt::onnxToTrtCtx(
       node_param.serialized_onnx_graph, batch_size, 1 << 30);
 
-  tensorrt::NameToIdx_t output_map;
+  nnvm_to_onnx::NameToIdx_t output_map;
   for (auto& el : node_param.output_map) {
     output_map[el.first] = std::get<0>(el.second);
   }
@@ -90,7 +88,7 @@ OpStatePtr TRTCreateState(const nnvm::NodeAttrs& attrs, Context /*ctx*/,
 }
 
 void TRTParamParser(nnvm::NodeAttrs* attrs) {
-  TRTParam param_;
+  ONNXParam param_;
 
   try {
     param_.Init(attrs->dict);
@@ -114,7 +112,7 @@ void TRTParamParser(nnvm::NodeAttrs* attrs) {
 
 inline bool TRTInferShape(const NodeAttrs& attrs, std::vector<TShape>* /*in_shape*/,
                           std::vector<TShape>* out_shape) {
-  const auto &node_param = nnvm::get<TRTParam>(attrs.parsed);
+  const auto &node_param = nnvm::get<ONNXParam>(attrs.parsed);
   for (auto& el : node_param.output_map) {
     (*out_shape)[std::get<0>(el.second)] = std::get<1>(el.second);
   }
@@ -131,7 +129,7 @@ inline bool TRTInferStorageType(const NodeAttrs& /*attrs*/, const int /*dev_mask
 
 inline bool TRTInferType(const NodeAttrs& attrs, std::vector<int>* /*in_dtype*/,
                          std::vector<int>* out_dtype) {
-  const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+  const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
   for (auto& el : node_param.output_map) {
     (*out_dtype)[std::get<0>(el.second)] = std::get<3>(el.second);
   }
@@ -140,7 +138,7 @@ inline bool TRTInferType(const NodeAttrs& attrs, std::vector<int>* /*in_dtype*/,
 
 inline std::vector<std::string> TRTListInputNames(const NodeAttrs& attrs) {
   std::vector<std::string> output;
-  const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+  const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
   output.resize(node_param.input_map.size());
   for (auto& el : node_param.input_map) {
     output[el.second] = el.first;
@@ -150,7 +148,7 @@ inline std::vector<std::string> TRTListInputNames(const NodeAttrs& attrs) {
 
 inline std::vector<std::string> TRTListOutputNames(const NodeAttrs& attrs) {
   std::vector<std::string> output;
-  const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+  const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
   output.resize(node_param.output_map.size());
   for (auto& el : node_param.output_map) {
     output[std::get<0>(el.second)] = el.first;
@@ -162,11 +160,11 @@ NNVM_REGISTER_OP(_trt_op)
     .describe(R"code(TRT operation (one engine)
 )code" ADD_FILELINE)
     .set_num_inputs([](const NodeAttrs& attrs) {
-      const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+      const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
       return node_param.input_map.size();
     })
     .set_num_outputs([](const NodeAttrs& attrs) {
-      const auto& node_param = nnvm::get<TRTParam>(attrs.parsed);
+      const auto& node_param = nnvm::get<ONNXParam>(attrs.parsed);
       return node_param.output_map.size();
     })
     .set_attr_parser(TRTParamParser)
diff --git a/src/operator/contrib/tensorrt.cu b/src/operator/contrib/tensorrt.cu
index 2fe8727b73e4..9a9c3c024366 100644
--- a/src/operator/contrib/tensorrt.cu
+++ b/src/operator/contrib/tensorrt.cu
@@ -52,7 +52,7 @@ void TRTCompute(const OpStatePtr& state, const OpContext& ctx,
   std::vector<void*> bindings;
   bindings.reserve(param.binding_map.size());
   for (auto& p : param.binding_map) {
-    if (p.second == tensorrt::TypeIO::Inputs) {
+    if (p.second == nnvm_to_onnx::TypeIO::Inputs) {
       bindings.emplace_back(inputs[p.first].dptr_);
     } else {
       bindings.emplace_back(outputs[p.first].dptr_);
diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc
index 305eeab21176..fb920c31ce37 100644
--- a/src/operator/nn/activation.cc
+++ b/src/operator/nn/activation.cc
@@ -97,9 +97,10 @@ static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs,
                                    const std::vector<NDArray>& inputs,
                                    const std::vector<OpReqType>& req,
                                    const std::vector<NDArray>& outputs) {
+  const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
-  if (SupportMKLDNN(inputs[0])) {
+  if (SupportMKLDNNAct(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
     MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]);
     MKLDNN_OPCHECK_RUN(ActivationCompute<cpu>, attrs, ctx, inputs, req, outputs);
@@ -115,7 +116,7 @@ void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                 const std::vector<NDArray>& outputs) {
   const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), activation::GradNumInputs(param.act_type));
-  if (SupportMKLDNN(inputs[0])) {
+  if (SupportMKLDNNAct(param, inputs[0])) {
     MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
     // XXX: for y = relu(x), y is passed as "in_data" to Backward()
     const bool relu = param.act_type == activation::kReLU;
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 440705884b3f..8c64888b4608 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -49,6 +49,15 @@ bool SupportMKLDNNAct(const ActivationParam& param) {
       || param.act_type == activation::kTanh;
 }
 
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
+  // The MKL-DNN activation path is taken only for float32 arrays with 1 to 4
+  // dimensions, and only when the activation type itself is supported.
+  const auto ndim = input.shape().ndim();
+  const bool rank_ok = (ndim >= 1) && (ndim <= 4);
+  const bool dtype_ok = (input.dtype() == mshadow::kFloat32);
+  return rank_ok && dtype_ok && SupportMKLDNNAct(param);
+}
+
 static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
   switch (param.act_type) {
     case activation::kReLU:
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 17e74094c2bb..18ef3f3e767b 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -60,7 +60,7 @@
 #include "mxnet/op_attr_types.h"
 using namespace mkldnn;
 namespace mxnet {
-extern bool EnableMkldnnWarnGenerated();
+
 // =====  CpuEngine =======================================
 // cpu_engine singleton
 class CpuEngine {
@@ -175,10 +175,11 @@ struct ConvolutionParam;
 struct DeconvolutionParam;
 struct SoftmaxParam;
 bool SupportMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input);
 bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input);
 bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
 bool SupportMKLDNNSoftmax(const SoftmaxParam& param);
-}
+}  // namespace op
 
 static int GetTypeSize(int dtype) {
   int size = -1;
@@ -250,15 +251,24 @@ inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
 
 inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
                                                  int num_groups) {
+  auto ndim = arr.shape().ndim();
+  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
   if (num_groups == 1) {
     return GetMemDesc(arr);
   } else {
-    CHECK_EQ(arr.shape().ndim(), 4U);
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
+    // Grouped weights: prepend the group axis and split axis 0 across groups.
+    CHECK((ndim == 3) || (ndim == 4)) << "MKL-DNN weight currently supports 3d and 4d layout";
+    const int N = 0, H = 2, W = 3, C = 1;
+    if (ndim == 3) {
+      tz = mkldnn::memory::dims{
+          num_groups, static_cast<int>(arr.shape()[N] / num_groups),
+          static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H])};
+    } else {
+      tz = mkldnn::memory::dims{
+          num_groups, static_cast<int>(arr.shape()[N] / num_groups),
+          static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H]),
+          static_cast<int>(arr.shape()[W])};
+    }
     return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
                                 mkldnn::memory::format::any};
   }
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 5db51817db9d..ccb9d7ec0075 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -70,9 +70,10 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) {
   } else {
     // If curr_mem has been initialized and we still reach here. It means
     // the current allocated memory isn't enough.
-    if (this->curr_mem)
+    if (this->curr_mem && dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false)) {
       LOG(WARNING) << "Allocate " << pd.get_size()
           << " bytes with malloc directly";
+    }
     mkldnn_mem_ptr ret(new mkldnn::memory(pd));
     MKLDNNStream::Get()->RegisterMem(ret);
     return ret.get();
@@ -238,39 +239,49 @@ const mkldnn::memory *GetWeights(const NDArray &arr,
     return mem;
 
   mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
+  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
+  mkldnn::memory::format format = mkldnn::memory::format::format_undef;
   auto engine = CpuEngine::Get()->get_engine();
+  const int O = 0, I = 1, H = 2, W = 3;
   if (arr.shape().ndim() == 2) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
-  } else if (arr.shape().ndim() == 4 && num_groups == 1) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{
-      static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]),
-          static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
+    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                              static_cast<int>(arr.shape()[I])};
+    format = mkldnn::memory::format::oi;
+  } else if (arr.shape().ndim() == 3) {
+    tz = num_groups > 1
+             ? mkldnn::memory::dims{num_groups,
+                                    static_cast<int>(arr.shape()[O] /
+                                                     num_groups),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H])}
+             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H])};
+    format = num_groups > 1 ? mkldnn::memory::format::goiw
+                            : mkldnn::memory::format::oiw;
   } else if (arr.shape().ndim() == 4) {
-    mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups,
-      static_cast<int>(arr.shape()[0] / num_groups),
-      static_cast<int>(arr.shape()[1]),
-      static_cast<int>(arr.shape()[2]),
-      static_cast<int>(arr.shape()[3])};
-    mkldnn::memory::desc md =
-        mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw};
-    mkldnn::memory::primitive_desc pd =
-        mkldnn::memory::primitive_desc{md, engine};
-    mem = arr.GetMKLDNNData(pd);
+    tz = num_groups > 1
+             ? mkldnn::memory::dims{num_groups,
+                                    static_cast<int>(arr.shape()[O] /
+                                                     num_groups),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H]),
+                                    static_cast<int>(arr.shape()[W])}
+             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
+                                    static_cast<int>(arr.shape()[I]),
+                                    static_cast<int>(arr.shape()[H]),
+                                    static_cast<int>(arr.shape()[W])};
+    format = num_groups > 1 ? mkldnn::memory::format::goihw
+                            : mkldnn::memory::format::oihw;
   } else {
     LOG(FATAL) << "The weight array has an unsupported number of dimensions";
     return nullptr;
   }
+  mkldnn::memory::desc md =
+      mkldnn::memory::desc{tz, type, format};
+  mkldnn::memory::primitive_desc pd =
+      mkldnn::memory::primitive_desc{md, engine};
+  mem = arr.GetMKLDNNData(pd);
   if (mem == nullptr)
     mem = arr.GetMKLDNNDataReorder(target_pd);
   if (mem->get_primitive_desc() == target_pd) return mem;
@@ -284,6 +295,7 @@ mkldnn_memory_format_t GetDefaultFormat(int num_dims) {
   switch (num_dims) {
     case 1: return mkldnn_x;
     case 2: return mkldnn_nc;
+    case 3: return mkldnn_ncw;
     case 4: return mkldnn_nchw;
     case 5: return mkldnn_goihw;
     default:
@@ -300,6 +312,30 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
       return mkldnn_oi;
     else
       return desc.data.format;
+  } else if (desc.data.ndims == 3) {
+    switch (desc.data.format) {
+      case mkldnn_ncw:
+      case mkldnn_nwc:
+      case mkldnn_nCw8c:
+      case mkldnn_nCw16c:
+        return mkldnn_ncw;
+      case mkldnn_oiw:
+      case mkldnn_wio:
+      case mkldnn_Owi8o:
+      case mkldnn_OIw8i8o:
+      case mkldnn_OIw8o8i:
+      case mkldnn_OIw16i16o:
+      case mkldnn_OIw16o16i:
+      case mkldnn_Oiw16o:
+      case mkldnn_Owi16o:
+      case mkldnn_OIw8i16o2i:
+      case mkldnn_OIw8o16i2o:
+      case mkldnn_IOw16o16i:
+        return mkldnn_oiw;
+      default:
+        LOG(FATAL) << "Unknown MKLDNN format for 3 dimensions: " << desc.data.format;
+        return mkldnn_format_undef;
+    }
   } else if (desc.data.ndims == 4) {
     switch (desc.data.format) {
       case mkldnn_nchw:
@@ -328,6 +364,18 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
       case mkldnn_Ohwi16o:
       case mkldnn_OhIw16o4i:
         return mkldnn_oihw;
+      case mkldnn_goiw:
+      case mkldnn_gOwi8o:
+      case mkldnn_gOIw8o8i:
+      case mkldnn_gOIw8i8o:
+      case mkldnn_gOIw16i16o:
+      case mkldnn_gOIw16o16i:
+      case mkldnn_gOiw16o:
+      case mkldnn_gOwi16o:
+      case mkldnn_gOIw8i16o2i:
+      case mkldnn_gOIw8o16i2o:
+      case mkldnn_gIOw16o16i:
+        return mkldnn_goiw;
       default:
         LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format;
         return mkldnn_format_undef;
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index dd1f3ec07d70..7f423ce45249 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -37,9 +37,12 @@ namespace op {
 DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
 
 bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
-  if (params.kernel.ndim() != 2)
+  if ((params.kernel.ndim() != 1) &&
+      (params.kernel.ndim() != 2))
     return false;
-  return SupportMKLDNNQuantize(input.dtype()) && input.shape().ndim() == 4;
+  return SupportMKLDNNQuantize(input.dtype()) &&
+         ((input.shape().ndim() == 3) ||
+          (input.shape().ndim() == 4));
 }
 
 mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
@@ -51,15 +54,26 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
   auto weight_md = GetWeightDesc(weights, param.conv_param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.conv_param.stride.ndim(), 2U);
-  CHECK_GE(param.conv_param.pad.ndim(), 2U);
-  CHECK_GE(param.conv_param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.conv_param.stride[0];
-  strides[1] = param.conv_param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.conv_param.pad[0];
-  padding[1] = param.conv_param.pad[1];
+  mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
+  mkldnn::memory::dims padding(param.conv_param.kernel.ndim());
+  if (param.conv_param.kernel.ndim() == 1) {
+    CHECK_GE(param.conv_param.stride.ndim(), 1U);
+    CHECK_GE(param.conv_param.pad.ndim(), 1U);
+    CHECK_GE(param.conv_param.dilate.ndim(), 1U);
+    strides[0] = param.conv_param.stride[0];
+    padding[0] = param.conv_param.pad[0];
+  } else if (param.conv_param.kernel.ndim() == 2) {
+    CHECK_GE(param.conv_param.stride.ndim(), 2U);
+    CHECK_GE(param.conv_param.pad.ndim(), 2U);
+    CHECK_GE(param.conv_param.dilate.ndim(), 2U);
+    strides[0] = param.conv_param.stride[0];
+    strides[1] = param.conv_param.stride[1];
+    padding[0] = param.conv_param.pad[0];
+    padding[1] = param.conv_param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size "
+               << param.conv_param.kernel.ndim() << ", supporting only 1 or 2.";
+  }
   mkldnn::primitive_attr attr;
   mkldnn::post_ops ops;
   if (param.mkldnn_param.with_relu) {
@@ -113,9 +127,17 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.conv_param.dilate[0] - 1;
-    dilates[1] = param.conv_param.dilate[1] - 1;
+    // dilates is sized by the kernel rank, so only accept a matching dilate
+    // rank; otherwise dilates[1] could be written past the end of the vector.
+    mkldnn::memory::dims dilates(param.conv_param.kernel.ndim());
+    if (param.conv_param.dilate.ndim() == param.conv_param.kernel.ndim()) {
+      for (size_t i = 0; i < dilates.size(); ++i)
+        dilates[i] = param.conv_param.dilate[i] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.conv_param.dilate.ndim()
+                 << ", supporting only 1 or 2.";
+    }
     if (bias == nullptr) {
       mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
@@ -151,15 +173,26 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.stride.ndim(), 2U);
-  CHECK_GE(param.pad.ndim(), 2U);
-  CHECK_GE(param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.stride[0];
-  strides[1] = param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.pad[0];
-  padding[1] = param.pad[1];
+  mkldnn::memory::dims strides(param.kernel.ndim());
+  mkldnn::memory::dims padding(param.kernel.ndim());
+  if (param.kernel.ndim() == 1) {
+    CHECK_GE(param.stride.ndim(), 1U);
+    CHECK_GE(param.pad.ndim(), 1U);
+    CHECK_GE(param.dilate.ndim(), 1U);
+    strides[0] = param.stride[0];
+    padding[0] = param.pad[0];
+  } else if (param.kernel.ndim() == 2) {
+    CHECK_GE(param.stride.ndim(), 2U);
+    CHECK_GE(param.pad.ndim(), 2U);
+    CHECK_GE(param.dilate.ndim(), 2U);
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+               << ", supporting only 1 or 2.";
+  }
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
   // for computation compared with the actual tensor size. Currently, MKL-DNN
@@ -177,9 +210,16 @@ static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.dilate[0] - 1;
-    dilates[1] = param.dilate[1] - 1;
+    // dilates is sized by the kernel rank, so only accept a matching dilate
+    // rank; otherwise dilates[1] could be written past the end of the vector.
+    mkldnn::memory::dims dilates(param.kernel.ndim());
+    if (param.dilate.ndim() == param.kernel.ndim()) {
+      for (size_t i = 0; i < dilates.size(); ++i)
+        dilates[i] = param.dilate[i] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.dilate.ndim() << ", supporting only 1 or 2.";
+    }
     mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
         data_md, weight_md, out_md, strides, dilates, padding, padding,
         mkldnn::padding_kind::zero);
@@ -201,15 +241,26 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
   auto weight_md = GetWeightDesc(weights, param.num_group);
   auto out_md = GetMemDesc(output);
   auto engine = CpuEngine::Get()->get_engine();
-  CHECK_GE(param.stride.ndim(), 2U);
-  CHECK_GE(param.pad.ndim(), 2U);
-  CHECK_GE(param.dilate.ndim(), 2U);
-  mkldnn::memory::dims strides{0, 0};
-  strides[0] = param.stride[0];
-  strides[1] = param.stride[1];
-  mkldnn::memory::dims padding{0, 0};
-  padding[0] = param.pad[0];
-  padding[1] = param.pad[1];
+  mkldnn::memory::dims strides(param.kernel.ndim());
+  mkldnn::memory::dims padding(param.kernel.ndim());
+  if (param.kernel.ndim() == 1) {
+    CHECK_GE(param.stride.ndim(), 1U);
+    CHECK_GE(param.pad.ndim(), 1U);
+    CHECK_GE(param.dilate.ndim(), 1U);
+    strides[0] = param.stride[0];
+    padding[0] = param.pad[0];
+  } else if (param.kernel.ndim() == 2) {
+    CHECK_GE(param.stride.ndim(), 2U);
+    CHECK_GE(param.pad.ndim(), 2U);
+    CHECK_GE(param.dilate.ndim(), 2U);
+    strides[0] = param.stride[0];
+    strides[1] = param.stride[1];
+    padding[0] = param.pad[0];
+    padding[1] = param.pad[1];
+  } else {
+    LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.kernel.ndim()
+               << ", supporting only 1 or 2.";
+  }
 
   // MKL-DNN introduced padded formats since 0.15 which require more memory
   // for computation compared with the actual tensor size. Currently, MKL-DNN
@@ -239,9 +290,16 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights(
     }
     return conv_pd;
   } else {
-    mkldnn::memory::dims dilates{0, 0};
-    dilates[0] = param.dilate[0] - 1;
-    dilates[1] = param.dilate[1] - 1;
+    mkldnn::memory::dims dilates(param.kernel.ndim());
+    if (param.dilate.ndim() == 1) {
+      dilates[0] = param.dilate[0] - 1;
+    } else if (param.dilate.ndim() == 2) {
+      dilates[0] = param.dilate[0] - 1;
+      dilates[1] = param.dilate[1] - 1;
+    } else {
+      LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
+                 << param.dilate.ndim() << ", supporting only 1 or 2.";
+    }
     if (bias == nullptr) {
       mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
           data_md, weight_md, out_md, strides, dilates, padding, padding,
diff --git a/src/operator/operator_tune-inl.h b/src/operator/operator_tune-inl.h
index 127691bccccd..1dbcf4298918 100644
--- a/src/operator/operator_tune-inl.h
+++ b/src/operator/operator_tune-inl.h
@@ -56,7 +56,7 @@ namespace op {
 #endif
 #endif  // MXNET_NO_INLINE
 
-#define OUTSIDE_COUNT_SHIFT    9
+#define OUTSIDE_COUNT_SHIFT  3
 
 namespace tune {
 
@@ -356,7 +356,8 @@ class OperatorTune : public OperatorTuneByType<DType> {
   static duration_t GetOMPLoopOverhead() {
     // It was found empirically that OMP times was not heavily tied to number of cores,
     // so take an average across all core counts
-    const auto max_cores = static_cast<size_t>(omp_get_num_procs()) >> 1;
+    const auto max_cores_default = static_cast<size_t>(omp_get_num_procs()) >> 1;
+    const auto max_cores = dmlc::GetEnv("MXNET_USE_NUM_CORES_OPERATOR_TUNING", max_cores_default);
     if (max_cores >= 2) {
       std::vector<duration_t> core_times;
       // Take care of any OMP lazy-init with a throwaway call
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index e334fe7ec9b2..64ce73ba1cf7 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -23,11 +23,17 @@
  * \brief
  * \author Ziheng Jiang, Jun Wu
 */
+#include <vector>
+#include "quantization_utils.h"
 #include "../nn/fully_connected-inl.h"
 
 namespace mxnet {
 namespace op {
 
+namespace quantized_fc {
+enum QuantizedfcOpResource {kTempSpace};  // indices into OpContext::requested for this op
+}  // namespace quantized_fc
+
 bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
                                   std::vector<TShape> *in_shape,
                                   std::vector<TShape> *out_shape) {
@@ -79,6 +85,151 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+/*!
+ * \brief Infer storage types for the quantized fully connected operator.
+ * \param attrs node attributes (unused)
+ * \param dev_mask device mask; CPU selects FComputeEx, any other device FCompute
+ * \param dispatch_mode output: the dispatch mode chosen for the operator
+ * \param in_attrs input storage types, all overwritten with kDefaultStorage
+ * \param out_attrs output storage types, all overwritten with kDefaultStorage
+ * \return always true
+ */
+bool QuantizedFullyConnectedStorageType(const nnvm::NodeAttrs& attrs,
+                                        const int dev_mask,
+                                        DispatchMode* dispatch_mode,
+                                        std::vector<int> *in_attrs,
+                                        std::vector<int> *out_attrs) {
+  *dispatch_mode = (dev_mask == mshadow::cpu::kDevMask) ? DispatchMode::kFComputeEx
+                                                        : DispatchMode::kFCompute;
+
+  for (auto &v : *out_attrs) {
+    v = kDefaultStorage;
+  }
+  for (auto &v : *in_attrs) {
+    v = kDefaultStorage;
+  }
+  return true;
+}
+
+struct QuantizedSumInitKernelWithBias {
+  // Seeds out[i] with bias[i] rescaled from the bias quantization step
+  // (range / int8 max) to the output quantization step (range / int32 max).
+  // A zero output step is logged and leaves out[i] at 0.
+  MSHADOW_XINLINE static void Map(int i, int32_t *out,
+                                  const int8_t *bias, const float *min_out,
+                                  const float *max_out, const float *min_bias,
+                                  const float *max_bias) {
+    using mshadow::red::limits::MaxValue;
+    // Float value represented by a single quantized unit of output and bias.
+    const float out_unit =
+        MaxAbs(*min_out, *max_out) / static_cast<double>(MaxValue<int32_t>());
+    const float bias_unit =
+        MaxAbs(*min_bias, *max_bias) / static_cast<double>(MaxValue<int8_t>());
+    if (out_unit == 0) {
+      LOG(INFO) << "float_for_one_out_quant is 0,"
+                << " need to check the why MaxAbs(*min_out, *max_out) of out_data is 0!";
+      out[i] = 0;
+      return;
+    }
+    // The float result is implicitly converted to int32 on assignment.
+    out[i] = bias[i] * bias_unit / out_unit;
+  }
+};
+
+// CPU forward of quantized FC: int32 GEMM via MKL cblas_gemm_s8u8s32; LOG(FATAL) without MKL.
+template<typename SrcType>
+void QuantizedFullyConnectedForward(const nnvm::NodeAttrs& attrs,
+                                    const OpContext &ctx,
+                                    const std::vector<NDArray> &in_data,
+                                    const std::vector<OpReqType> &req,  // not consulted below
+                                    const std::vector<NDArray> &out_data) {
+#if MSHADOW_USE_MKL == 1
+  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+  using namespace mshadow;
+  using namespace mxnet_op;
+  size_t num_inputs = param.no_bias ? 2 : 3;  // data, weight and optionally bias
+  CHECK_EQ(in_data.size(),  num_inputs * 3);  // plus a min and a max per input
+  CHECK_EQ(out_data.size(), 3U);  // out, min_out, max_out
+  const NDArray& data = in_data[0];
+  const NDArray& weight = in_data[1];
+  const NDArray& out = out_data[0];
+  TShape dshape = data.shape();
+  TShape wshape = weight.shape();
+  TShape oshape = out.shape();  // not used below
+  auto output_temp = out.data().dptr<int32_t>();
+  auto weight_temp = weight.data().dptr<SrcType>();
+  auto data_temp = data.data().dptr<SrcType>();
+  const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  const float alpha = 1.0f;
+  const float beta  = 1.0f;  // beta = 1: GEMM adds onto the pre-filled output (per MKL docs)
+  const CBLAS_OFFSET offsetc = CblasFixOffset;
+  const MKL_INT8 oa = 0;
+  const MKL_INT8 ob = 0;
+  MKL_INT32 oc = 0;
+  const int m = dshape[0], n = wshape[0], k = dshape.ProdShape(1, dshape.ndim());
+  Stream<cpu> *s = ctx.get_stream<cpu>();
+  //  cblas_gemm_s8u8s32 requires the first matrix to be uint8, so
+  //  shift data from int8 (from -128 to 127) to uint8 (from 0 to 255)
+  int shift = 128;
+  Tensor<cpu, 1, uint8_t> shiftdata =
+    ctx.requested[quantized_fc::kTempSpace].get_space_typed<cpu, 1, uint8_t>(
+      Shape1(m * k), s);
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < m * k; ++i) {
+    shiftdata.dptr_[i] = data_temp[i] + shift;
+  }
+
+  Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+      out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
+      in_data[num_inputs].data().dptr<float>(), in_data[num_inputs+1].data().dptr<float>(),
+      in_data[num_inputs+2].data().dptr<float>(), in_data[num_inputs+3].data().dptr<float>());
+  if (!param.no_bias) {  // seed the first n outputs with the rescaled bias
+    const NDArray& bias = in_data[2];
+    Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.data().dptr<int32_t>(),
+        bias.data().dptr<int8_t>(), out_data[1].data().dptr<float>(),
+        out_data[2].data().dptr<float>(), in_data[7].data().dptr<float>(),
+        in_data[8].data().dptr<float>());  // in_data[7], in_data[8]: min_bias, max_bias
+  } else {  // no bias: start from an all-zero output
+    #pragma omp parallel for num_threads(omp_threads)
+    for (int i = 0; i < m * n; ++i) {
+      output_temp[i] = 0;
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = 0; i < n; ++i) {  // undo the +shift on data: subtract shift * sum_j weight[i][j]
+    for (int j = 0; j < k; ++j) {
+      output_temp[i] -= shift * weight_temp[i * k + j];
+    }
+  }
+  #pragma omp parallel for num_threads(omp_threads)
+  for (int i = n; i < m * n; ++i) {  // copy the first output row into the other m - 1 rows
+    output_temp[i] = output_temp[i % n];
+  }
+  cblas_gemm_s8u8s32(CblasRowMajor,
+                     CblasNoTrans,
+                     CblasTrans,
+                     offsetc,
+                     m,
+                     n,
+                     k,
+                     alpha,
+                     shiftdata.dptr_,  // first matrix: data shifted to uint8
+                     k,
+                     oa,
+                     weight.data().dptr<SrcType>(),  // second matrix: weight (CblasTrans)
+                     k,
+                     ob,
+                     beta,
+                     out.data().dptr<int32_t>(),  // int32 output, pre-filled above
+                     n,
+                     &oc);
+#else
+  LOG(FATAL) << "Quantized fully connected operator relies on cblas_gemm_s8u8s32"
+             << " which is only supported by MKL BLAS."
+             << " Please build MXNet with USE_BLAS=mkl to leverage this operator.";
+#endif  // MSHADOW_USE_MKL == 1
+}
+
 NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
 .describe(R"code(Fully Connected operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -112,7 +263,14 @@ and max thresholds representing the threholds for quantizing the float32 output
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedFullyConnectedShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedFullyConnectedType)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyConnectedStorageType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.set_attr<FComputeEx>("FComputeEx<cpu>",
+    QuantizedFullyConnectedForward<int8_t>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "weight.")
 .add_argument("bias", "NDArray-or-Symbol", "bias.")
@@ -135,6 +293,5 @@ NNVM_REGISTER_OP(FullyConnected)
     }
     return node;
   });
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
index dfa98d1f5ee9..65e0e5c4b27a 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc
@@ -626,8 +626,12 @@ std::vector<std::pair<int, int>> SgMKLDNNConvInplaceOption(
 }
 
 nnvm::NodePtr SgMKLDNNConvQuantizedOp(const NodeAttrs& attrs) {
+  auto const &param = nnvm::get<MKLDNNConvFusionParam>(attrs.parsed);
   nnvm::NodePtr node = nnvm::Node::Create();
   node->attrs.op = Op::Get("_sg_mkldnn_conv");
+  CHECK_EQ(param.full_conv_param.conv_param.kernel.ndim(), 2U)
+      << "Quantized Convolution of MKL-DNN only supports 2D kernel currently. "
+      << "Please exclude this layer from the quantized model.";
   node->attrs.name = "quantized_" + attrs.name;
   node->attrs.dict = attrs.dict;
   node->attrs.dict["quantized"] = "true";
diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h
index 83b86bf1d94c..8d5ad055b118 100644
--- a/src/operator/tensor/elemwise_unary_op.h
+++ b/src/operator/tensor/elemwise_unary_op.h
@@ -29,11 +29,15 @@
 #include <vector>
 #include <utility>
 #include <algorithm>
+#include <climits>
 #include "./cast_storage-inl.h"
 #include "../mshadow_op.h"
 #include "../mxnet_op.h"
 #include "../elemwise_op_common.h"
 #include "../../ndarray/ndarray_function.h"
+#if MSHADOW_USE_MKL == 1
+#include "mkl.h"
+#endif
 
 namespace mxnet {
 namespace op {
@@ -348,6 +352,43 @@ class UnaryOp : public OpBase {
       LogUnimplementedOp(attrs, ctx, inputs, req, outputs);
     }
   }
+
+#if MSHADOW_USE_MKL == 1
+  static inline void MKLLog(MKL_INT size, const float* pIn, float* pOut) {
+    vsLn(size, pIn, pOut);  // MKL natural log over `size` floats
+  }
+
+  static inline void MKLLog(MKL_INT size, const double* pIn, double* pOut) {
+    vdLn(size, pIn, pOut);  // MKL natural log over `size` doubles
+  }
+#endif  // MSHADOW_USE_MKL == 1
+
+  /*!
+   * \brief Natural log forward; uses the MKL vector log when it is applicable.
+   */
+  template<typename xpu, typename OP>
+  static void LogCompute(const nnvm::NodeAttrs& attrs,
+                         const OpContext& ctx,
+                         const std::vector<TBlob>& inputs,
+                         const std::vector<OpReqType>& req,
+                         const std::vector<TBlob>& outputs) {
+    if (req[0] == kNullOp) return;
+#if MSHADOW_USE_MKL == 1
+    // MKL path: only for kWriteTo requests on float32/float64 data whose
+    // element count is representable as MKL_INT.
+    const size_t MKL_INT_MAX = (sizeof(MKL_INT) == sizeof(int)) ? INT_MAX : LLONG_MAX;
+    const auto dtype = inputs[0].type_flag_;
+    const size_t num_elems = inputs[0].Size();
+    const bool is_float_or_double = dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64;
+    if (req[0] == kWriteTo && num_elems <= MKL_INT_MAX && is_float_or_double) {
+      MSHADOW_SGL_DBL_TYPE_SWITCH(dtype, DType, {
+        MKLLog(num_elems, inputs[0].dptr<DType>(), outputs[0].dptr<DType>());
+      });
+      return;
+    }
+#endif
+    Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
+  }
 };
 
 /*! \brief Map legacy unary_bwd to backward_grad */
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc
index 301fc48d2128..7f69395d1c87 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cc
+++ b/src/operator/tensor/elemwise_unary_op_basic.cc
@@ -236,6 +236,20 @@ NNVM_REGISTER_OP(_backward_copy)
     return std::vector<bool>{true};
   });
 
+// Backward op for reshape-style operators: forwards the gradient unchanged
+// through the identity kernel and may run in place on its single input.
+NNVM_REGISTER_OP(_backward_reshape)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity",
+  [](const NodeAttrs& attrs) { return std::vector<bool>{true}; });
+
 MXNET_OPERATOR_REGISTER_UNARY(BlockGrad)
 MXNET_ADD_SPARSE_OP_ALIAS(stop_gradient)
 .add_alias("stop_gradient")
@@ -940,7 +954,7 @@ The storage type of ``exp`` output is always dense
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_mul"});
 
 // log
-MXNET_OPERATOR_REGISTER_UNARY_WITH_SPARSE_DR(log, cpu, mshadow_op::log)
+MXNET_OPERATOR_REGISTER_UNARY(log)
 MXNET_ADD_SPARSE_OP_ALIAS(log)
 .describe(R"code(Returns element-wise Natural logarithmic value of the input.
 
@@ -949,6 +963,7 @@ The natural logarithm is logarithm in base *e*, so that ``log(exp(x)) = x``
 The storage type of ``log`` output is always dense
 
 )code" ADD_FILELINE)
+.set_attr<FCompute>("FCompute<cpu>", UnaryOp::LogCompute<cpu, mshadow_op::log>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_log"});
 
 // log10
diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu
index c28934e94658..14f2be02ab1a 100644
--- a/src/operator/tensor/elemwise_unary_op_basic.cu
+++ b/src/operator/tensor/elemwise_unary_op_basic.cu
@@ -68,6 +68,10 @@ NNVM_REGISTER_OP(_copy)
 .set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::IdentityComputeEx<gpu>);
 
 NNVM_REGISTER_OP(_backward_copy)
+.set_attr<FCompute>("FCompute<gpu>", UnaryOp::IdentityCompute<gpu>)
+.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::IdentityComputeEx<gpu>);
+
+NNVM_REGISTER_OP(_backward_reshape)
 .set_attr<FCompute>("FCompute<gpu>", UnaryOp::IdentityCompute<gpu>);
 
 NNVM_REGISTER_OP(BlockGrad)
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 0faa668caf97..db8efa454385 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -103,6 +103,57 @@ DMLC_REGISTER_PARAMETER(StackParam);
 DMLC_REGISTER_PARAMETER(SqueezeParam);
 DMLC_REGISTER_PARAMETER(DepthToSpaceParam);
 
+#if MXNET_USE_MKLDNN == 1
+// Reorders in_data's MKL-DNN memory into out_data's buffer using the default layout.
+void MKLDNNReshape(const NDArray &in_data, const NDArray &out_data) {
+  MSHADOW_TYPE_SWITCH(in_data.dtype(), DType, {
+    auto src_mem = in_data.GetMKLDNNData();
+    auto dst_ptr = out_data.data().dptr<DType>();
+    mkldnn::memory::primitive_desc src_pd = src_mem->get_primitive_desc();
+    mkldnn::memory::desc src_md = src_pd.desc();
+    mkldnn::memory::dims dims(src_md.data.dims, src_md.data.dims + src_md.data.ndims);
+    auto dst_dtype = static_cast<mkldnn::memory::data_type>(src_md.data.data_type);
+    auto dst_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(src_md));
+    // Destination descriptor: same dims, data type and engine as the source.
+    mkldnn::memory::desc dst_md(dims, dst_dtype, dst_format);
+    mkldnn::memory::primitive_desc dst_pd(dst_md, src_pd.get_engine());
+    auto dst_mem = mkldnn::memory(dst_pd, dst_ptr);
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*src_mem, dst_mem));
+    MKLDNNStream::Get()->Submit();
+    // Drop out_data's MKL-DNN memory so that it is treated as default-format data.
+    const_cast<NDArray &>(out_data).InvalidateMKLDNNData();
+  });
+}
+
+static void ReshapeComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  // The MKL-DNN reorder is used only when MKL-DNN supports the input's data
+  // type and shape and the request is not kAddTo; every other case falls
+  // back to the plain identity kernel.
+  if (!SupportMKLDNNArray(inputs[0].dtype(), inputs[0].shape()) || req[0] == kAddTo) {
+    FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  MKLDNNReshape(inputs[0], outputs[0]);
+}
+
+// Storage-type inference for Reshape: one input, one output, decided by MKLDNNStorageType.
+inline static bool ReshapeStorageType(const nnvm::NodeAttrs& attrs,
+                                      const int dev_mask,
+                                      DispatchMode* dispatch_mode,
+                                      std::vector<int>* in_attrs,
+                                      std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+}
+#endif
+
 NNVM_REGISTER_OP(Reshape)
 .add_alias("reshape")
 .describe(R"code(Reshapes the input array.
@@ -172,8 +223,16 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .set_attr_parser(ParamParser<ReshapeParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", ReshapeShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_copy"})
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_reshape"})
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ReshapeComputeExCPU)
+.set_attr<FInferStorageType>("FInferStorageType", ReshapeStorageType)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#else
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
@@ -182,6 +241,7 @@ If the argument `reverse` is set to 1, then the special values are inferred from
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
   })
+#endif
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
@@ -210,6 +270,7 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs,
 #endif
 }
 
+#if MXNET_USE_MKLDNN == 1
 static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
                                    const int dev_mask,
                                    DispatchMode* dispatch_mode,
@@ -217,17 +278,10 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
                                    std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode,
-                                                            in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask
-      && in_attrs->at(0) == kDefaultStorage
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
+#endif
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -261,7 +315,9 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
+#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
@@ -359,7 +415,7 @@ will return a new array with shape ``(2,1,3,4)``.
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
   })
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_copy"})
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_reshape"})
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 .add_argument("data", "NDArray-or-Symbol", "Source input")
 .add_arguments(ExpandDimParam::__FIELDS__());
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 43e98fe04a1f..25ad61efb232 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -43,12 +43,12 @@ class CPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 
  private:
   /*!
@@ -63,7 +63,8 @@ class CPUDeviceStorage {
 #endif
 };  // class CPUDeviceStorage
 
-inline void* CPUDeviceStorage::Alloc(size_t size) {
+inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ptr;
 #if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
@@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
   return ptr;
 }
 
-inline void CPUDeviceStorage::Free(void* ptr) {
+inline void CPUDeviceStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if _MSC_VER
   _aligned_free(ptr);
 #else
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 435c7e81d2ae..562badb8752e 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -46,17 +46,19 @@ class GPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };  // class GPUDeviceStorage
 
-inline void* GPUDeviceStorage::Alloc(size_t size) {
+inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ret = nullptr;
 #if MXNET_USE_CUDA
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
@@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
   return ret;
 }
 
-inline void GPUDeviceStorage::Free(void* ptr) {
+inline void GPUDeviceStorage::Free(Storage::Handle handle) {
 #if MXNET_USE_CUDA
+  void * ptr = handle.dptr;
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index b05b242a799e..55112b5a82e9 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
   void Free(Storage::Handle handle) override;
 
   void DirectFree(Storage::Handle handle) override {
-    DeviceStorage::Free(handle.dptr);
+    DeviceStorage::Free(handle);
   }
 
  private:
@@ -58,12 +58,12 @@ class NaiveStorageManager final : public StorageManager {
 
 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
-  handle->dptr = DeviceStorage::Alloc(handle->size);
+  handle->dptr = DeviceStorage::Alloc(handle);
 }
 
 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
-  DeviceStorage::Free(handle.dptr);
+  DeviceStorage::Free(handle);
 }
 
 }  // namespace storage
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index e3fec2f4a06d..c4ababbdc03a 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -41,29 +41,33 @@ class PinnedMemoryStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
 
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };
 
-inline void* PinnedMemoryStorage::Alloc(size_t size) {
+inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
   void* ret = nullptr;
+  const size_t size = handle->size;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
   // make the memory available across all devices
   CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
   return ret;
 }
 
-inline void PinnedMemoryStorage::Free(void* ptr) {
+inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
   cudaError_t err = cudaFreeHost(ptr);
   // ignore unloading error, as memory has already been recycled
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index cade8d9495f4..c407a9f00cb6 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {
 
  private:
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = RoundAllocSize(handle.size);
     // ignore unloading error, as memory has already been recycled
@@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = RoundAllocSize(handle->size);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
@@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   }
 
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = get_size(get_bucket(handle.size));
     // ignore unloading error, as memory has already been recycled
@@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = get_size(bucket);
   auto&& reuse_pool = memory_pool_[bucket];
   if (reuse_pool.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index c7100a456d80..911d30cc3f05 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -48,35 +48,6 @@ class StorageImpl : public Storage {
   static int num_gpu_device;
 #endif  // MXNET_USE_CUDA
 
-  static void ActivateDevice(Context ctx) {
-    switch (ctx.dev_type) {
-      case Context::kCPU:
-        break;
-      case Context::kCPUPinned:
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      case Context::kCPUShared: {
-#if defined(ANDROID) || defined(__ANDROID__)
-        LOG(FATAL) << "Unimplemented device";
-#endif  // defined(ANDROID) || defined(__ANDROID__)
-      }
-        break;
-      case Context::kGPU: {
-#if MXNET_USE_CUDA
-          if (num_gpu_device > 0) {
-            CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-          }
-#endif  // MXNET_USE_CUDA
-          break;
-        }
-      default:
-        LOG(FATAL) << "Unimplemented device";
-    }
-  }
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
@@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
           case Context::kCPUShared: {
 #if !defined(ANDROID) && !defined(__ANDROID__)
             ptr = new storage::CPUSharedStorageManager();
+#else
+            LOG(FATAL) << "Unimplemented device";
 #endif  // !defined(ANDROID) && !defined(__ANDROID__)
             break;
           }
@@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
         return ptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
-                 handle->ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(handle->ctx);
   manager->Alloc(handle);
   profiler_.OnAlloc(*handle);
 }
@@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
         return nullptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->Free(handle);
   profiler_.OnFree(handle);
 }
@@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
         return nullptr;
       });
 
-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->DirectFree(handle);
   profiler_.OnFree(handle);
 }
diff --git a/tests/cpp/operator/krprod_test.cc b/tests/cpp/operator/krprod_test.cc
index 26c2661bc352..66ddddd771f8 100644
--- a/tests/cpp/operator/krprod_test.cc
+++ b/tests/cpp/operator/krprod_test.cc
@@ -250,6 +250,8 @@ TEST(row_wise_kronecker, FourInputMatrices) {
   FreeSpace(&result);
 }
 
+
+#if MXNET_USE_LAPACK == 1
 TEST(khatri_rao, OneInputMatrix) {
   // Input matrices of shape (2, 4) which is also the expected result
   DType mat[8] {1, 2, 3, 4, 5, 6, 7, 8};
@@ -444,5 +446,6 @@ TEST(inv_khatri_rao, ThreeInputMatricesTranposed) {
   FreeSpace(&kr_t);
   FreeSpace(&actual_dot);
 }
+#endif  // MXNET_USE_LAPACK == 1
 }  // namespace op
 }  // namespace mxnet
diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk
index 665ce6982874..746ee2f096f1 100644
--- a/tests/cpp/unittest.mk
+++ b/tests/cpp/unittest.mk
@@ -41,22 +41,22 @@ gtest-all.o : $(GTEST_SRCS_)
 gtest.a : gtest-all.o
 	$(AR) $(ARFLAGS) $@ $^
 
-build/tests/cpp/%.o : tests/cpp/%.cc
+build/tests/cpp/%.o : tests/cpp/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/$* $< > build/tests/cpp/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc
+build/tests/cpp/operator/%.o : tests/cpp/operator/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/operator/$* $< > build/tests/cpp/operator/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/operator/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc
+build/tests/cpp/storage/%.o : tests/cpp/storage/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/storage/$* $< > build/tests/cpp/storage/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/storage/$*.o $(filter %.cc %.a, $^)
 
-build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc
+build/tests/cpp/engine/%.o : tests/cpp/engine/%.cc | mkldnn
 	@mkdir -p $(@D)
 	$(CXX) -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -MM -MT tests/cpp/engine/$* $< > build/tests/cpp/engine/$*.d
 	$(CXX) -c -std=c++11 $(TEST_CFLAGS) -I$(GTEST_INC) -o build/tests/cpp/engine/$*.o $(filter %.cc %.a, $^)
diff --git a/tests/jenkins/run_test.sh b/tests/jenkins/run_test.sh
index bc69ca1d7f39..5ef8c1ac01ef 100755
--- a/tests/jenkins/run_test.sh
+++ b/tests/jenkins/run_test.sh
@@ -56,8 +56,8 @@ nosetests3 --verbose tests/python/train || exit -1
 
 echo "BUILD scala_test"
 export PATH=$PATH:/opt/apache-maven/bin
-make scalapkg || exit -1
-make scalatest || exit -1
+cd scala-package
+mvn install || exit -1
 
 # echo "BUILD julia_test"
 # export MXNET_HOME="${PWD}"
diff --git a/tests/jenkins/run_test_amzn_linux_gpu.sh b/tests/jenkins/run_test_amzn_linux_gpu.sh
index ecfb5211b9e6..57d9c7884088 100755
--- a/tests/jenkins/run_test_amzn_linux_gpu.sh
+++ b/tests/jenkins/run_test_amzn_linux_gpu.sh
@@ -65,5 +65,5 @@ nosetests3 --verbose tests/python/train
 #julia -e 'try Pkg.clone("MXNet"); catch end; Pkg.checkout("MXNet"); Pkg.build("MXNet"); Pkg.test("MXNet")' || exit -1
 
 echo "BUILD scala_test"
-make scalapkg
-make scalatest
+cd scala-package
+mvn integration-test
diff --git a/tests/jenkins/run_test_ubuntu.sh b/tests/jenkins/run_test_ubuntu.sh
index cdddd2865ddc..28e00331b47e 100755
--- a/tests/jenkins/run_test_ubuntu.sh
+++ b/tests/jenkins/run_test_ubuntu.sh
@@ -68,6 +68,6 @@ nosetests3 --verbose tests/python/train || exit 1
 
 echo "BUILD scala_test"
 export PATH=$PATH:/opt/apache-maven/bin
-make scalapkg || exit 1
-make scalatest || exit 1
+cd scala-package
+mvn integration-test || exit 1
 
diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile
index d769f08abfc5..b8e2849fd6a3 100755
--- a/tests/nightly/Jenkinsfile
+++ b/tests/nightly/Jenkinsfile
@@ -113,6 +113,14 @@ core_logic: {
         }
       }
     },
+    'Java Demo: CPU': {
+      node(NODE_LINUX_CPU) {
+        ws('workspace/java-demo') {
+          utils.init_git()
+          utils.docker_run('ubuntu_nightly_cpu', 'nightly_java_demo_test_cpu', false)
+        }
+      }
+    },
     'MXNetJS: CPU': {
       node(NODE_LINUX_CPU) {
         ws('workspace/nt-mxnetjs') {
diff --git a/tests/nightly/apache_rat_license_check/rat-excludes b/tests/nightly/apache_rat_license_check/rat-excludes
index a488eb84d069..5969f01a3225 100755
--- a/tests/nightly/apache_rat_license_check/rat-excludes
+++ b/tests/nightly/apache_rat_license_check/rat-excludes
@@ -5,19 +5,15 @@
 .*ipynb
 .*html
 .*json
-.*js
 .*txt
 .*md
 3rdparty/*
 R-package/*
-src/operator/mkl/*
 trunk/*
 docker/*
-docker_multiarch/*
 .*\\.m
 .*\\.mk
 .*\\.R
-Dockerfile*
 .*svg
 .*cfg
 .*config
@@ -27,21 +23,15 @@ build/*
 .*\\.t
 MANIFEST
 Changes
-META.yml
 .*csv
 .*names
 CODEOWNERS
-prepare_mkl.sh
-readthedocs.yml
 snap.python
-snapcraft.yaml
-image-classification-predict.cc
 bbox.pyx
 cpu_nms.pyx
 gpu_nms.pyx
 nms_kernel.cu
 _mask.pyx
-unicodemap_en_baidu.csv
 coco.py
 base.pyi
 special_functions-inl.h
@@ -50,7 +40,6 @@ im2col.h
 pool.h
 README.rst
 dataset.cPickle
-rcnn/*
 image-classification/*
 rat-excludes
 apache-rat-tasks/*
@@ -59,5 +48,5 @@ deformable_im2col.cuh
 deformable_im2col.h
 REQUIRE
 include/*
-*/test/test-symbol.json.ref
-*/profiler/test/profile-matmul-20iter.json.ref
\ No newline at end of file
+.*.iml
+.*.json.ref
\ No newline at end of file
diff --git a/tests/nightly/dist_async_kvstore.py b/tests/nightly/dist_async_kvstore.py
index 3e400eafa045..b990b6b3f13e 100644
--- a/tests/nightly/dist_async_kvstore.py
+++ b/tests/nightly/dist_async_kvstore.py
@@ -27,22 +27,26 @@
 nworker = kv.num_workers
 
 def test_gluon_trainer_type():
-    def check_trainer_kv_update(update_on_kv):
+    def check_trainer_kv_update(weight_stype, update_on_kv):
         params = mx.gluon.ParameterDict()
-        x = params.get('x', shape=(10,1), lr_mult=1.0)
+        x = params.get('x', shape=(10,1), lr_mult=1.0, stype=weight_stype)
         params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
         try:
-            trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv, update_on_kvstore=update_on_kv)
+            trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1},
+                                       kvstore=kv, update_on_kvstore=update_on_kv)
             trainer._init_kvstore()
             assert trainer._kv_initialized
             assert trainer._update_on_kvstore is True
         except ValueError:
             assert update_on_kv is False
 
-    check_trainer_kv_update(False)
-    check_trainer_kv_update(True)
-    check_trainer_kv_update(None)
+    check_trainer_kv_update('default', False)
+    check_trainer_kv_update('default', True)
+    check_trainer_kv_update('default', None)
+    check_trainer_kv_update('row_sparse', False)
+    check_trainer_kv_update('row_sparse', True)
+    check_trainer_kv_update('row_sparse', None)
     print('worker ' + str(my_rank) + ' passed test_gluon_trainer_type')
 
 if __name__ == "__main__":
-    test_gluon_trainer_type()
\ No newline at end of file
+    test_gluon_trainer_type()
diff --git a/tests/nightly/dist_sync_kvstore.py b/tests/nightly/dist_sync_kvstore.py
index 861b85913ac8..4523a361cf88 100644
--- a/tests/nightly/dist_sync_kvstore.py
+++ b/tests/nightly/dist_sync_kvstore.py
@@ -376,18 +376,26 @@ def check_invalid_pull():
     check_invalid_pull()
 
 def test_gluon_trainer_type():
-    def check_trainer_kv_type(stype, grad_stype, update_on_kv):
+    def check_trainer_kv_type(stype, grad_stype, update_on_kv, expected):
         params = mx.gluon.ParameterDict()
         x = params.get('x', shape=(10,1), lr_mult=1.0, stype=stype, grad_stype=grad_stype)
         params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
-        trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
-        trainer._init_kvstore()
-        assert trainer._kv_initialized
-        assert trainer._update_on_kvstore is update_on_kv
-
-    check_trainer_kv_type('default', 'default', False)
-    check_trainer_kv_type('default', 'row_sparse', True)
-    check_trainer_kv_type('row_sparse', 'row_sparse', True)
+        trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1},
+                                   kvstore=kv, update_on_kvstore=update_on_kv)
+        try:
+            trainer._init_kvstore()
+            assert trainer._kv_initialized
+            assert trainer._update_on_kvstore is expected
+        except Exception as err:
+            assert isinstance(err, expected)
+
+    check_trainer_kv_type('default', 'default', None, True)
+    check_trainer_kv_type('default', 'default', True, True)
+    check_trainer_kv_type('default', 'default', False, False)
+    check_trainer_kv_type('default', 'row_sparse', None, True)
+    check_trainer_kv_type('default', 'row_sparse', False, ValueError)
+    check_trainer_kv_type('row_sparse', 'row_sparse', None, True)
+    check_trainer_kv_type('row_sparse', 'row_sparse', False, ValueError)
     print('worker ' + str(my_rank) + ' passed test_gluon_trainer_type')
 
 def test_gluon_trainer_step():
diff --git a/tests/python-pytest/onnx/README.md b/tests/python-pytest/onnx/README.md
new file mode 100644
index 000000000000..d8f58cba3d5c
--- /dev/null
+++ b/tests/python-pytest/onnx/README.md
@@ -0,0 +1,33 @@
+# ONNX tests
+
+## Directory structure:
+
+```bash
+.
+├── README.md
+├── backend.py
+├── backend_rep.py
+├── backend_test.py
+├── gluon_backend_test.py
+├── mxnet_backend_test.py
+├── mxnet_export_test.py
+├── test_cases.py
+├── test_models.py
+└── test_node.py
+```
+
+* `backend.py` - MXNetBackend. This file contains prepare(). \
+This class can be used for both, MXNet and Gluon backend.
+* `backend_rep.py` - MXNetBackendRep and GluonBackendRep for running inference
+* `backend_test.py` - prepare tests by including tests from `test_cases.py`
+* `gluon_backend_test.py` - Set backend as gluon and execute ONNX tests for ONNX->Gluon import.
+* `mxnet_backend_test.py` - Set backend as mxnet and add tests for ONNX->MXNet import/export.
+Since the MXNetBackend export path tests both import and export, the test list in this file is
+a union of tests that execute for import and export, export alone, and import alone.
+* `mxnet_export_test.py` - Execute unit tests for testing MXNet export code - this is not specific to
+any operator.
+* `test_cases.py` - list of test cases for operators/models that are supported
+for "both", import and export, "import" alone, or "export" alone.
+* `test_models.py` - custom tests for models
+* `test_node.py` - custom tests for operators. These tests are written independent of ONNX tests, in case
+ONNX doesn't have tests yet or for MXNet specific operators.
\ No newline at end of file
diff --git a/tests/python-pytest/onnx/export/backend.py b/tests/python-pytest/onnx/backend.py
similarity index 57%
rename from tests/python-pytest/onnx/export/backend.py
rename to tests/python-pytest/onnx/backend.py
index 3ea1dafca255..2f9e2470d225 100644
--- a/tests/python-pytest/onnx/export/backend.py
+++ b/tests/python-pytest/onnx/backend.py
@@ -16,51 +16,57 @@
 # under the License.
 
 # coding: utf-8
-"""backend wrapper for onnx test infrastructure"""
-import os
-import sys
-import numpy as np
+"""MXNet/Gluon backend wrapper for onnx test infrastructure"""
+
 from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto
 from mxnet.contrib.onnx.mx2onnx.export_onnx import MXNetGraph
+import mxnet as mx
+import numpy as np
+
 try:
     from onnx import helper, TensorProto, mapping
     from onnx.backend.base import Backend
 except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed")
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(CURR_PATH, '../'))
-from backend_rep import MXNetBackendRep
+    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
+                      + " install - /~https://github.com/onnx/onnx#installation")
+from backend_rep import MXNetBackendRep, GluonBackendRep
+
 
-# Using these functions for onnx test infrastructure.
-# Implemented by following onnx docs guide:
-# /~https://github.com/onnx/onnx/blob/master/docs/Implementing%20an%20ONNX%20backend.md
 # MXNetBackend class will take an ONNX model with inputs, perform a computation,
 # and then return the output.
+# Implemented by following onnx docs guide:
+# /~https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
 
 class MXNetBackend(Backend):
-    """MXNet backend for ONNX"""
+    """MXNet/Gluon backend for ONNX"""
+
+    backend = 'mxnet'
+    operation = 'import'
+
+    @classmethod
+    def set_params(cls, backend, operation):
+        cls.backend = backend
+        cls.operation = operation
 
     @staticmethod
-    def perform_import_export(graph_proto, input_shape):
+    def perform_import_export(sym, arg_params, aux_params, input_shape):
         """ Import ONNX model to mxnet model and then export to ONNX model
             and then import it back to mxnet for verifying the result"""
         graph = GraphProto()
 
-        sym, arg_params, aux_params = graph.from_onnx(graph_proto)
-
         params = {}
         params.update(arg_params)
         params.update(aux_params)
         # exporting to onnx graph proto format
         converter = MXNetGraph()
-        graph_proto = converter.create_onnx_graph_proto(sym, params, in_shape=input_shape, in_type=mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('float32')])
+        graph_proto = converter.create_onnx_graph_proto(sym, params, in_shape=input_shape,
+                                                        in_type=mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype('float32')])
 
         # importing back to MXNET for verifying result.
         sym, arg_params, aux_params = graph.from_onnx(graph_proto)
 
         return sym, arg_params, aux_params
 
-
     @classmethod
     def prepare(cls, model, device='CPU', **kwargs):
         """For running end to end model(used for onnx test backend)
@@ -80,13 +86,31 @@ def prepare(cls, model, device='CPU', **kwargs):
             Returns object of MXNetBackendRep class which will be in turn
             used to run inference on the input model and return the result for comparison.
         """
+        backend = kwargs.get('backend', cls.backend)
+        operation = kwargs.get('operation', cls.operation)
 
         graph = GraphProto()
-        metadata = graph.get_graph_metadata(model.graph)
-        input_data = metadata['input_tensor_data']
-        input_shape = [data[1] for data in input_data]
-        sym, arg_params, aux_params = MXNetBackend.perform_import_export(model.graph, input_shape)
-        return MXNetBackendRep(sym, arg_params, aux_params, device)
+        if device == 'CPU':
+            ctx = mx.cpu()
+        else:
+            raise NotImplementedError("ONNX tests are run only for CPU context.")
+
+        if backend == 'mxnet':
+            sym, arg_params, aux_params = graph.from_onnx(model.graph)
+            if operation == 'export':
+                metadata = graph.get_graph_metadata(model.graph)
+                input_data = metadata['input_tensor_data']
+                input_shape = [data[1] for data in input_data]
+                sym, arg_params, aux_params = MXNetBackend.perform_import_export(sym, arg_params, aux_params,
+                                                                                 input_shape)
+
+            return MXNetBackendRep(sym, arg_params, aux_params, device)
+        elif backend == 'gluon':
+            if operation == 'import':
+                net = graph.graph_to_gluon(model.graph, ctx)
+                return GluonBackendRep(net, device)
+            elif operation == 'export':
+                raise NotImplementedError("Gluon->ONNX export not implemented.")
 
     @classmethod
     def supports_device(cls, device):
@@ -96,6 +120,4 @@ def supports_device(cls, device):
 
 prepare = MXNetBackend.prepare
 
-run_node = MXNetBackend.run_node
-
 supports_device = MXNetBackend.supports_device
diff --git a/tests/python-pytest/onnx/backend_rep.py b/tests/python-pytest/onnx/backend_rep.py
index 63836ac848df..be6bc88d9d70 100644
--- a/tests/python-pytest/onnx/backend_rep.py
+++ b/tests/python-pytest/onnx/backend_rep.py
@@ -22,7 +22,9 @@
 except ImportError:
     raise ImportError("Onnx and protobuf need to be installed. Instructions to"
                       + " install - /~https://github.com/onnx/onnx#installation")
+import numpy as np
 import mxnet as mx
+from mxnet import nd
 
 # Using these functions for onnx test infrastructure.
 # Implemented by following onnx docs guide:
@@ -80,5 +82,56 @@ def run(self, inputs, **kwargs):
         args = dict(zip(data_names, data_forward))
         exe = self.symbol.bind(ctx, args=args, aux_states=self.aux_params)
         exe.forward(is_train=False)
-        result = exe.outputs[0].asnumpy()
-        return [result]
+        result = []
+        for output in exe.outputs:
+            result.append(output.asnumpy())
+        return result
+
+
+# GluonBackendRep object will be returned by MXNetBackend's prepare method (when the backend is
+# set to 'gluon') and is used to execute a model repeatedly.
+# Inputs will be passed to the run method of GluonBackendRep class, which will perform computation
+# and retrieve the corresponding results for comparison to the onnx backend.
+# /~https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py.
+# Implemented by following onnx docs guide:
+# /~https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
+
+class GluonBackendRep(BackendRep):
+    """Running model inference on gluon backend and return the result
+     to onnx test infrastructure for comparison."""
+    def __init__(self, net, device):
+        self.net = net
+        self.device = device
+
+    def run(self, inputs, **kwargs):
+        """Run model inference and return the result
+
+        Parameters
+        ----------
+        inputs : numpy array
+            input to run a layer on
+
+        Returns
+        -------
+        params : numpy array
+            result obtained after running the inference on mxnet
+        """
+        # create module, passing cpu context
+        if self.device == 'CPU':
+            ctx = mx.cpu()
+        else:
+            raise NotImplementedError("ONNX tests are run only for CPU context.")
+
+        # run inference
+        net_inputs = [nd.array(input_data, ctx=ctx) for input_data in inputs]
+        net_outputs = self.net(*net_inputs)
+        results = []
+        if isinstance(net_outputs, list):
+            for output in net_outputs:
+                results.append(output.asnumpy())
+            result = results
+        else:
+            results.extend([o for o in net_outputs.asnumpy()])
+            result = [np.array(results)]
+
+        return result
diff --git a/tests/python-pytest/onnx/backend_test.py b/tests/python-pytest/onnx/backend_test.py
new file mode 100644
index 000000000000..8eaa303a6c1b
--- /dev/null
+++ b/tests/python-pytest/onnx/backend_test.py
@@ -0,0 +1,89 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""ONNX test backend wrapper"""
+try:
+    import onnx.backend.test
+except ImportError:
+    raise ImportError("Onnx and protobuf need to be installed")
+
+import test_cases
+import unittest
+import backend as mxnet_backend
+import logging
+
+operations = ['import', 'export']
+backends = ['mxnet', 'gluon']
+# This is a pytest magic variable to load extra plugins
+pytest_plugins = "onnx.backend.test.report",
+
+
+def test_suite(backend_tests):  # type: () -> unittest.TestSuite
+    '''
+    TestSuite that can be run by TestRunner
+    This has been borrowed from onnx/onnx/backend/test/runner/__init__.py,
+    since Python3 cannot sort objects of type 'Type' as Runner.test_suite()
+    expects.
+    '''
+    suite = unittest.TestSuite()
+    for case in backend_tests.test_cases.values():
+        suite.addTests(unittest.defaultTestLoader.loadTestsFromTestCase(case))
+    return suite
+
+
+def prepare_tests(backend, oper):
+    """
+    Prepare the test list
+    :param backend: mxnet/gluon backend
+    :param oper: str. export or import
+    :return: backend test list
+    """
+    BACKEND_TESTS = onnx.backend.test.BackendTest(backend, __name__)
+    implemented_ops = test_cases.IMPLEMENTED_OPERATORS_TEST.get('both', []) + \
+                      test_cases.IMPLEMENTED_OPERATORS_TEST.get(oper, [])
+
+    for op_test in implemented_ops:
+        BACKEND_TESTS.include(op_test)
+
+    basic_models = test_cases.BASIC_MODEL_TESTS.get('both', []) + \
+                   test_cases.BASIC_MODEL_TESTS.get(oper, [])
+
+    for basic_model_test in basic_models:
+        BACKEND_TESTS.include(basic_model_test)
+
+    std_models = test_cases.STANDARD_MODEL.get('both', []) + \
+                 test_cases.STANDARD_MODEL.get(oper, [])
+
+    for std_model_test in std_models:
+        BACKEND_TESTS.include(std_model_test)
+
+    # Tests for scalar ops are in test_node.py
+    BACKEND_TESTS.exclude('.*scalar.*')
+
+    return BACKEND_TESTS
+
+
+for bkend in backends:
+    for operation in operations:
+        log = logging.getLogger(bkend + operation)
+        if bkend == 'gluon' and operation == 'export':
+            log.warning('Gluon->ONNX export not implemented. Skipping tests...')
+            continue
+        log.info('Executing tests for ' + bkend + ' backend: ' + operation)
+        mxnet_backend.MXNetBackend.set_params(bkend, operation)
+        BACKEND_TESTS = prepare_tests(mxnet_backend, operation)
+        unittest.TextTestRunner().run(test_suite(BACKEND_TESTS.enable_report()))
diff --git a/tests/python-pytest/onnx/export/mxnet_export_test.py b/tests/python-pytest/onnx/export/mxnet_export_test.py
deleted file mode 100644
index 22db0d637a3a..000000000000
--- a/tests/python-pytest/onnx/export/mxnet_export_test.py
+++ /dev/null
@@ -1,431 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Tests for individual operators
-This module contains operator tests which currently do not exist on
-ONNX backend test framework. Once we have PRs on the ONNX repo and get
-those PRs merged, this file will get EOL'ed.
-"""
-# pylint: disable=too-many-locals,wrong-import-position,import-error
-from __future__ import absolute_import
-import sys
-import os
-import unittest
-import logging
-import tarfile
-import tempfile
-from collections import namedtuple
-import numpy as np
-import numpy.testing as npt
-from onnx import numpy_helper, helper
-from onnx import TensorProto
-from mxnet import nd, sym
-from mxnet.gluon import nn
-from mxnet.test_utils import download
-from mxnet.contrib import onnx as onnx_mxnet
-import mxnet as mx
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(CURR_PATH, '../../../python/unittest'))
-import backend
-from common import with_seed
-
-logger = logging.getLogger()
-logger.setLevel(logging.DEBUG)
-URLS = {
-    'bvlc_googlenet':
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/bvlc_googlenet.tar.gz',
-    'bvlc_reference_caffenet':
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/bvlc_reference_caffenet.tar.gz',
-    'bvlc_reference_rcnn_ilsvrc13':
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/bvlc_reference_rcnn_ilsvrc13.tar.gz',
-    'inception_v1':
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/inception_v1.tar.gz',
-    'inception_v2':
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/inception_v2.tar.gz'
-}
-
-def get_test_files(name):
-    """Extract tar file and returns model path and input, output data"""
-    tar_name = download(URLS.get(name), dirname=CURR_PATH.__str__())
-    # extract tar file
-    tar_path = os.path.join(CURR_PATH, tar_name)
-    tar = tarfile.open(tar_path.__str__(), "r:*")
-    tar.extractall(path=CURR_PATH.__str__())
-    tar.close()
-    data_dir = os.path.join(CURR_PATH, name)
-    model_path = os.path.join(data_dir, 'model.onnx')
-
-    inputs = []
-    outputs = []
-    # get test files
-    for test_file in os.listdir(data_dir):
-        case_dir = os.path.join(data_dir, test_file)
-        # skip the non-dir files
-        if not os.path.isdir(case_dir):
-            continue
-        input_file = os.path.join(case_dir, 'input_0.pb')
-        input_tensor = TensorProto()
-        with open(input_file, 'rb') as proto_file:
-            input_tensor.ParseFromString(proto_file.read())
-        inputs.append(numpy_helper.to_array(input_tensor))
-
-        output_tensor = TensorProto()
-        output_file = os.path.join(case_dir, 'output_0.pb')
-        with open(output_file, 'rb') as proto_file:
-            output_tensor.ParseFromString(proto_file.read())
-        outputs.append(numpy_helper.to_array(output_tensor))
-
-    return model_path, inputs, outputs
-
-
-def forward_pass(sym, arg, aux, data_names, input_data):
-    """ Perform forward pass on given data"""
-    # create module
-    mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
-    mod.bind(for_training=False, data_shapes=[(data_names[0], input_data.shape)], label_shapes=None)
-    mod.set_params(arg_params=arg, aux_params=aux,
-                   allow_missing=True, allow_extra=True)
-    # run inference
-    batch = namedtuple('Batch', ['data'])
-    mod.forward(batch([mx.nd.array(input_data)]), is_train=False)
-
-    return mod.get_outputs()[0].asnumpy()
-
-
-def test_models(model_name, input_shape, output_shape):
-    """ Tests Googlenet model for both onnx import and export"""
-    model_path, inputs, outputs = get_test_files(model_name)
-    logging.info("Translating model from ONNX model zoo to Mxnet")
-    sym, arg_params, aux_params = onnx_mxnet.import_model(model_path)
-    params = {}
-    params.update(arg_params)
-    params.update(aux_params)
-
-    dir_path = os.path.dirname(model_path)
-    new_model_name = "exported_" + model_name + ".onnx"
-    onnx_file = os.path.join(dir_path, new_model_name)
-
-    logging.info("Translating converted model from mxnet to ONNX")
-    converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file)
-
-    sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model_path)
-
-    metadata = onnx_mxnet.get_model_metadata(converted_model_path)
-    assert len(metadata) == 2
-    assert metadata.get('input_tensor_data')
-    assert metadata.get('input_tensor_data')[0][1] == input_shape
-    assert metadata.get('output_tensor_data')
-    assert metadata.get('output_tensor_data')[0][1] == output_shape
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    logging.info("Running inference on onnx re-import model in mxnet")
-    # run test for each test file
-    for input_data, output_data in zip(inputs, outputs):
-        result = forward_pass(sym, arg_params, aux_params, data_names, input_data)
-
-        # verify the results
-        npt.assert_equal(result.shape, output_data.shape)
-        npt.assert_almost_equal(output_data, result, decimal=3)
-    logging.info(model_name + " conversion successful")
-
-
-def test_model_accuracy(model_name, input_shape):
-    """ Imports ONNX model, runs inference, exports and imports back
-        run inference, compare result with the previous inference result"""
-    model_path, inputs, outputs = get_test_files(model_name)
-    logging.info("Translating model from ONNX model zoo to Mxnet")
-    sym, arg_params, aux_params = onnx_mxnet.import_model(model_path)
-
-    metadata = onnx_mxnet.get_model_metadata(model_path)
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    expected_result= []
-    for input_data, output_data in zip(inputs, outputs):
-        result = forward_pass(sym, arg_params, aux_params, data_names, input_data)
-        expected_result.append(result)
-
-    params = {}
-    params.update(arg_params)
-    params.update(aux_params)
-
-    dir_path = os.path.dirname(model_path)
-    new_model_name = "exported_" + model_name + ".onnx"
-    onnx_file = os.path.join(dir_path, new_model_name)
-
-    logging.info("Translating converted model from mxnet to ONNX")
-    converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32,
-                                                   onnx_file)
-
-    sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model_path)
-
-    metadata = onnx_mxnet.get_model_metadata(converted_model_path)
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    actual_result = []
-    for input_data, output_data in zip(inputs, outputs):
-        result = forward_pass(sym, arg_params, aux_params, data_names, input_data)
-        actual_result.append(result)
-
-    # verify the results
-    for expected, actual in zip(expected_result, actual_result):
-        npt.assert_equal(expected.shape, actual.shape)
-        npt.assert_almost_equal(expected, actual, decimal=3)
-
-@with_seed()
-def test_spacetodepth():
-    n, c, h, w = shape = (1, 1, 4, 6)
-    input1 = np.random.rand(n, c, h, w).astype("float32")
-    blocksize = 2
-    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=shape)]
-
-    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 4, 2, 3))]
-
-    nodes = [helper.make_node("SpaceToDepth", ["input1"], ["output"], block_size=blocksize)]
-
-    graph = helper.make_graph(nodes,
-                              "spacetodepth_test",
-                              inputs,
-                              outputs)
-
-    spacetodepth_model = helper.make_model(graph)
-
-    bkd_rep = backend.prepare(spacetodepth_model)
-    output = bkd_rep.run([input1])
-
-    tmp = np.reshape(input1, [n, c,
-                    h // blocksize, blocksize,
-                    w // blocksize, blocksize])
-    tmp = np.transpose(tmp, [0, 3, 5, 1, 2, 4])
-    numpy_op = np.reshape(tmp, [n, c * (blocksize**2),
-                    h // blocksize,
-                    w // blocksize])
-
-    npt.assert_almost_equal(output[0], numpy_op)
-
-@with_seed()
-def test_square():
-    input1 = np.random.randint(1, 10, (2, 3)).astype("float32")
-
-    ipsym = mx.sym.Variable("input1")
-    square = mx.sym.square(data=ipsym)
-    model = mx.mod.Module(symbol=square, data_names=['input1'], label_names=None)
-    model.bind(for_training=False, data_shapes=[('input1', np.shape(input1))], label_shapes=None)
-    model.init_params()
-
-    args, auxs = model.get_params()
-    params = {}
-    params.update(args)
-    params.update(auxs)
-
-    converted_model = onnx_mxnet.export_model(square, params, [np.shape(input1)], np.float32, "square.onnx")
-
-    sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model)
-    result = forward_pass(sym, arg_params, aux_params, ['input1'], input1)
-
-    numpy_op = np.square(input1)
-
-    npt.assert_almost_equal(result, numpy_op)
-
-
-def test_softmax():
-    input1 = np.random.rand(1000, 1000).astype("float32")
-    label1 = np.random.rand(1000)
-    input_nd = mx.nd.array(input1)
-    label_nd = mx.nd.array(label1)
-
-    ipsym = mx.sym.Variable("ipsym")
-    label = mx.sym.Variable('label')
-    sym = mx.sym.SoftmaxOutput(data=ipsym, label=label, ignore_label=0, use_ignore=False)
-    ex = sym.bind(ctx=mx.cpu(0), args={'ipsym': input_nd, 'label': label_nd})
-    ex.forward(is_train=True)
-    softmax_out = ex.outputs[0].asnumpy()
-
-    converted_model = onnx_mxnet.export_model(sym, {}, [(1000, 1000), (1000,)], np.float32, "softmaxop.onnx")
-
-    sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model)
-    result = forward_pass(sym, arg_params, aux_params, ['ipsym'], input1)
-
-    # Comparing result of forward pass before using onnx export, import
-    npt.assert_almost_equal(result, softmax_out)
-
-@with_seed()
-def test_comparison_ops():
-    """Test greater, lesser, equal"""
-    def test_ops(op_name, inputs, input_tensors, numpy_op):
-        outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=np.shape(inputs[0]))]
-        nodes = [helper.make_node(op_name, ["input"+str(i+1) for i in range(len(inputs))], ["output"])]
-        graph = helper.make_graph(nodes,
-                                  op_name + "_test",
-                                  input_tensors,
-                                  outputs)
-        model = helper.make_model(graph)
-        bkd_rep = backend.prepare(model)
-        output = bkd_rep.run(inputs)
-        npt.assert_almost_equal(output[0], numpy_op)
-    input_data = [np.random.rand(1, 3, 4, 5).astype("float32"),
-                  np.random.rand(1, 5).astype("float32")]
-    input_tensor = []
-    for idx, ip in enumerate(input_data):
-        input_tensor.append(helper.make_tensor_value_info("input" + str(idx + 1),
-                                                          TensorProto.FLOAT, shape=np.shape(ip)))
-    test_ops("Greater", input_data, input_tensor,
-             np.greater(input_data[0], input_data[1]).astype(np.float32))
-    test_ops("Less", input_data, input_tensor,
-             np.less(input_data[0], input_data[1]).astype(np.float32))
-    test_ops("Equal", input_data, input_tensor,
-             np.equal(input_data[0], input_data[1]).astype(np.float32))
-
-
-def get_int_inputs(interval, shape):
-    """Helper to get integer input of given shape and range"""
-    assert len(interval) == len(shape)
-    inputs = []
-    input_tensors = []
-    for idx in range(len(interval)):
-        low, high = interval[idx]
-        inputs.append(np.random.randint(low, high, size=shape[idx]).astype("float32"))
-        input_tensors.append(helper.make_tensor_value_info("input"+str(idx+1),
-                                                        TensorProto.FLOAT, shape=shape[idx]))
-    return inputs, input_tensors
-
-
-@with_seed()
-def test_logical_ops():
-    """Test for logical and, or, not, xor operators"""
-    def test_ops(op_name, inputs, input_tensors, numpy_op):
-        outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=np.shape(inputs[0]))]
-        nodes = [helper.make_node(op_name, ["input"+str(i+1) for i in range(len(inputs))], ["output"])]
-        graph = helper.make_graph(nodes,
-                                  op_name + "_test",
-                                  input_tensors,
-                                  outputs)
-        model = helper.make_model(graph)
-        bkd_rep = backend.prepare(model)
-        output = bkd_rep.run(inputs)
-        npt.assert_almost_equal(output[0], numpy_op)
-    input_data, input_tensor = get_int_inputs([(0, 2), (0, 2)], [(3, 4, 5), (3, 4, 5)])
-    test_ops("And", input_data, input_tensor,
-             np.logical_and(input_data[0], input_data[1]).astype(np.float32))
-    test_ops("Or", input_data, input_tensor,
-             np.logical_or(input_data[0], input_data[1]).astype(np.float32))
-    test_ops("Xor", input_data, input_tensor,
-             np.logical_xor(input_data[0], input_data[1]).astype(np.float32))
-    test_ops("Not", [input_data[0]], [input_tensor[0]],
-             np.logical_not(input_data[0]).astype(np.float32))
-
-
-def _assert_sym_equal(lhs, rhs):
-    assert lhs.list_inputs() == rhs.list_inputs()  # input names must be identical
-    assert len(lhs.list_outputs()) == len(rhs.list_outputs())  # number of outputs must be identical
-
-
-def _force_list(output):
-    if isinstance(output, nd.NDArray):
-        return [output]
-    return list(output)
-
-
-def _optional_group(symbols, group=False):
-    if group:
-        return sym.Group(symbols)
-    else:
-        return symbols
-
-
-def _check_onnx_export(net, group_outputs=False, shape_type=tuple, extra_params={}):
-    net.initialize()
-    data = nd.random.uniform(0, 1, (1, 1024))
-    output = _force_list(net(data))  # initialize weights
-    net_sym = _optional_group(net(sym.Variable('data')), group_outputs)
-    net_params = {name:param._reduce() for name, param in net.collect_params().items()}
-    net_params.update(extra_params)
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        onnx_file_path = os.path.join(tmpdirname, 'net.onnx')
-        export_path = onnx_mxnet.export_model(
-            sym=net_sym,
-            params=net_params,
-            input_shape=[shape_type(data.shape)],
-            onnx_file_path=onnx_file_path)
-        assert export_path == onnx_file_path
-        # Try importing the model to symbol
-        _assert_sym_equal(net_sym, onnx_mxnet.import_model(export_path)[0])
-
-        # Try importing the model to gluon
-        imported_net = onnx_mxnet.import_to_gluon(export_path, ctx=None)
-        _assert_sym_equal(net_sym, _optional_group(imported_net(sym.Variable('data')), group_outputs))
-
-        # Confirm network outputs are the same
-        imported_net_output = _force_list(imported_net(data))
-        for out, imp_out in zip(output, imported_net_output):
-            mx.test_utils.assert_almost_equal(out.asnumpy(), imp_out.asnumpy())
-
-
-@with_seed()
-def test_onnx_export_single_output():
-    net = nn.HybridSequential(prefix='single_output_net')
-    with net.name_scope():
-        net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
-    _check_onnx_export(net)
-
-
-@with_seed()
-def test_onnx_export_multi_output():
-    class MultiOutputBlock(nn.HybridBlock):
-        def __init__(self):
-            super(MultiOutputBlock, self).__init__()
-            with self.name_scope():
-                self.net = nn.HybridSequential()
-                for i in range(10):
-                    self.net.add(nn.Dense(100 + i * 10, activation='relu'))
-
-        def hybrid_forward(self, F, x):
-            out = tuple(block(x) for block in self.net._children.values())
-            return out
-
-    net = MultiOutputBlock()
-    assert len(sym.Group(net(sym.Variable('data'))).list_outputs()) == 10
-    _check_onnx_export(net, group_outputs=True)
-
-
-@with_seed()
-def test_onnx_export_list_shape():
-    net = nn.HybridSequential(prefix='list_shape_net')
-    with net.name_scope():
-        net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
-    _check_onnx_export(net, shape_type=list)
-
-
-@with_seed()
-def test_onnx_export_extra_params():
-    net = nn.HybridSequential(prefix='extra_params_net')
-    with net.name_scope():
-        net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
-    _check_onnx_export(net, extra_params={'extra_param': nd.array([1, 2])})
-
-
-if __name__ == '__main__':
-    test_models("bvlc_googlenet", (1, 3, 224, 224), (1, 1000))
-    test_models("bvlc_reference_caffenet", (1, 3, 224, 224), (1, 1000))
-    test_models("bvlc_reference_rcnn_ilsvrc13", (1, 3, 224, 224), (1, 200))
-
-    # Comparing MXNet inference result, since MXNet results don't match
-    # ONNX expected results due to AveragePool issue github issue(#10194)
-    test_model_accuracy("inception_v1", (1, 3, 224, 224))
-    test_model_accuracy("inception_v2", (1, 3, 224, 224))
-
-    unittest.main()
diff --git a/tests/python-pytest/onnx/export/onnx_backend_test.py b/tests/python-pytest/onnx/export/onnx_backend_test.py
deleted file mode 100644
index c9926c4d5e15..000000000000
--- a/tests/python-pytest/onnx/export/onnx_backend_test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""ONNX test backend wrapper"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import unittest
-try:
-    import onnx.backend.test
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed")
-
-import backend as mxnet_backend
-
-# This is a pytest magic variable to load extra plugins
-pytest_plugins = "onnx.backend.test.report",
-
-BACKEND_TESTS = onnx.backend.test.BackendTest(mxnet_backend, __name__)
-
-IMPLEMENTED_OPERATORS_TEST = [
-    'test_random_uniform',
-    'test_random_normal',
-    'test_add',
-    'test_sub',
-    'test_mul',
-    'test_div',
-    'test_neg',
-    'test_abs',
-    'test_sum',
-    'test_tanh',
-    'test_cos',
-    'test_sin',
-    'test_tan',
-    'test_acos',
-    'test_asin',
-    'test_atan'
-    'test_ceil',
-    'test_floor',
-    'test_concat',
-    'test_identity',
-    'test_sigmoid',
-    'test_relu',
-    'test_constant_pad',
-    'test_edge_pad',
-    'test_reflect_pad',
-    'test_reduce_min',
-    'test_reduce_max',
-    'test_reduce_mean',
-    'test_reduce_prod',
-    'test_reduce_sum_d',
-    'test_reduce_sum_keepdims_random',
-    'test_squeeze',
-    'test_softmax_example',
-    'test_softmax_large_number',
-    'test_softmax_axis_2',
-    'test_transpose',
-    'test_globalmaxpool',
-    'test_globalaveragepool',
-    # enabling partial test cases for matmul
-    'test_matmul_3d',
-    'test_matmul_4d',
-    'test_slice_cpu',
-    'test_slice_neg',
-    'test_squeeze_',
-    'test_reciprocal',
-    'test_sqrt',
-    'test_pow',
-    'test_exp_',
-    'test_argmax',
-    'test_argmin',
-    'test_min',
-    'test_max'
-    #pytorch operator tests
-    'test_operator_exp',
-    'test_operator_maxpool',
-    'test_operator_params',
-    'test_operator_permute2',
-    'test_clip'
-    'test_cast',
-    'test_depthtospace',
-    'test_hardsigmoid',
-    'test_instancenorm',
-    'test_shape',
-    'test_size'
-    ]
-
-BASIC_MODEL_TESTS = [
-    'test_AvgPool2D',
-    'test_BatchNorm',
-    'test_ConstantPad2d',
-    'test_Conv2d',
-    'test_ELU',
-    'test_LeakyReLU',
-    'test_MaxPool',
-    'test_PReLU',
-    'test_ReLU',
-    'test_selu_default'
-    'test_Sigmoid',
-    'test_Softmax',
-    'test_softmax_functional',
-    'test_softmax_lastdim',
-    'test_Tanh'
-    ]
-
-STANDARD_MODEL = [
-    'test_bvlc_alexnet',
-    'test_densenet121',
-    # 'test_inception_v1',
-    # 'test_inception_v2',
-    'test_resnet50',
-    # 'test_shufflenet',
-    'test_squeezenet',
-    'test_vgg16',
-    'test_vgg19'
-    ]
-
-for op_test in IMPLEMENTED_OPERATORS_TEST:
-    BACKEND_TESTS.include(op_test)
-
-for basic_model_test in BASIC_MODEL_TESTS:
-    BACKEND_TESTS.include(basic_model_test)
-
-for std_model_test in STANDARD_MODEL:
-    BACKEND_TESTS.include(std_model_test)
-
-BACKEND_TESTS.exclude('.*broadcast.*')
-BACKEND_TESTS.exclude('.*bcast.*')
-
-
-# import all test cases at global scope to make them visible to python.unittest
-globals().update(BACKEND_TESTS.enable_report().test_cases)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/python-pytest/onnx/import/gluon_backend.py b/tests/python-pytest/onnx/import/gluon_backend.py
deleted file mode 100644
index 25be60b57dc6..000000000000
--- a/tests/python-pytest/onnx/import/gluon_backend.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""Gluon backend wrapper for onnx test infrastructure"""
-from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto
-import mxnet as mx
-
-try:
-    from onnx import helper, TensorProto
-    from onnx.backend.base import Backend
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - /~https://github.com/onnx/onnx#installation")
-from gluon_backend_rep import GluonBackendRep
-
-# GluonBackend class will take an ONNX model with inputs, perform a computation,
-# and then return the output.
-# Implemented by following onnx docs guide:
-# /~https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
-
-class GluonBackend(Backend):
-    """Gluon backend for ONNX"""
-
-    @classmethod
-    def prepare(cls, model, device='CPU', **kwargs):
-        """For running end to end model(used for onnx test backend)
-
-        Parameters
-        ----------
-        model  : onnx ModelProto object
-            loaded onnx graph
-        device : 'CPU'
-            specifying device to run test on
-        kwargs :
-            other arguments
-
-        Returns
-        -------
-        GluonBackendRep : object
-            Returns object of GluonBackendRep class which will be in turn
-            used to run inference on the input model and return the result for comparison.
-        """
-        graph = GraphProto()
-        if device == 'CPU':
-            ctx = mx.cpu()
-        else:
-            raise NotImplementedError("ONNX tests are run only for CPU context.")
-
-        net = graph.graph_to_gluon(model.graph, ctx)
-        return GluonBackendRep(net, device)
-
-    @classmethod
-    def supports_device(cls, device):
-        """Supports only CPU for testing"""
-        return device == 'CPU'
-
-
-prepare = GluonBackend.prepare
-
-supports_device = GluonBackend.supports_device
diff --git a/tests/python-pytest/onnx/import/gluon_backend_rep.py b/tests/python-pytest/onnx/import/gluon_backend_rep.py
deleted file mode 100644
index 04c6ddde63e9..000000000000
--- a/tests/python-pytest/onnx/import/gluon_backend_rep.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""gluon backend rep for onnx test infrastructure"""
-import numpy as np
-try:
-    from onnx.backend.base import BackendRep
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - /~https://github.com/onnx/onnx#installation")
-import mxnet as mx
-from mxnet import nd
-
-# GluonBackendRep object will be returned by GluonBackend's prepare method which is used to
-# execute a model repeatedly.
-# Inputs will be passed to the run method of MXNetBackendRep class, it will perform computation and
-# retrieve the corresponding results for comparison to the onnx backend.
-# /~https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py.
-# Implemented by following onnx docs guide:
-# /~https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
-
-
-class GluonBackendRep(BackendRep):
-    """Running model inference on gluon backend and return the result
-     to onnx test infrastructure for comparison."""
-    def __init__(self, net, device):
-        self.net = net
-        self.device = device
-
-    def run(self, inputs, **kwargs):
-        """Run model inference and return the result
-
-        Parameters
-        ----------
-        inputs : numpy array
-            input to run a layer on
-
-        Returns
-        -------
-        params : numpy array
-            result obtained after running the inference on mxnet
-        """
-        # create module, passing cpu context
-        if self.device == 'CPU':
-            ctx = mx.cpu()
-        else:
-            raise NotImplementedError("ONNX tests are run only for CPU context.")
-
-        # run inference
-        net_inputs = [nd.array(input_data, ctx=ctx) for input_data in inputs]
-        net_outputs = self.net(*net_inputs)
-        results = []
-        results.extend([o for o in net_outputs.asnumpy()])
-        result = np.array(results)
-
-        return [result]
diff --git a/tests/python-pytest/onnx/import/gluon_backend_test.py b/tests/python-pytest/onnx/import/gluon_backend_test.py
deleted file mode 100644
index 6dd5f8a071c9..000000000000
--- a/tests/python-pytest/onnx/import/gluon_backend_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""ONNX test backend wrapper"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import unittest
-try:
-    import onnx.backend.test
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - /~https://github.com/onnx/onnx#installation")
-
-import gluon_backend
-import test_cases
-
-# This is a pytest magic variable to load extra plugins
-pytest_plugins = "onnx.backend.test.report",
-
-BACKEND_TESTS = onnx.backend.test.BackendTest(gluon_backend, __name__)
-
-for op_tests in test_cases.IMPLEMENTED_OPERATORS_TEST:
-    BACKEND_TESTS.include(op_tests)
-
-for std_model_test in test_cases.STANDARD_MODEL:
-    BACKEND_TESTS.include(std_model_test)
-
-for basic_model_test in test_cases.BASIC_MODEL_TESTS:
-    BACKEND_TESTS.include(basic_model_test)
-
-BACKEND_TESTS.exclude('.*broadcast.*')
-BACKEND_TESTS.exclude('.*bcast.*')
-
-# import all test cases at global scope to make them visible to python.unittest
-globals().update(BACKEND_TESTS.enable_report().test_cases)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/python-pytest/onnx/import/mxnet_backend.py b/tests/python-pytest/onnx/import/mxnet_backend.py
deleted file mode 100644
index bd4910b64f85..000000000000
--- a/tests/python-pytest/onnx/import/mxnet_backend.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""MXNet backend wrapper for onnx test infrastructure"""
-import os
-import sys
-from mxnet.contrib.onnx.onnx2mx.import_onnx import GraphProto
-try:
-    from onnx import helper, TensorProto
-    from onnx.backend.base import Backend
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - /~https://github.com/onnx/onnx#installation")
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(CURR_PATH, '../'))
-from backend_rep import MXNetBackendRep
-
-# MXNetBackend class will take an ONNX model with inputs, perform a computation,
-# and then return the output.
-# Implemented by following onnx docs guide:
-# /~https://github.com/onnx/onnx/blob/master/docs/ImplementingAnOnnxBackend.md
-
-class MXNetBackend(Backend):
-    """MXNet backend for ONNX"""
-
-    @classmethod
-    def prepare(cls, model, device='CPU', **kwargs):
-        """For running end to end model(used for onnx test backend)
-
-        Parameters
-        ----------
-        model  : onnx ModelProto object
-            loaded onnx graph
-        device : 'CPU'
-            specifying device to run test on
-        kwargs :
-            other arguments
-
-        Returns
-        -------
-        MXNetBackendRep : object
-            Returns object of MXNetBackendRep class which will be in turn
-            used to run inference on the input model and return the result for comparison.
-        """
-        graph = GraphProto()
-        sym, arg_params, aux_params = graph.from_onnx(model.graph)
-        return MXNetBackendRep(sym, arg_params, aux_params, device)
-
-    @classmethod
-    def supports_device(cls, device):
-        """Supports only CPU for testing"""
-        return device == 'CPU'
-
-prepare = MXNetBackend.prepare
-
-supports_device = MXNetBackend.supports_device
diff --git a/tests/python-pytest/onnx/import/mxnet_backend_test.py b/tests/python-pytest/onnx/import/mxnet_backend_test.py
deleted file mode 100644
index d9e4dccae24e..000000000000
--- a/tests/python-pytest/onnx/import/mxnet_backend_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""ONNX test backend wrapper"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import unittest
-try:
-    import onnx.backend.test
-except ImportError:
-    raise ImportError("Onnx and protobuf need to be installed. Instructions to"
-                      + " install - /~https://github.com/onnx/onnx#installation")
-
-import mxnet_backend
-import test_cases
-
-# This is a pytest magic variable to load extra plugins
-pytest_plugins = "onnx.backend.test.report",
-
-BACKEND_TESTS = onnx.backend.test.BackendTest(mxnet_backend, __name__)
-
-for op_tests in test_cases.IMPLEMENTED_OPERATORS_TEST:
-    BACKEND_TESTS.include(op_tests)
-
-for basic_model_test in test_cases.BASIC_MODEL_TESTS:
-    BACKEND_TESTS.include(basic_model_test)
-
-for std_model_test in test_cases.STANDARD_MODEL:
-    BACKEND_TESTS.include(std_model_test)
-
-BACKEND_TESTS.exclude('.*broadcast.*')
-BACKEND_TESTS.exclude('.*bcast.*')
-
-# import all test cases at global scope to make them visible to python.unittest
-globals().update(BACKEND_TESTS.enable_report().test_cases)
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/python-pytest/onnx/import/onnx_import_test.py b/tests/python-pytest/onnx/import/onnx_import_test.py
deleted file mode 100644
index c2d1e9cb2d36..000000000000
--- a/tests/python-pytest/onnx/import/onnx_import_test.py
+++ /dev/null
@@ -1,275 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Tests for individual operators
-This module contains operator tests which currently do not exist on
-ONNX backend test framework. Once we have PRs on the ONNX repo and get
-those PRs merged, this file will get EOL'ed.
-"""
-# pylint: disable=too-many-locals,wrong-import-position,import-error
-from __future__ import absolute_import
-import sys
-import os
-import unittest
-import logging
-import hashlib
-import tarfile
-from collections import namedtuple
-import numpy as np
-import numpy.testing as npt
-from onnx import helper
-from onnx import numpy_helper
-from onnx import TensorProto
-from mxnet.test_utils import download
-from mxnet.contrib import onnx as onnx_mxnet
-import mxnet as mx
-CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(CURR_PATH, '../../../python/unittest'))
-from common import with_seed
-import mxnet_backend
-
-
-URLS = {
-    'bvlc_googlenet' :
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/opset7/bvlc_googlenet.tar.gz',
-    'bvlc_reference_caffenet' :
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/opset7/bvlc_reference_caffenet.tar.gz',
-    'bvlc_reference_rcnn_ilsvrc13' :
-        'https://s3.amazonaws.com/onnx-mxnet/model-zoo/opset7/bvlc_reference_rcnn_ilsvrc13.tar.gz',
-}
-
-@with_seed()
-def test_broadcast():
-    """Test for broadcasting in onnx operators."""
-    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
-    input2 = np.random.rand(1, 5).astype("float32")
-    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
-              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
-
-    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
-
-    nodes = [helper.make_node("Add", ["input1", "input2"], ["output"])]
-
-    graph = helper.make_graph(nodes,
-                              "bcast_test",
-                              inputs,
-                              outputs)
-
-    bcast_model = helper.make_model(graph)
-    
-    bkd_rep = mxnet_backend.prepare(bcast_model)
-    numpy_op = input1 + input2
-    output = bkd_rep.run([input1, input2])
-    npt.assert_almost_equal(output[0], numpy_op)
-
-@with_seed()
-def test_greater():
-    """Test for logical greater in onnx operators."""
-    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
-    input2 = np.random.rand(1, 5).astype("float32")
-    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
-              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
-
-    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
-
-    nodes = [helper.make_node("Greater", ["input1", "input2"], ["output"])]
-
-    graph = helper.make_graph(nodes,
-                              "greater_test",
-                              inputs,
-                              outputs)
-
-    greater_model = helper.make_model(graph)
-    
-    bkd_rep = mxnet_backend.prepare(greater_model)
-    numpy_op = np.greater(input1, input2).astype(np.float32)
-    output = bkd_rep.run([input1, input2])
-    npt.assert_almost_equal(output[0], numpy_op)
-
-@with_seed()
-def test_lesser():
-    """Test for logical greater in onnx operators."""
-    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
-    input2 = np.random.rand(1, 5).astype("float32")
-    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
-              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
-
-    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
-
-    nodes = [helper.make_node("Less", ["input1", "input2"], ["output"])]
-
-    graph = helper.make_graph(nodes,
-                              "lesser_test",
-                              inputs,
-                              outputs)
-
-    greater_model = helper.make_model(graph)
-    
-    bkd_rep = mxnet_backend.prepare(greater_model)
-    numpy_op = np.less(input1, input2).astype(np.float32)
-    output = bkd_rep.run([input1, input2])
-    npt.assert_almost_equal(output[0], numpy_op)
-    
-@with_seed()
-def test_equal():
-    """Test for logical greater in onnx operators."""
-    input1 = np.random.rand(1, 3, 4, 5).astype("float32")
-    input2 = np.random.rand(1, 5).astype("float32")
-    inputs = [helper.make_tensor_value_info("input1", TensorProto.FLOAT, shape=(1, 3, 4, 5)),
-              helper.make_tensor_value_info("input2", TensorProto.FLOAT, shape=(1, 5))]
-
-    outputs = [helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=(1, 3, 4, 5))]
-
-    nodes = [helper.make_node("Equal", ["input1", "input2"], ["output"])]
-
-    graph = helper.make_graph(nodes,
-                              "equal_test",
-                              inputs,
-                              outputs)
-
-    greater_model = helper.make_model(graph)
-    
-    bkd_rep = mxnet_backend.prepare(greater_model)
-    numpy_op = np.equal(input1, input2).astype(np.float32)
-    output = bkd_rep.run([input1, input2])
-    npt.assert_almost_equal(output[0], numpy_op)
-
-
-def get_test_files(name):
-    """Extract tar file and returns model path and input, output data"""
-    tar_name = download(URLS.get(name), dirname=CURR_PATH.__str__())
-    # extract tar file
-    tar_path = os.path.join(CURR_PATH, tar_name)
-    tar = tarfile.open(tar_path.__str__(), "r:*")
-    tar.extractall(path=CURR_PATH.__str__())
-    tar.close()
-    data_dir = os.path.join(CURR_PATH, name)
-    model_path = os.path.join(data_dir, 'model.onnx')
-
-    inputs = []
-    outputs = []
-    # get test files
-    for test_file in os.listdir(data_dir):
-        case_dir = os.path.join(data_dir, test_file)
-        # skip the non-dir files
-        if not os.path.isdir(case_dir):
-            continue
-        input_file = os.path.join(case_dir, 'input_0.pb')
-        input_tensor = TensorProto()
-        with open(input_file, 'rb') as proto_file:
-            input_tensor.ParseFromString(proto_file.read())
-        inputs.append(numpy_helper.to_array(input_tensor))
-
-        output_tensor = TensorProto()
-        output_file = os.path.join(case_dir, 'output_0.pb')
-        with open(output_file, 'rb') as proto_file:
-            output_tensor.ParseFromString(proto_file.read())
-        outputs.append(numpy_helper.to_array(output_tensor))
-
-    return model_path, inputs, outputs
-
-def test_bvlc_googlenet():
-    """ Tests Googlenet model"""
-    model_path, inputs, outputs = get_test_files('bvlc_googlenet')
-    logging.info("Translating Googlenet model from ONNX to Mxnet")
-    sym, arg_params, aux_params = onnx_mxnet.import_model(model_path)
-    metadata = onnx_mxnet.get_model_metadata(model_path)
-    assert len(metadata) == 2
-    assert metadata.get('input_tensor_data')
-    assert metadata.get('input_tensor_data') == [(u'data_0', (1, 3, 224, 224))]
-    assert metadata.get('output_tensor_data')
-    assert metadata.get('output_tensor_data') == [(u'prob_1', (1, 1000))]
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    # run test for each test file
-    for input_data, output_data in zip(inputs, outputs):
-        # create module
-        mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
-        mod.bind(for_training=False, data_shapes=[(data_names[0], input_data.shape)], label_shapes=None)
-        mod.set_params(arg_params=arg_params, aux_params=aux_params,
-                       allow_missing=True, allow_extra=True)
-        # run inference
-        batch = namedtuple('Batch', ['data'])
-        mod.forward(batch([mx.nd.array(input_data)]), is_train=False)
-
-        # verify the results
-        npt.assert_equal(mod.get_outputs()[0].shape, output_data.shape)
-        npt.assert_almost_equal(output_data, mod.get_outputs()[0].asnumpy(), decimal=3)
-    logging.info("Googlenet model conversion Successful")
-
-def test_bvlc_reference_caffenet():
-    """Tests the bvlc cafenet model"""
-    model_path, inputs, outputs = get_test_files('bvlc_reference_caffenet')
-    logging.info("Translating Caffenet model from ONNX to Mxnet")
-    sym, arg_params, aux_params = onnx_mxnet.import_model(model_path)
-    metadata = onnx_mxnet.get_model_metadata(model_path)
-    assert len(metadata) == 2
-    assert metadata.get('input_tensor_data')
-    assert metadata.get('input_tensor_data') == [(u'data_0', (1, 3, 224, 224))]
-    assert metadata.get('output_tensor_data')
-    assert metadata.get('output_tensor_data') == [(u'prob_1', (1, 1000))]
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    # run test for each test file
-    for input_data, output_data in zip(inputs, outputs):
-        # create module
-        mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
-        mod.bind(for_training=False, data_shapes=[(data_names[0], input_data.shape)], label_shapes=None)
-        mod.set_params(arg_params=arg_params, aux_params=aux_params,
-                       allow_missing=True, allow_extra=True)
-        # run inference
-        batch = namedtuple('Batch', ['data'])
-        mod.forward(batch([mx.nd.array(input_data)]), is_train=False)
-
-        # verify the results
-        npt.assert_equal(mod.get_outputs()[0].shape, output_data.shape)
-        npt.assert_almost_equal(output_data, mod.get_outputs()[0].asnumpy(), decimal=3)
-    logging.info("Caffenet model conversion Successful")
-
-def test_bvlc_rcnn_ilsvrc13():
-    """Tests the bvlc rcnn model"""
-    model_path, inputs, outputs = get_test_files('bvlc_reference_rcnn_ilsvrc13')
-    logging.info("Translating rcnn_ilsvrc13 model from ONNX to Mxnet")
-    sym, arg_params, aux_params = onnx_mxnet.import_model(model_path)
-    metadata = onnx_mxnet.get_model_metadata(model_path)
-    assert len(metadata) == 2
-    assert metadata.get('input_tensor_data')
-    assert metadata.get('input_tensor_data') == [(u'data_0', (1, 3, 224, 224))]
-    assert metadata.get('output_tensor_data')
-    assert metadata.get('output_tensor_data') == [(u'fc-rcnn_1', (1, 200))]
-    data_names = [input_name[0] for input_name in metadata.get('input_tensor_data')]
-
-    # run test for each test file
-    for input_data, output_data in zip(inputs, outputs):
-        # create module
-        mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
-        mod.bind(for_training=False, data_shapes=[(data_names[0], input_data.shape)], label_shapes=None)
-        mod.set_params(arg_params=arg_params, aux_params=aux_params,
-                       allow_missing=True, allow_extra=True)
-        # run inference
-        batch = namedtuple('Batch', ['data'])
-        mod.forward(batch([mx.nd.array(input_data)]), is_train=False)
-
-        # verify the results
-        npt.assert_equal(mod.get_outputs()[0].shape, output_data.shape)
-        npt.assert_almost_equal(output_data, mod.get_outputs()[0].asnumpy(), decimal=3)
-    logging.info("rcnn_ilsvrc13 model conversion Successful")
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/python-pytest/onnx/import/test_cases.py b/tests/python-pytest/onnx/import/test_cases.py
deleted file mode 100644
index e0b26cc49830..000000000000
--- a/tests/python-pytest/onnx/import/test_cases.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test Cases to be run for the import module"""
-
-IMPLEMENTED_OPERATORS_TEST = [
-    'test_split_equal'
-    'test_random_',
-    'test_add',
-    'test_sub',
-    'test_mul',
-    'test_div',
-    'test_neg',
-    'test_abs',
-    'test_sum',
-    'test_tanh',
-    'test_ceil',
-    'test_floor',
-    'test_concat',
-    'test_identity',
-    'test_sigmoid',
-    'test_relu',
-    'test_constant_pad',
-    'test_edge_pad',
-    'test_reflect_pad',
-    'test_squeeze_',
-    'test_unsqueeze',
-    'test_softmax_example',
-    'test_softmax_large_number',
-    'test_softmax_axis_2',
-    'test_transpose',
-    'test_globalmaxpool',
-    'test_globalaveragepool',
-    'test_global_lppooling',
-    'test_slice_cpu',
-    'test_slice_neg',
-    'test_reciprocal',
-    'test_sqrt',
-    'test_pow',
-    'test_exp_',
-    'test_argmax',
-    'test_argmin',
-    'test_min',
-    # enabling partial test cases for matmul
-    'test_matmul_3d',
-    'test_matmul_4d',
-    'test_clip',
-    'test_softsign',
-    'test_reduce_',
-    'test_softplus',
-    'test_mean',
-    'test_acos',
-    'test_asin',
-    'test_atan',
-    'test_cos',
-    'test_sin',
-    'test_tan',
-    'test_shape',
-    'test_hardsigmoid',
-    'test_averagepool_1d',
-    'test_averagepool_2d_pads_count_include_pad',
-    'test_averagepool_2d_precomputed_pads_count_include_pad',
-    'test_averagepool_2d_precomputed_strides',
-    'test_averagepool_2d_strides',
-    'test_averagepool_3d',
-    'test_LpPool_',
-    'test_cast',
-    'test_instancenorm',
-    #pytorch operator tests
-    'test_operator_exp',
-    'test_operator_maxpool',
-    'test_operator_params',
-    'test_operator_permute2',
-    'test_depthtospace',
-    'test_size'
-    ]
-
-BASIC_MODEL_TESTS = [
-    'test_AvgPool2D',
-    'test_BatchNorm',
-    'test_ConstantPad2d'
-    'test_Conv2d',
-    'test_ELU',
-    'test_LeakyReLU',
-    'test_MaxPool',
-    'test_PReLU',
-    'test_ReLU',
-    'test_selu_default',
-    'test_Sigmoid',
-    'test_Softmax',
-    'test_softmax_functional',
-    'test_softmax_lastdim',
-    'test_Tanh'
-    ]
-
-STANDARD_MODEL = [
-    'test_bvlc_alexnet',
-    'test_densenet121',
-    #'test_inception_v1',
-    #'test_inception_v2',
-    'test_resnet50',
-    #'test_shufflenet',
-    'test_squeezenet',
-    'test_zfnet512',
-    'test_vgg19'
-    ]
diff --git a/tests/python-pytest/onnx/mxnet_export_test.py b/tests/python-pytest/onnx/mxnet_export_test.py
new file mode 100644
index 000000000000..6c81198a8bca
--- /dev/null
+++ b/tests/python-pytest/onnx/mxnet_export_test.py
@@ -0,0 +1,121 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# pylint: disable=too-many-locals,wrong-import-position,import-error
+from __future__ import absolute_import
+import os
+import unittest
+import logging
+import tempfile
+from mxnet import nd, sym
+from mxnet.gluon import nn
+from mxnet.contrib import onnx as onnx_mxnet
+import mxnet as mx
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+
+def _assert_sym_equal(lhs, rhs):
+    """Assert both symbols list identical input names and the same number of outputs."""
+    assert lhs.list_inputs() == rhs.list_inputs() and len(lhs.list_outputs()) == len(rhs.list_outputs())
+
+
+def _force_list(output):
+    """Return `output` as a list: a lone NDArray is wrapped, any other iterable is copied."""
+    single = isinstance(output, nd.NDArray)
+    return [output] if single else list(output)
+
+
+def _optional_group(symbols, group=False):
+    """Return `sym.Group(symbols)` when `group` is truthy, otherwise `symbols` unchanged."""
+    if not group:
+        return symbols
+    return sym.Group(symbols)
+
+
+def _check_onnx_export(net, group_outputs=False, shape_type=tuple, extra_params=None):
+    """Export `net` to ONNX, import it back as a symbol and as a Gluon block, and compare outputs."""
+    net.initialize()
+    data = nd.random.uniform(0, 1, (1, 1024))
+    output = _force_list(net(data))  # initialize weights
+    net_sym = _optional_group(net(sym.Variable('data')), group_outputs)
+    net_params = {name: param._reduce() for name, param in net.collect_params().items()}
+    net_params.update(extra_params or {})  # None default instead of a shared mutable `{}` default
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        onnx_file_path = os.path.join(tmpdirname, 'net.onnx')
+        export_path = onnx_mxnet.export_model(
+            sym=net_sym, params=net_params,
+            input_shape=[shape_type(data.shape)],
+            onnx_file_path=onnx_file_path)
+        assert export_path == onnx_file_path
+        # Try importing the model to symbol
+        _assert_sym_equal(net_sym, onnx_mxnet.import_model(export_path)[0])
+
+        # Try importing the model to gluon
+        imported_net = onnx_mxnet.import_to_gluon(export_path, ctx=None)
+        _assert_sym_equal(net_sym, _optional_group(imported_net(sym.Variable('data')), group_outputs))
+
+        # Confirm network outputs are the same
+        imported_net_output = _force_list(imported_net(data))
+        for out, imp_out in zip(output, imported_net_output):
+            mx.test_utils.assert_almost_equal(out.asnumpy(), imp_out.asnumpy())
+
+
+class TestExport(unittest.TestCase):
+    """Tests ONNX export by running small Gluon networks through `_check_onnx_export`.
+    """
+
+    def test_onnx_export_single_output(self):
+        net = nn.HybridSequential(prefix='single_output_net')
+        with net.name_scope():
+            net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
+        _check_onnx_export(net)  # defaults: ungrouped output, tuple input shape, no extra params
+
+    def test_onnx_export_multi_output(self):
+        class MultiOutputBlock(nn.HybridBlock):
+            def __init__(self):
+                super(MultiOutputBlock, self).__init__()
+                with self.name_scope():
+                    self.net = nn.HybridSequential()
+                    for i in range(10):
+                        self.net.add(nn.Dense(100 + i * 10, activation='relu'))
+
+            def hybrid_forward(self, F, x):
+                out = tuple(block(x) for block in self.net._children.values())  # every child sees the same x
+                return out
+
+        net = MultiOutputBlock()
+        assert len(sym.Group(net(sym.Variable('data'))).list_outputs()) == 10  # one output per Dense child
+        _check_onnx_export(net, group_outputs=True)  # outputs are wrapped with sym.Group before export
+
+    def test_onnx_export_list_shape(self):
+        net = nn.HybridSequential(prefix='list_shape_net')
+        with net.name_scope():
+            net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
+        _check_onnx_export(net, shape_type=list)  # input shape is handed to export_model as a list
+
+    def test_onnx_export_extra_params(self):
+        net = nn.HybridSequential(prefix='extra_params_net')
+        with net.name_scope():
+            net.add(nn.Dense(100, activation='relu'), nn.Dense(10))
+        _check_onnx_export(net, extra_params={'extra_param': nd.array([1, 2])})  # merged into exported params
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/python-pytest/onnx/test_cases.py b/tests/python-pytest/onnx/test_cases.py
new file mode 100644
index 000000000000..6ec37092d437
--- /dev/null
+++ b/tests/python-pytest/onnx/test_cases.py
@@ -0,0 +1,131 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+IMPLEMENTED_OPERATORS_TEST = {  # operator test-name patterns grouped under 'both', 'import' and 'export'
+    'both': ['test_add',
+             'test_sub',
+             'test_mul',
+             'test_div',
+             'test_neg',
+             'test_abs',
+             'test_sum',
+             'test_tanh',
+             'test_ceil',
+             'test_floor',
+             'test_concat',
+             'test_identity',
+             'test_sigmoid',
+             'test_relu',
+             'test_constant_pad',
+             'test_edge_pad',
+             'test_reflect_pad',
+             'test_softmax_example',
+             'test_softmax_large_number',
+             'test_softmax_axis_2',
+             'test_transpose',
+             'test_globalmaxpool',
+             'test_globalaveragepool',
+             'test_slice_cpu',
+             'test_slice_neg',
+             'test_reciprocal',
+             'test_sqrt',
+             'test_pow',
+             'test_exp_',
+             'test_argmax',
+             'test_argmin',
+             'test_min',
+             # pytorch operator tests
+             'test_exp_',  # NOTE(review): repeats 'test_exp_' above; the removed import list had 'test_operator_exp' here - confirm
+             'test_operator_maxpool',
+             'test_operator_params',
+             'test_operator_permute2',
+             'test_cos',
+             'test_sin',
+             'test_tan',
+             'test_acos',
+             'test_asin',
+             'test_atan',
+             'test_squeeze',
+             'test_matmul',
+             'test_depthtospace',
+             'test_hardsigmoid',
+             'test_instancenorm',
+             'test_shape',
+             'test_cast',
+             'test_clip',
+             'test_size',
+             'test_dropout',
+             'test_unsqueeze',
+             'test_log_',
+             'test_flatten_default_axis',
+             'test_leakyrelu',
+             'test_selu_default',
+             'test_elu',
+             'test_max_',
+             'test_softplus',
+             'test_reduce_'
+             ],
+    'import': ['test_gather',
+               'test_softsign',
+               'test_mean',
+               'test_averagepool_1d',
+               'test_averagepool_2d_pads_count_include_pad',
+               'test_averagepool_2d_precomputed_pads_count_include_pad',
+               'test_averagepool_2d_precomputed_strides',
+               'test_averagepool_2d_strides',
+               'test_averagepool_3d',
+               'test_split_equal',
+               'test_hardmax'
+               ],
+    'export': ['test_random_uniform',
+               'test_random_normal',
+               'test_reduce_min',
+               'test_reduce_max',
+               'test_reduce_mean',
+               'test_reduce_prod',
+               'test_reduce_sum_d',
+               'test_reduce_sum_keepdims_random',
+               'test_lrn'
+               ]
+}
+
+BASIC_MODEL_TESTS = {  # basic model test-name patterns; only a 'both' group is defined
+    'both': ['test_AvgPool2D',
+             'test_BatchNorm',
+             'test_ConstantPad2d',  # comma required: without it this literal fuses with the next one
+             'test_Conv2d',
+             'test_MaxPool',
+             'test_PReLU',
+             'test_Softmax',
+             'test_softmax_functional',
+             'test_softmax_lastdim',
+             ]
+}
+
+STANDARD_MODEL = {  # standard model test-name patterns grouped under 'both', 'import' and 'export'
+    'both': ['test_bvlc_alexnet',
+             'test_densenet121',
+             # 'test_inception_v1',
+             # 'test_inception_v2',
+             'test_resnet50',
+             # 'test_shufflenet',
+             'test_squeezenet',
+             'test_vgg19'
+             ],
+    'import': ['test_zfnet512'],
+    'export': ['test_vgg16']
+}
diff --git a/tests/python-pytest/onnx/test_models.py b/tests/python-pytest/onnx/test_models.py
new file mode 100644
index 000000000000..f85786141d6e
--- /dev/null
+++ b/tests/python-pytest/onnx/test_models.py
@@ -0,0 +1,178 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+# pylint: disable=too-many-locals,wrong-import-position,import-error
+from __future__ import absolute_import
+import sys
+import os
+import unittest
+import logging
+import tarfile
+from collections import namedtuple
+import numpy as np
+import numpy.testing as npt
+from onnx import numpy_helper
+from onnx import TensorProto
+from mxnet.test_utils import download
+from mxnet.contrib import onnx as onnx_mxnet
+import mxnet as mx
+
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(CURR_PATH, '../../python/unittest'))
+
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+URLS = {  # model name -> URL of the .tar.gz archive that get_test_files downloads and unpacks
+    'bvlc_googlenet':
+        'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_googlenet.tar.gz',
+    'bvlc_reference_caffenet':
+        'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_reference_caffenet.tar.gz',
+    'bvlc_reference_rcnn_ilsvrc13':
+        'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_reference_rcnn_ilsvrc13.tar.gz',
+    'inception_v1':
+        'https://s3.amazonaws.com/download.onnx/models/opset_8/inception_v1.tar.gz',
+    'inception_v2':
+        'https://s3.amazonaws.com/download.onnx/models/opset_8/inception_v2.tar.gz'
+}
+
+test_model_path = "https://s3.amazonaws.com/onnx-mxnet/test_model.onnx"  # downloaded by test_nodims_import
+
+def get_test_files(name):
+    """Download and unpack the archive for `name`; return (model_path, inputs, outputs).
+    `inputs`/`outputs` hold one array per test-case sub-directory (input_0.pb / output_0.pb)."""
+    tar_name = download(URLS.get(name), dirname=CURR_PATH)
+    tar_path = os.path.join(CURR_PATH, tar_name)
+    # NOTE(review): extractall trusts the member paths stored in the downloaded archive.
+    with tarfile.open(tar_path, "r:*") as tar:  # closes the archive even if extraction raises
+        tar.extractall(path=CURR_PATH)
+    data_dir = os.path.join(CURR_PATH, name)
+    model_path = os.path.join(data_dir, 'model.onnx')
+
+    inputs = []
+    outputs = []
+    # get test files
+    for test_file in sorted(os.listdir(data_dir)):  # sorted: listdir order is arbitrary
+        case_dir = os.path.join(data_dir, test_file)
+        # skip the non-dir files
+        if not os.path.isdir(case_dir):
+            continue
+        input_file = os.path.join(case_dir, 'input_0.pb')
+        input_tensor = TensorProto()
+        with open(input_file, 'rb') as proto_file:
+            input_tensor.ParseFromString(proto_file.read())
+        inputs.append(numpy_helper.to_array(input_tensor))
+
+        output_tensor = TensorProto()
+        output_file = os.path.join(case_dir, 'output_0.pb')
+        with open(output_file, 'rb') as proto_file:
+            output_tensor.ParseFromString(proto_file.read())
+        outputs.append(numpy_helper.to_array(output_tensor))
+
+    return model_path, inputs, outputs
+
+
+def forward_pass(sym, arg, aux, data_names, input_data):
+    """Bind an inference-only CPU Module for `sym`, feed `input_data`, return the first output as numpy."""
+    # build the module and bind it to the shape of the incoming data
+    module = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
+    shapes = [(data_names[0], input_data.shape)]
+    module.bind(for_training=False, data_shapes=shapes, label_shapes=None)
+    module.set_params(arg_params=arg, aux_params=aux, allow_missing=True, allow_extra=True)
+    # run inference
+    Batch = namedtuple('Batch', ['data'])
+    module.forward(Batch([mx.nd.array(input_data)]), is_train=False)
+
+    return module.get_outputs()[0].asnumpy()
+
+
+class TestModel(unittest.TestCase):
+    """Round-trip tests for ONNX model-zoo models.
+    test_import_export loops over the module-level `test_cases` list,
+    so append a tuple there to cover another model.
+    """
+    def test_import_export(self):
+        def get_model_results(modelpath, model_inputs):
+            """Import `modelpath` and run forward_pass once per array in `model_inputs`."""
+            symbol, args, aux = onnx_mxnet.import_model(modelpath)
+
+            data = onnx_mxnet.get_model_metadata(modelpath)
+            data_names = [input_name[0] for input_name in data.get('input_tensor_data')]
+
+            # inputs are passed in explicitly rather than captured from the loop below
+            result = [forward_pass(symbol, args, aux, data_names, input_data)
+                      for input_data in model_inputs]
+            return symbol, args, aux, result, data
+
+        for test in test_cases:
+            model_name, input_shape, output_shape = test
+            with self.subTest(model_name):
+                model_path, inputs, _ = get_test_files(model_name)
+                logging.info("Translating " + model_name + " from ONNX model zoo to MXNet")
+
+                sym, arg_params, aux_params, expected_result, _ = get_model_results(model_path, inputs)
+
+                params = {}
+                params.update(arg_params)
+                params.update(aux_params)
+
+                dir_path = os.path.dirname(model_path)
+                new_model_name = "exported_" + model_name + ".onnx"
+                onnx_file = os.path.join(dir_path, new_model_name)
+
+                logging.info("Translating converted model from mxnet to ONNX")
+                converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file)
+
+                _, _, _, actual_result, metadata = get_model_results(converted_model_path, inputs)
+
+                assert len(metadata) == 2
+                assert metadata.get('input_tensor_data')
+                assert metadata.get('input_tensor_data')[0][1] == input_shape
+                assert metadata.get('output_tensor_data')
+                assert metadata.get('output_tensor_data')[0][1] == output_shape
+
+                # the re-imported export must reproduce the outputs of the original import
+                for expected, actual in zip(expected_result, actual_result):
+                    npt.assert_equal(expected.shape, actual.shape)
+                    npt.assert_almost_equal(expected, actual, decimal=3)
+
+                logging.info(model_name + " conversion successful")
+
+    def test_nodims_import(self):
+        """Import a model without dims mentioned in its params and check the output shape."""
+        test_model = download(test_model_path, dirname=CURR_PATH.__str__())
+        input_data = np.array([0.2, 0.5])
+        nd_data = mx.nd.array(input_data).expand_dims(0)
+        sym, arg_params, aux_params = onnx_mxnet.import_model(test_model)
+        model_metadata = onnx_mxnet.get_model_metadata(test_model)
+        input_names = [inputs[0] for inputs in model_metadata.get('input_tensor_data')]
+        output_data = forward_pass(sym, arg_params, aux_params, input_names, nd_data)
+        assert output_data.shape == (1, 1)
+
+# Each entry is (model name used as a key of URLS, input shape, output shape); read by test_import_export
+test_cases = [
+    ("bvlc_googlenet", (1, 3, 224, 224), (1, 1000)),
+    ("bvlc_reference_caffenet", (1, 3, 224, 224), (1, 1000)),
+    ("bvlc_reference_rcnn_ilsvrc13", (1, 3, 224, 224), (1, 200)),
+    ("inception_v1", (1, 3, 224, 224), (1, 1000)),
+    ("inception_v2", (1, 3, 224, 224), (1, 1000))
+]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/python-pytest/onnx/test_node.py b/tests/python-pytest/onnx/test_node.py
new file mode 100644
index 000000000000..186666eb247e
--- /dev/null
+++ b/tests/python-pytest/onnx/test_node.py
@@ -0,0 +1,278 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Tests for individual operators
+This module contains operator tests which currently do not exist on
+ONNX backend test framework. Once we have PRs on the ONNX repo and get
+those PRs merged, this file will get EOL'ed.
+"""
+# pylint: disable=too-many-locals,wrong-import-position,import-error
+from __future__ import absolute_import
+import sys
+import os
+import unittest
+import logging
+import tarfile
+from collections import namedtuple
+import numpy as np
+import numpy.testing as npt
+from onnx import numpy_helper, helper, load_model
+from onnx import TensorProto
+from mxnet.test_utils import download
+from mxnet.contrib import onnx as onnx_mxnet
+import mxnet as mx
+import backend
+
+CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(CURR_PATH, '../../python/unittest'))
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+
+def get_rnd(shape, low=-1.0, high=1.0, dtype=np.float32):
+    """Return a random float32 array of `shape`; `dtype` only selects how values are drawn."""
+    if dtype == np.float32:
+        return np.random.uniform(low, high, size=shape).astype(np.float32)
+    if dtype == np.int32:
+        return np.random.randint(low, high, size=shape).astype(np.float32)
+    if dtype == np.bool_:
+        return np.random.choice(a=[False, True], size=shape).astype(np.float32)
+    raise ValueError("get_rnd: unsupported dtype %r" % (dtype,))  # was a silent None
+
+
+def _fix_attributes(attrs, attribute_mapping):
+    """Return a copy of `attrs` with the 'modify'/'add'/'remove' rules of `attribute_mapping` applied."""
+    new_attrs = dict(attrs)  # copy: never mutate the caller's (shared) attribute dict
+    # 'modify' renames old-name -> new-name; names absent from attrs are skipped
+    for k, v in attribute_mapping.get('modify', {}).items():
+        if k in new_attrs:
+            new_attrs[v] = new_attrs.pop(k)
+
+    # 'add' sets (or overwrites) name -> value pairs as-is
+    for k, v in attribute_mapping.get('add', {}).items():
+        new_attrs[k] = v
+    # 'remove' drops the listed names when present
+    for k in attribute_mapping.get('remove', []):
+        new_attrs.pop(k, None)
+
+    return new_attrs
+
+
+def forward_pass(sym, arg, aux, data_names, input_data):
+    """ Bind the symbol in a CPU module and run one inference pass
+    :param sym: Symbol
+    :param arg: Arg params (if both arg and aux are empty, params are initialised instead)
+    :param aux: Aux params
+    :param data_names: Input names (list)
+    :param input_data: Input data (list), indexed in step with data_names.
+                        If there is only one input, pass it as a list:
+                        for input [1, 2], pass input_data=[[1, 2]]
+    :return: first output of the forward pass as a numpy array
+    """
+    # one value per declared input name, looked up by position
+    values = [input_data[pos] for pos in range(len(data_names))]
+    data_shapes = [(name, np.shape(val)) for name, val in zip(data_names, values)]
+    data_forward = [mx.nd.array(val) for val in values]
+
+    # inference-only module on CPU, no labels
+    module = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None)
+    module.bind(for_training=False, data_shapes=data_shapes, label_shapes=None)
+    if arg or aux:
+        module.set_params(arg_params=arg, aux_params=aux,
+                          allow_missing=True, allow_extra=True)
+    else:
+        module.init_params()
+    # run inference
+    batch_cls = namedtuple('Batch', ['data'])
+    module.forward(batch_cls(data_forward), is_train=False)
+    first_output = module.get_outputs()[0]
+
+    return first_output.asnumpy()
+
+
+def get_input_tensors(input_data):
+    """Name the inputs input1..inputN; return (names, ONNX value infos, MXNet variables)."""
+    input_names = ["input" + str(idx + 1) for idx, _ in enumerate(input_data)]
+    input_sym = [mx.sym.Variable(name) for name in input_names]
+    # every input is declared as a FLOAT tensor with the shape of the matching value
+    input_tensor = [
+        helper.make_tensor_value_info(name, TensorProto.FLOAT, shape=np.shape(ip))
+        for name, ip in zip(input_names, input_data)
+    ]
+
+    return input_names, input_tensor, input_sym
+
+
+def get_onnx_graph(testname, input_names, inputs, output_name, output_shape, attr):
+    """Build a one-node ONNX model: op `output_name` over `input_names`, one FLOAT output "output"."""
+    # single node of op type `output_name`; `attr` is forwarded as its keyword attributes
+    node = helper.make_node(output_name, input_names, ["output"], **attr)
+    output_info = helper.make_tensor_value_info("output", TensorProto.FLOAT, shape=output_shape)
+
+    graph = helper.make_graph([node], testname, inputs, [output_info])
+
+    return helper.make_model(graph)
+
+class TestNode(unittest.TestCase):
+    """ Tests for individual operators.
+    Cases are data driven: edit test_cases, test_scalar_ops or
+    import_test_cases to add more tests.
+    """
+    def test_import_export(self):
+        for test in test_cases:
+            test_name, mxnet_op, onnx_name, inputs, attrs, mxnet_specific, fix_attrs, check_value, check_shape = test
+            with self.subTest(test_name):
+                names, input_tensors, inputsym = get_input_tensors(inputs)
+                if inputs:
+                    test_op = mxnet_op(*inputsym, **attrs)
+                    mxnet_output = forward_pass(test_op, None, None, names, inputs)
+                    outputshape = np.shape(mxnet_output)
+                else:
+                    test_op = mxnet_op(**attrs)
+                    shape = attrs.get('shape', (1,))
+                    x = mx.nd.zeros(shape, dtype='float32')
+                    xgrad = mx.nd.zeros(shape, dtype='float32')
+                    exe = test_op.bind(ctx=mx.cpu(), args={'x': x}, args_grad={'x': xgrad})
+                    mxnet_output = exe.forward(is_train=False)[0].asnumpy()
+                    outputshape = np.shape(mxnet_output)
+
+                if mxnet_specific:
+                    onnxmodelfile = onnx_mxnet.export_model(test_op, {}, [np.shape(ip) for ip in inputs],
+                                                            np.float32,
+                                                            onnx_name + ".onnx")
+                    onnxmodel = load_model(onnxmodelfile)
+                else:
+                    onnx_attrs = _fix_attributes(attrs, fix_attrs)
+                    onnxmodel = get_onnx_graph(test_name, names, input_tensors, onnx_name, outputshape, onnx_attrs)
+
+                bkd_rep = backend.prepare(onnxmodel, operation='export')
+                output = bkd_rep.run(inputs)
+
+                if check_value:
+                    npt.assert_almost_equal(output[0], mxnet_output)
+
+                if check_shape:
+                    npt.assert_equal(output[0].shape, outputshape)
+        # scalar arithmetic ops: export each symbol, re-import it and compare the outputs
+        input1 = get_rnd((1, 10, 2, 3))
+        ipsym = mx.sym.Variable("input1")
+        for test in test_scalar_ops:
+            if test == 'Add':
+                outsym = 2 + ipsym
+            if test == "Sub":
+                outsym = ipsym - 2
+            if test == "rSub":
+                outsym = ipsym.__rsub__(2)
+            if test == "Mul":
+                outsym = 2 * ipsym
+            if test == "Div":
+                outsym = ipsym / 2
+            if test == "Pow":
+                outsym = ipsym ** 2
+            forward_op = forward_pass(outsym, None, None, ['input1'], [input1])  # input passed as a list
+            converted_model = onnx_mxnet.export_model(outsym, {}, [np.shape(input1)], np.float32,
+                                                      onnx_file_path=outsym.name + ".onnx")
+            # the round-trip check runs for EVERY op, not only the last one of the loop
+            sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model)
+            result = forward_pass(sym, arg_params, aux_params, ['input1'], [input1])
+
+            npt.assert_almost_equal(result, forward_op)
+
+    def test_imports(self):
+        for test in import_test_cases:
+            test_name, onnx_name, inputs, np_op, attrs = test
+            with self.subTest(test_name):
+                names, input_tensors, inputsym = get_input_tensors(inputs)
+                np_out = [np_op(*inputs, **attrs)]  # numpy reference result, wrapped like the backend output
+                output_shape = np.shape(np_out)
+                onnx_model = get_onnx_graph(test_name, names, input_tensors, onnx_name, output_shape, attrs)
+                bkd_rep = backend.prepare(onnx_model, operation='import')
+                mxnet_out = bkd_rep.run(inputs)
+                npt.assert_almost_equal(np_out, mxnet_out)
+
+# test_case = ("test_case_name", mxnet op, "ONNX_op_name", [input_list], attribute map, MXNet_specific=True/False,
+# fix_attributes = {'modify': {mxnet_attr_name: onnx_attr_name},
+#                   'remove': [attr_name],
+#                   'add': {attr_name: value}},
+# check_value=True/False, check_shape=True/False)
+test_cases = [
+    ("test_equal", mx.sym.broadcast_equal, "Equal", [get_rnd((1, 3, 4, 5)), get_rnd((1, 5))], {}, False, {}, True,
+     False),
+    ("test_greater", mx.sym.broadcast_greater, "Greater", [get_rnd((1, 3, 4, 5)), get_rnd((1, 5))], {}, False, {}, True,
+     False),
+    ("test_less", mx.sym.broadcast_lesser, "Less", [get_rnd((1, 3, 4, 5)), get_rnd((1, 5))], {}, False, {}, True,
+     False),
+    ("test_and", mx.sym.broadcast_logical_and, "And",
+     [get_rnd((3, 4, 5), dtype=np.bool_), get_rnd((3, 4, 5), dtype=np.bool_)], {}, False, {}, True, False),
+    ("test_xor", mx.sym.broadcast_logical_xor, "Xor",
+     [get_rnd((3, 4, 5), dtype=np.bool_), get_rnd((3, 4, 5), dtype=np.bool_)], {}, False, {}, True, False),
+    ("test_or", mx.sym.broadcast_logical_or, "Or",
+     [get_rnd((3, 4, 5), dtype=np.bool_), get_rnd((3, 4, 5), dtype=np.bool_)], {}, False, {}, True, False),
+    ("test_not", mx.sym.logical_not, "Not", [get_rnd((3, 4, 5), dtype=np.bool_)], {}, False, {}, True, False),
+    ("test_square", mx.sym.square, "Pow", [get_rnd((2, 3), dtype=np.int32)], {}, True, {}, True, False),
+    ("test_spacetodepth", mx.sym.space_to_depth, "SpaceToDepth", [get_rnd((1, 1, 4, 6))],
+     {'block_size': 2}, False, {}, True, False),
+    ("test_softmax", mx.sym.SoftmaxOutput, "Softmax", [get_rnd((1000, 1000)), get_rnd(1000)],
+     {'ignore_label': 0, 'use_ignore': False}, True, {}, True, False),
+    ("test_logistic_regression", mx.sym.LogisticRegressionOutput, "Sigmoid",
+     [get_rnd((1000, 1000)), get_rnd((1000, 1000))], {}, True, {}, True, False),
+    ("test_fullyconnected", mx.sym.FullyConnected, "Gemm", [get_rnd((4, 3)), get_rnd((4, 3)), get_rnd(4)],
+     {'num_hidden': 4, 'name': 'FC'}, True, {}, True, False),
+    ("test_lppool1", mx.sym.Pooling, "LpPool", [get_rnd((2, 3, 20, 20))],
+     {'kernel': (4, 5), 'pad': (0, 0), 'stride': (1, 1), 'p_value': 1, 'pool_type': 'lp'}, False,
+     {'modify': {'kernel': 'kernel_shape', 'pad': 'pads', 'stride': 'strides', 'p_value': 'p'},
+      'remove': ['pool_type']}, True, False),
+    ("test_lppool2", mx.sym.Pooling, "LpPool", [get_rnd((2, 3, 20, 20))],
+     {'kernel': (4, 5), 'pad': (0, 0), 'stride': (1, 1), 'p_value': 2, 'pool_type': 'lp'}, False,
+     {'modify': {'kernel': 'kernel_shape', 'pad': 'pads', 'stride': 'strides', 'p_value': 'p'},
+      'remove': ['pool_type']}, True, False),
+    ("test_globallppool1", mx.sym.Pooling, "GlobalLpPool", [get_rnd((2, 3, 20, 20))],
+     {'kernel': (4, 5), 'pad': (0, 0), 'stride': (1, 1), 'p_value': 1, 'pool_type': 'lp', 'global_pool': True}, False,
+     {'modify': {'p_value': 'p'},
+      'remove': ['pool_type', 'kernel', 'pad', 'stride', 'global_pool']}, True, False),
+    ("test_globallppool2", mx.sym.Pooling, "GlobalLpPool", [get_rnd((2, 3, 20, 20))],
+     {'kernel': (4, 5), 'pad': (0, 0), 'stride': (1, 1), 'p_value': 2, 'pool_type': 'lp', 'global_pool': True}, False,
+     {'modify': {'p_value': 'p'},
+      'remove': ['pool_type', 'kernel', 'pad', 'stride', 'global_pool']}, True, False),
+    ("test_roipool", mx.sym.ROIPooling, "MaxRoiPool",
+     [[[get_rnd(shape=(8, 6), low=1, high=100, dtype=np.int32)]], [[0, 0, 0, 4, 4]]],
+     {'pooled_size': (2, 2), 'spatial_scale': 0.7}, False,
+     {'modify': {'pooled_size': 'pooled_shape'}}, True, False),
+
+    # since results would be random, checking for shape alone
+    ("test_multinomial", mx.sym.sample_multinomial, "Multinomial",
+     [np.array([0, 0.1, 0.2, 0.3, 0.4]).astype("float32")],
+     {'shape': (10,)}, False, {'modify': {'shape': 'sample_size'}}, False, True),
+    ("test_random_normal", mx.sym.random_normal, "RandomNormal", [],
+     {'shape': (2, 2), 'loc': 0, 'scale': 1}, False, {'modify': {'loc': 'mean'}}, False, True),
+    ("test_random_uniform", mx.sym.random_uniform, "RandomUniform", [],
+     {'shape': (2, 2), 'low': 0.5, 'high': 1.0}, False, {}, False, True)
+]
+
+test_scalar_ops = ['Add', 'Sub', 'rSub', 'Mul', 'Div', 'Pow']  # one entry per scalar op branch in TestNode
+
+# test_case = ("test_case_name", "ONNX_op_name", [input_list], np_op, attribute map)
+import_test_cases = [
+    ("test_lpnormalization_default", "LpNormalization", [get_rnd([5, 3, 3, 2])], np.linalg.norm, {'ord':2, 'axis':-1}),
+    ("test_lpnormalization_ord1", "LpNormalization", [get_rnd([5, 3, 3, 2])], np.linalg.norm, {'ord':1, 'axis':-1}),
+    ("test_lpnormalization_ord2", "LpNormalization", [get_rnd([5, 3, 3, 2])], np.linalg.norm, {'ord':2, 'axis':1})
+]
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index d9d3abfc3ced..01ba03cab7cd 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -27,6 +27,7 @@
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.test_utils import *
+import test_mkldnn_install as install
 curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
@@ -97,6 +98,37 @@ def __getitem__(self, key):
         assert_almost_equal(y[0, 0, 0, 0], 0.016711406)
         break
 
+@with_seed()
+def test_mkldnn_reshape():
+    """Reshaping a Convolution output in the graph must match reshaping that output in numpy."""
+    def test_reshape_after_conv(dst_shape):
+        shape = (1,1,4,4)
+        data = mx.symbol.Variable('data')
+        conv = mx.symbol.Convolution(data=data, num_filter=16, kernel=(1, 1), pad=(0, 0), stride=(1, 1))
+        res = mx.symbol.reshape(data=conv, shape=dst_shape)
+        exe = res.simple_bind(mx.cpu(), data=shape, grad_req='null')
+
+        val1 = np.random.uniform(-1, 1, (4, 4))
+        val2 = np.random.uniform(-1, 1, (1, 1, 1, 1))
+        val3 = np.random.uniform(-1, 1, (1,))
+
+        exe.arg_arrays[0][:] = val1
+        exe.arg_arrays[1][:] = val2
+        exe.arg_arrays[2][:] = val3
+        outputs = exe.forward(is_train=False)[0].asnumpy()
+
+        conv_exe = conv.simple_bind(mx.cpu(), data=shape, grad_req='null')  # same conv, without the reshape
+        conv_exe.arg_arrays[0][:] = val1
+        conv_exe.arg_arrays[1][:] = val2
+        conv_exe.arg_arrays[2][:] = val3
+        data_npy = conv_exe.forward(is_train=False)[0].asnumpy()
+        assert_almost_equal(outputs, data_npy.reshape(dst_shape))
+
+    # Test mkldnn reshape (Using shape); every case is a real tuple, including the 1-D one
+    test_cases = [(256,), (16, 16), (4, 4, 16), (4, 4, 4, 4)]
+    for test_case in test_cases:
+        test_reshape_after_conv(test_case)
+
 
 @with_seed()
 def test_reshape_before_conv():
@@ -440,4 +472,7 @@ def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
     custom = mx.symbol.Custom(name='custom', data=conv, op_type='custom')
     exec1 = custom.bind(mx.cpu(), args={'data': mx.nd.ones([10,3,96,96]), 'conv_weight': mx.nd.ones([8,3,5,5])})
     exec1.forward()[0].wait_to_read()
-    
+
+
+if __name__ == '__main__':
+    install.test_mkldnn_install()
diff --git a/tests/python/mkl/test_mkldnn_install.py b/tests/python/mkl/test_mkldnn_install.py
new file mode 100644
index 000000000000..c2f26df72f2e
--- /dev/null
+++ b/tests/python/mkl/test_mkldnn_install.py
@@ -0,0 +1,56 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+MKL-DNN related test cases
+"""
+
+import sys
+import os
+import logging
+
+
+def test_mkldnn_install():
+    """
+    This test will verify that MXNet is built/installed correctly when
+    compiled with Intel MKL-DNN library. The method will try to import
+    the mxnet module and see if the mkldnn library is mapped to this
+    process's address space.
+    """
+    logging.basicConfig(level=logging.INFO)
+
+    if not sys.platform.startswith('linux'):
+        logging.info("Bypass mkldnn install test for non-Linux OS")
+        return
+
+    try:
+        #pylint: disable=unused-variable
+        import mxnet as mx
+    except (ImportError, OSError) as e:
+        assert 0, "Import mxnet error: %s. Please double check your build/" \
+            "install steps or environment variable settings" % str(e)
+
+    # Scan this process's memory map directly instead of shelling out to cat/grep
+    with open("/proc/%d/maps" % os.getpid()) as maps:
+        mkldnn_mapped = any("libmkldnn" in line for line in maps)
+
+    if mkldnn_mapped:
+        logging.info("MXNet is built/installed correctly with MKL-DNN")
+    else:
+        assert 0, "MXNet is built/installed incorrectly with MKL-DNN, please " \
+            "double check your build/install steps or environment " \
+            "variable settings"
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 518b69626246..3ff4b69302fb 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -26,6 +26,7 @@
 from mxnet.module import Module
 from mxnet.io import NDArrayIter
 import unittest
+import operator
 
 def is_test_for_gpu():
     return mx.current_context().device_type == 'gpu'
@@ -278,8 +279,15 @@ def check_quantized_pooling(data_shape, kernel, pool_type, pad, stride, global_p
 def test_quantized_fc():
     def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         if mx.current_context().device_type != 'gpu':
-            print('skipped testing quantized_fc on cpu since it is not supported yet')
-            return
+            hasMKL = False;
+            for key in os.environ.keys():
+                if operator.eq(key, "BUILD_TAG"):
+                    if os.environ['BUILD_TAG'].find("MKL") != -1:
+                        hasMKL = True
+                    break
+            if hasMKL == False:
+                print('skipped testing quantized_fc on cpu since s8u8s32 is only supported by MKL BLAS library')
+                return
         elif qdtype == 'uint8' and is_test_for_gpu():
             print('skipped testing quantized_fc for gpu uint8 since it is not supported yet')
             return
@@ -291,16 +299,16 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         fc_fp32_exe = fc_fp32.simple_bind(ctx=mx.current_context(), grad_req='null')
         if qdtype == 'uint8':
             data_low = 0.0
-            data_high = 127.0
+            data_high = 63.0
         else:
-            data_low = -127.0
-            data_high = 127.0
+            data_low = -63.0
+            data_high = 63.0
         fc_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=data_shape).astype('int32')
-        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+        fc_fp32_exe.arg_dict[arg_names[1]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                      shape=arg_shapes[1]).astype('int32')
         if not no_bias:
-            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=-127.0, high=127.0,
+            fc_fp32_exe.arg_dict[arg_names[2]][:] = mx.nd.random.uniform(low=data_low, high=data_high,
                                                                          shape=arg_shapes[2]).astype('int32')
         output = fc_fp32_exe.forward()[0]
 
@@ -343,6 +351,10 @@ def check_quantized_fc(data_shape, num_hidden, no_bias, qdtype, flatten=True):
         check_quantized_fc((32, 111, 2, 2), 100, True, qdtype)
         check_quantized_fc((32, 512, 2, 2), 100, False, qdtype)
         check_quantized_fc((32, 111, 2, 2), 100, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, False, qdtype)
+        check_quantized_fc((256, 2048, 2, 2), 800, True, qdtype)
+        check_quantized_fc((256, 111, 2, 2), 800, True, qdtype)
 
 @with_seed()
 def test_quantized_flatten():
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index 2f889845af3f..5b5aff31231b 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -21,6 +21,7 @@
 from mxnet.autograd import *
 from mxnet.test_utils import *
 from common import setup_module, with_seed, teardown
+from mxnet.test_utils import EnvManager
 
 
 def grad_and_loss(func, argnum=None):
@@ -120,8 +121,9 @@ def check_unary_func(x):
         autograd_assert(x, func=f_square, grad_func=f_square_grad)
     uniform = nd.uniform(shape=(4, 5))
     stypes = ['default', 'row_sparse', 'csr']
-    for stype in stypes:
-        check_unary_func(uniform.tostype(stype))
+    with EnvManager('MXNET_STORAGE_FALLBACK_LOG_VERBOSE', '0'):
+        for stype in stypes:
+            check_unary_func(uniform.tostype(stype))
 
 @with_seed()
 def test_binary_func():
@@ -138,11 +140,12 @@ def check_binary_func(x, y):
     uniform_x = nd.uniform(shape=(4, 5))
     uniform_y = nd.uniform(shape=(4, 5))
     stypes = ['default', 'row_sparse', 'csr']
-    for stype_x in stypes:
-        for stype_y in stypes:
-            x = uniform_x.tostype(stype_x)
-            y = uniform_y.tostype(stype_y)
-            check_binary_func(x, y)
+    with EnvManager('MXNET_STORAGE_FALLBACK_LOG_VERBOSE', '0'):
+        for stype_x in stypes:
+            for stype_y in stypes:
+                x = uniform_x.tostype(stype_x)
+                y = uniform_y.tostype(stype_y)
+                check_binary_func(x, y)
 
 
 @with_seed()
diff --git a/tests/python/unittest/test_dgl_graph.py b/tests/python/unittest/test_dgl_graph.py
index 069fef6e32f0..e24cf4deb756 100644
--- a/tests/python/unittest/test_dgl_graph.py
+++ b/tests/python/unittest/test_dgl_graph.py
@@ -32,15 +32,12 @@ def check_uniform(out, num_hops, max_num_vertices):
     layer = out[2]
     # check sample_id
     assert (len(sample_id) == max_num_vertices+1)
-    count = 0
-    for data in sample_id:
-        if data != -1:
-            count = count + 1
-    assert (mx.nd.array([count-1], dtype=np.int64) == sample_id[-1])
+    num_vertices = sample_id[-1].asnumpy()[0]
     # check sub_csr
     sub_csr.check_format(full_check=True)
+    assert np.all((sub_csr.indptr[num_vertices:] == sub_csr.indptr[num_vertices]).asnumpy())
     # check layer
-    for data in layer:
+    for data in layer[:num_vertices]:
         assert(data <= num_hops)
 
 def check_non_uniform(out, num_hops, max_num_vertices):
@@ -50,17 +47,14 @@ def check_non_uniform(out, num_hops, max_num_vertices):
     layer = out[3]
     # check sample_id
     assert (len(sample_id) == max_num_vertices+1)
-    count = 0
-    for data in sample_id:
-        if data != -1:
-            count = count + 1
-    assert (mx.nd.array([count-1], dtype=np.int64) == sample_id[-1])
+    num_vertices = sample_id[-1].asnumpy()[0]
     # check sub_csr
     sub_csr.check_format(full_check=True)
+    assert np.all((sub_csr.indptr[num_vertices:] == sub_csr.indptr[num_vertices]).asnumpy())
     # check prob
     assert (len(prob) == max_num_vertices)
     # check layer
-    for data in layer:
+    for data in layer[:num_vertices]:
         assert(data <= num_hops)
 
 def check_compact(csr, id_arr, num_nodes):
@@ -101,9 +95,9 @@ def test_uniform_sample():
     check_compact(out[1], out[0], num_nodes)
 
     seed = mx.nd.array([0], dtype=np.int64)
-    out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=1, max_num_vertices=4)
+    out = mx.nd.contrib.dgl_csr_neighbor_uniform_sample(a, seed, num_args=2, num_hops=2, num_neighbor=1, max_num_vertices=3)
     assert (len(out) == 3)
-    check_uniform(out, num_hops=2, max_num_vertices=4)
+    check_uniform(out, num_hops=2, max_num_vertices=3)
     num_nodes = out[0][-1].asnumpy()
     assert num_nodes > 0
     assert num_nodes < len(out[0])
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py
index a3ba222c71d8..353a819ddbf6 100644
--- a/tests/python/unittest/test_gluon_data.py
+++ b/tests/python/unittest/test_gluon_data.py
@@ -77,6 +77,10 @@ def _dataset_transform_fn(x, y):
     """Named transform function since lambda function cannot be pickled."""
     return x, y
 
+def _dataset_transform_first_fn(x):
+    """Named identity transform for transform_first, since lambda function cannot be pickled."""
+    return x
+
 @with_seed()
 def test_recordimage_dataset_with_data_loader_multiworker():
     recfile = prepare_record()
@@ -95,17 +99,13 @@ def test_recordimage_dataset_with_data_loader_multiworker():
         assert x.shape[0] == 1 and x.shape[3] == 3
         assert y.asscalar() == i
 
-    # try limit recursion depth
-    import sys
-    old_limit = sys.getrecursionlimit()
-    sys.setrecursionlimit(500)  # this should be smaller than any default value used in python
-    dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn)
+    # with transform_first
+    dataset = gluon.data.vision.ImageRecordDataset(recfile).transform_first(_dataset_transform_first_fn)
     loader = gluon.data.DataLoader(dataset, 1, num_workers=5)
 
     for i, (x, y) in enumerate(loader):
         assert x.shape[0] == 1 and x.shape[3] == 3
         assert y.asscalar() == i
-    sys.setrecursionlimit(old_limit)
 
 @with_seed()
 def test_sampler():
@@ -156,9 +156,10 @@ def __getitem__(self, key):
 @with_seed()
 def test_multi_worker():
     data = Dataset()
-    loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5)
-    for i, batch in enumerate(loader):
-        assert (batch.asnumpy() == i).all()
+    for thread_pool in [True, False]:
+        loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5, thread_pool=thread_pool)
+        for i, batch in enumerate(loader):
+            assert (batch.asnumpy() == i).all()
 
 class _Dummy(Dataset):
     """Dummy dataset for randomized shape arrays."""
diff --git a/tests/python/unittest/test_gluon_trainer.py b/tests/python/unittest/test_gluon_trainer.py
index 72c01acb2652..985c38c31356 100644
--- a/tests/python/unittest/test_gluon_trainer.py
+++ b/tests/python/unittest/test_gluon_trainer.py
@@ -55,16 +55,15 @@ def dict_equ(a, b):
             y.backward()
     trainer.step(1)
 
+    assert trainer._optimizer.param_dict == trainer._optimizer.param_dict
     assert (x.data(mx.cpu(1)).asnumpy() == -2).all()
 
     x.lr_mult = 0.5
-
     with mx.autograd.record():
         for w in x.list_data():
             y = w + 1
             y.backward()
     trainer.step(1)
-
     assert (x.data(mx.cpu(1)).asnumpy() == -4).all()
 
     trainer.save_states('test_trainer.states')
@@ -74,12 +73,13 @@ def dict_equ(a, b):
     if trainer._update_on_kvstore:
         dict_equ(trainer._kvstore._updater.states, states)
         assert trainer._optimizer == trainer._kvstore._updater.optimizer
+        # invalid usage of update and allreduce_grads if update_on_kvstore
+        assert_raises(AssertionError, trainer.update, 1)
+        assert_raises(AssertionError, trainer.allreduce_grads)
     else:
         for updater in trainer._updaters:
             dict_equ(updater.states, states)
         assert trainer._optimizer == trainer._updaters[0].optimizer
-    assert_raises(AssertionError, trainer.update, 1)
-    assert_raises(AssertionError, trainer.allreduce_grads)
 
     x = gluon.Parameter('x', shape=(10,))
     x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
@@ -193,8 +193,10 @@ def check_trainer_reset_kv(kv):
         # load would reset kvstore
         mx.nd.waitall()
         params.load('test_trainer_reset_kv.params')
-        assert trainer._kvstore is None
-        assert trainer._kv_initialized is False
+        if trainer._update_on_kvstore:
+            # drop kvstore state if new parameters are loaded
+            assert trainer._kvstore is None
+            assert trainer._kv_initialized is False
         with mx.autograd.record():
             for w in x.list_data():
                 y = w + 1
@@ -209,28 +211,74 @@ def check_trainer_reset_kv(kv):
 
 @with_seed()
 def test_trainer_sparse_kv():
-    def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv):
+    def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected):
         params = gluon.ParameterDict()
         x = params.get('x', shape=(10,1), lr_mult=1.0, stype=stype, grad_stype=grad_stype)
         params.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
-        trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
+        trainer = gluon.Trainer(params, 'sgd', {'learning_rate': 0.1},
+                                kvstore=kv, update_on_kvstore=update_on_kv)
         all_rows = mx.nd.arange(0, 10, ctx=mx.cpu(0))
-        ws = x.list_data() if stype == 'default' else x.list_row_sparse_data(all_rows)
+        try:
+            ws = x.list_data() if stype == 'default' else x.list_row_sparse_data(all_rows)
+            with mx.autograd.record():
+                for w in ws:
+                    y = w + 1
+                    y.backward()
+            trainer.step(1)
+            assert trainer._kvstore.type == kv
+            assert trainer._kv_initialized
+            assert trainer._update_on_kvstore is expected
+            # the updated parameter should be based on the loaded checkpoint
+            mx.nd.waitall()
+            updated_w = x.data(mx.cpu(0)) if stype == 'default' else x.row_sparse_data(all_rows)
+            assert (updated_w == -0.2).asnumpy().all()
+        except Exception as err:
+            assert isinstance(err, expected)
+
+    kvs = ['local', 'device']
+    for kv in kvs:
+        check_trainer_sparse_kv(kv, 'default', 'default', True, True)
+        check_trainer_sparse_kv(kv, 'default', 'default', False, False)
+        check_trainer_sparse_kv(kv, 'default', 'default', None, True)
+        check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False)
+        check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True)
+        check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False)
+        check_trainer_sparse_kv(kv, 'row_sparse', 'row_sparse', None, True)
+        check_trainer_sparse_kv(kv, 'row_sparse', 'row_sparse', False, ValueError)
+
+@with_seed()
+def test_trainer_lr_sched():
+    x = gluon.Parameter('x', shape=(10,))
+    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+    freq = 2
+    factor = 0.1
+    lr = 1
+    lr_sched = mx.lr_scheduler.FactorScheduler(freq, factor=factor, base_lr=lr)
+    trainer = gluon.Trainer([x], 'sgd', {'learning_rate': lr, 'lr_scheduler': lr_sched})
+    for i in range(10):
         with mx.autograd.record():
-            for w in ws:
+            for w in x.list_data():
                 y = w + 1
                 y.backward()
         trainer.step(1)
-        assert trainer._kvstore.type == kv
-        assert trainer._kv_initialized
-        assert trainer._update_on_kvstore is update_on_kv
-        # the updated parameter should be based on the loaded checkpoint
-        mx.nd.waitall()
-        updated_w = x.data(mx.cpu(0)) if stype == 'default' else x.row_sparse_data(all_rows)
-        assert (updated_w == -0.2).asnumpy().all()
+        if i % freq == 0:
+            assert trainer.learning_rate == lr, (lr, trainer.learning_rate, i)
+            lr *= factor
+    mx.nd.waitall()
 
-    kvs = ['local', 'device']
-    for kv in kvs:
-        check_trainer_sparse_kv(kv, 'default', 'default', True)
-        check_trainer_sparse_kv(kv, 'default', 'row_sparse', False)
-        check_trainer_sparse_kv(kv, 'row_sparse', 'row_sparse', True)
+@with_seed()
+def test_trainer_invalid_lr_sched():
+    """Trainer.step must raise ValueError when an lr_scheduler is used with update_on_kvstore=False."""
+    x = gluon.Parameter('x', shape=(10,))
+    x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros')
+    base_lr = 1
+    scheduler = mx.lr_scheduler.FactorScheduler(2, factor=0.1, base_lr=base_lr)
+    invalid_trainer = gluon.Trainer([x], 'sgd', {'learning_rate': base_lr, 'lr_scheduler': scheduler},
+                                    update_on_kvstore=False)
+    # one recorded backward pass per device copy, so step() has gradients to work with
+    with mx.autograd.record():
+        for w in x.list_data():
+            y = w + 1
+            y.backward()
+    assert_raises(ValueError, invalid_trainer.step, 1)
+    mx.nd.waitall()
diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py
index 26277d2acff5..2821c4bbae3c 100644
--- a/tests/python/unittest/test_metric.py
+++ b/tests/python/unittest/test_metric.py
@@ -18,6 +18,8 @@
 import mxnet as mx
 import numpy as np
 import json
+from common import with_seed
+from copy import deepcopy
 
 def check_metric(metric, *args, **kwargs):
     metric = mx.metric.create(metric, *args, **kwargs)
@@ -37,6 +39,67 @@ def test_metrics():
     composite = mx.metric.create(['acc', 'f1'])
     check_metric(composite)
 
+def _check_global_metric(metric, *args, **kwargs):
+    def _create_pred_label():
+        if use_same_shape:
+            pred = mx.nd.random.uniform(0, 1, shape=shape)
+            label = mx.nd.random.uniform(0, 1, shape=shape)
+        else:
+            # Make a random prediction
+            idx = np.random.rand(*shape).argsort(1)
+            pred = mx.nd.array(1 - 0.1 * idx)
+            # Label is half 1 and half 0
+            # Setting all 0s or all 1s would make either
+            # MCC or F1 metrics always produce 0
+            label = mx.nd.ones(shape[0])
+            label[:shape[0] // 2] = 0
+        return pred, label
+
+    shape = kwargs.pop('shape', (10,10))
+    use_same_shape = kwargs.pop('use_same_shape', False)
+    m1 = mx.metric.create(metric, *args, **kwargs)
+    m2 = deepcopy(m1)
+    # check that global stats are not reset when calling
+    # reset_local()
+    for i in range(10):
+        pred, label = _create_pred_label()
+        m1.update([label], [pred])
+        m1.reset_local()
+        m2.update([label], [pred])
+    assert m1.get_global() == m2.get()
+
+    # check that reset_local() properly resets the local state
+    m1.reset_local()
+    m2.reset()
+    pred, label = _create_pred_label()
+    m1.update([label], [pred])
+    m1.reset_local()
+    pred, label = _create_pred_label()
+    m1.update([label], [pred])
+    m2.update([label], [pred])
+    assert m1.get() == m2.get()
+
+@with_seed()
+def test_global_metric():
+    _check_global_metric('acc')
+    _check_global_metric('TopKAccuracy', top_k=3)
+    _check_global_metric('f1', shape=(10,2))
+    _check_global_metric('f1', shape=(10,2), average='micro')
+    _check_global_metric('mcc', shape=(10,2))
+    _check_global_metric('mcc', shape=(10,2), average='micro')
+    _check_global_metric('perplexity', -1)
+    _check_global_metric('pearsonr', use_same_shape=True)
+    _check_global_metric('nll_loss')
+    _check_global_metric('loss')
+    _check_global_metric('ce')
+    _check_global_metric('mae', use_same_shape=True)
+    _check_global_metric('mse', use_same_shape=True)
+    _check_global_metric('rmse', use_same_shape=True)
+    def custom_metric(label, pred):
+        return np.mean(np.abs(label-pred))
+    _check_global_metric(custom_metric, use_same_shape=True)
+    _check_global_metric(['acc', 'f1'], shape=(10,2))
+
 def test_nll_loss():
     metric = mx.metric.create('nll_loss')
     pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]])
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 09157396f839..cb19fd869d30 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -17,6 +17,7 @@
 
 # pylint: skip-file
 from __future__ import print_function
+from __future__ import division
 import numpy as np
 import mxnet as mx
 import copy
@@ -1602,33 +1603,33 @@ def check_batchnorm_training(stype):
 def test_convolution_grouping():
     for dim in [1, 2, 3]:
         num_filter = 4
-        num_group = 2
-        kernel = (3,) * dim
-        shape = (1, 4) + (9,) * dim
-
-        x = mx.sym.Variable('x')
-        w = mx.sym.Variable('w')
-        b = mx.sym.Variable('b')
-        y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel)
-        xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
-        wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
-        bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
-        y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
-                                                num_filter=num_filter//num_group, kernel=kernel)
-                           for i in range(num_group)])
-
-        exe1 = y1.simple_bind(default_context(), x=shape)
-        exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
-        for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
-            arr1[:] = np.float32(np.random.normal(size=arr1.shape))
-            arr2[:] = arr1
-        exe1.forward(is_train=True)
-        exe1.backward(exe1.outputs[0])
-        exe2.forward(is_train=True)
-        exe2.backward(exe2.outputs[0])
-
-        for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
-            np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
+        for num_group in [1, 2]:
+            kernel = (3,) * dim
+            shape = (1, 4) + (9,) * dim
+
+            x = mx.sym.Variable('x')
+            w = mx.sym.Variable('w')
+            b = mx.sym.Variable('b')
+            y1 = mx.sym.Convolution(data=x, weight=w, bias=b, num_filter=num_filter, num_group=num_group, kernel=kernel)
+            xslice = mx.sym.SliceChannel(data=x, num_outputs=num_group, axis=1)
+            wslice = mx.sym.SliceChannel(data=w, num_outputs=num_group, axis=0)
+            bslice = mx.sym.SliceChannel(data=b, num_outputs=num_group, axis=0)
+            y2 = mx.sym.Concat(*[mx.sym.Convolution(data=xslice[i], weight=wslice[i], bias=bslice[i],
+                                                    num_filter=num_filter//num_group, kernel=kernel)
+                            for i in range(num_group)])
+
+            exe1 = y1.simple_bind(default_context(), x=shape)
+            exe2 = y2.simple_bind(default_context(), x=shape, w=(num_filter, shape[1]//num_group) + kernel, b=(num_filter,))
+            for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays):
+                arr1[:] = np.float32(np.random.normal(size=arr1.shape))
+                arr2[:] = arr1
+            exe1.forward(is_train=True)
+            exe1.backward(exe1.outputs[0])
+            exe2.forward(is_train=True)
+            exe2.backward(exe2.outputs[0])
+
+            for arr1, arr2 in zip(exe1.outputs + exe1.grad_arrays, exe2.outputs + exe2.grad_arrays):
+                np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3)
 
 
 @unittest.skip("Flaky test /~https://github.com/apache/incubator-mxnet/issues/12203")
@@ -4390,6 +4391,7 @@ def test_1d_cond():
     test_1d_cond()
 
 
+@unittest.skip("Flaky test. Tracked in /~https://github.com/apache/incubator-mxnet/issues/13600")
 @with_seed()
 def test_softmin():
     for ndim in range(1, 5):
@@ -6772,7 +6774,7 @@ def get_output_names_callback(name, arr):
 
 @with_seed()
 def test_activation():
-    shape=(9, 10)
+    shapes = [(9,), (9, 10), (9, 10, 10), (1, 9, 10, 10)]
     dtype_l = [np.float64, np.float32, np.float16]
     rtol_l = [1e-7, 1e-6, 1e-2]
     atol_l = [1e-7, 1e-6, 1e-2]
@@ -6803,17 +6805,19 @@ def test_activation():
     }
     # Loop over operators
     for name, op in unary_ops.items():
-        # Loop over dtype's
-        for ind in range(len(dtype_l)):
-            dtype = dtype_l[ind]
-            rtol = rtol_l[ind]
-            atol = atol_l[ind]
-            compare_forw_backw_unary_op(
-                name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol,
-                dtype)
-        # Finite difference testing
-        finite_diff_unary_op(
-            name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps)
+        # Loop over shapes
+        for shape in shapes:
+            # Loop over dtype's
+            for ind in range(len(dtype_l)):
+                dtype = dtype_l[ind]
+                rtol = rtol_l[ind]
+                atol = atol_l[ind]
+                compare_forw_backw_unary_op(
+                    name, op[0], op[1], op[2], shape, op[3], op[4], rtol, atol,
+                    dtype)
+            # Finite difference testing
+            finite_diff_unary_op(
+                name, op[0], shape, op[3], op[4], rtol_fd, atol_fd, num_eps)
 
 @with_seed()
 def test_ravel():
@@ -6897,14 +6901,16 @@ def bilinear_interpolate(bottom, height, width, y, x):
                ]
         return val, grad
 
-    def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio, dy):
+    def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio,
+            position_sensitive, dy):
         N, C, H, W = data.shape
         R = rois.shape[0]
         PH, PW = pooled_size
         assert len(rois.shape) == 2
         assert rois.shape[1] == 5
 
-        out = np.zeros((R, C, PH, PW))
+        C_out = C // PH // PW if position_sensitive else C
+        out = np.zeros((R, C_out, PH, PW))
         dx = np.zeros_like(data)
         drois = np.zeros_like(rois)
 
@@ -6922,24 +6928,25 @@ def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_r
                 roi_bin_grid_h = int(np.ceil(roi_h * 1.0 / PH))
                 roi_bin_grid_w = int(np.ceil(roi_w * 1.0 / PW))
             count = roi_bin_grid_h * roi_bin_grid_w
-            for c in range(C):
+            for c in range(C_out):
                 for ph in range(PH):
                     for pw in range(PW):
                         val = 0.0
+                        c_in = c * PH * PW + ph * PW + pw if position_sensitive else c
                         for iy in range(roi_bin_grid_h):
                             y = sh + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h
                             for ix in range(roi_bin_grid_w):
                                 x = sw + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w
-                                v, g = bilinear_interpolate(bdata[c], H, W, y, x)
+                                v, g = bilinear_interpolate(bdata[c_in], H, W, y, x)
                                 val += v
                                 # compute grad
                                 for qy, qx, qw in g:
-                                    dx[batch_ind, c, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count
+                                    dx[batch_ind, c_in, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count
 
                         out[r, c, ph, pw] = val * 1.0 / count
         return out, [dx, drois]
 
-    def test_roi_align_value(sampling_ratio=0):
+    def test_roi_align_value(sampling_ratio=0, position_sensitive=False):
         ctx=default_context()
         dtype = np.float32
 
@@ -6948,6 +6955,7 @@ def test_roi_align_value(sampling_ratio=0):
         assert H == W
         R = 7
         pooled_size = (3, 4)
+        C = C * pooled_size[0] * pooled_size[1] if position_sensitive else C
 
         spatial_scale = H * 1.0 / dlen
         data = mx.nd.array(np.arange(N*C*W*H).reshape((N,C,H,W)), ctx=ctx, dtype = dtype)
@@ -6962,11 +6970,14 @@ def test_roi_align_value(sampling_ratio=0):
         rois.attach_grad()
         with mx.autograd.record():
             output = mx.nd.contrib.ROIAlign(data, rois, pooled_size=pooled_size,
-                    spatial_scale=spatial_scale, sample_ratio=sampling_ratio)
-        dy = mx.nd.random.uniform(-1, 1, (R, C) + pooled_size, ctx=ctx, dtype = dtype)
+                    spatial_scale=spatial_scale, sample_ratio=sampling_ratio,
+                    position_sensitive=position_sensitive)
+        C_out = C // pooled_size[0] // pooled_size[1] if position_sensitive else C
+        dy = mx.nd.random.uniform(-1, 1, (R, C_out) + pooled_size, ctx=ctx, dtype = dtype)
         output.backward(dy)
         real_output, [dx, drois] = roialign_forward_backward(data.asnumpy(), rois.asnumpy(), pooled_size,
-                                                             spatial_scale, sampling_ratio, dy.asnumpy())
+                                                             spatial_scale, sampling_ratio,
+                                                             position_sensitive, dy.asnumpy())
         assert np.allclose(output.asnumpy(), real_output)
         # It seems that the precision between Cfloat and Pyfloat is different.
         assert np.allclose(data.grad.asnumpy(), dx, atol = 1e-5), np.abs(data.grad.asnumpy() - dx).max()
@@ -6992,7 +7003,8 @@ def test_roi_align_autograd(sampling_ratio=0):
                                numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx)
 
     test_roi_align_value()
-    test_roi_align_value(2)
+    test_roi_align_value(sampling_ratio=2)
+    test_roi_align_value(position_sensitive=True)
     test_roi_align_autograd()
 
 @with_seed()
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index acf24ee1b794..eb33f9b5217e 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -506,12 +506,11 @@ def test_ftml():
 class PyAdam(mx.optimizer.Optimizer):
     """python reference implemenation of adam"""
     def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
-                 decay_factor=(1 - 1e-8), lazy_update=True, **kwargs):
+                 lazy_update=True, **kwargs):
         super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
-        self.decay_factor = decay_factor
         self.lazy_update = lazy_update
 
     def create_state(self, index, weight):
@@ -614,7 +613,6 @@ def test_adam():
                                           dtype, w_stype='default', g_stype='row_sparse',
                                           rtol=1e-4, atol=2e-5)
 
-
 # AdaMax
 class PyAdamax(mx.optimizer.Optimizer):
     """The python reference of AdaMax optimizer.
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 57808248b081..05175bb435f2 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -2306,6 +2306,48 @@ def check_sparse_quadratic_function(a, b, c, expected_stype):
     check_sparse_quadratic_function(a, b, 0.0, 'csr')
     check_sparse_quadratic_function(a, b, 1.0, 'default')
 
+def test_reshape_backward_fallback():
+    """
+     out
+     |  \
+    w_x  x
+     /
+    w
+    in which x is a sparse tensor.
+    Due to sparse gradient optimization in sym.dot, grad(w_x) is sparse.
+    Though sym.reshape itself does not have sparse version,
+    if we somehow make grad(w) sparse as well, e.g.,
+        - by setting args_grad in symbol.bind
+        - or, we can have out_y = sym.dot(sparse_y, w), then grad(w) will be inferred as sparse
+    reshape backward (from w_x to w) needs to understand how to handle sparse inputs.
+    """
+    ctx = default_context()
+    w_shape = (12, 4)
+    w_x_shape = (1, 48)
+    x_nd = rand_ndarray((4, 1), 'csr')
+
+    w_nd = rand_ndarray(w_shape)
+
+    w_x_nd = w_nd.reshape(w_x_shape)
+    out_x_nd = mx.nd.dot(x_nd, w_x_nd)
+
+    w_x_backward_grad = mx.nd.dot(x_nd, out_x_nd, transpose_a=True).asnumpy()
+    expected_grad_nd = w_x_backward_grad.reshape(w_shape)
+
+    x = mx.sym.Variable('x', stype='csr')
+    w = mx.sym.Variable('w')
+
+    w_x = mx.sym.reshape(w, w_x_shape, name="w_x")
+    out = mx.sym.sparse.dot(x, w_x, name='out_x')
+
+    grad_w_nd = rand_ndarray(w_shape, 'row_sparse')
+    executor = out.bind(ctx=ctx, args={"x": x_nd, "w": w_nd},
+                        args_grad={"w": grad_w_nd})
+    executor.forward(is_train=True)
+    executor.backward(out_x_nd)
+
+    assert_almost_equal(grad_w_nd.asnumpy(), expected_grad_nd)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/requirements.txt b/tests/requirements.txt
index 3ca696b288c9..f64f7ffb6705 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -2,3 +2,4 @@
 mock
 nose
 nose-timer
+ipython
diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py
index 54adcd5c53d1..8d8ef398d708 100644
--- a/tests/tutorials/test_tutorials.py
+++ b/tests/tutorials/test_tutorials.py
@@ -73,6 +73,9 @@ def test_basic_module():
 def test_basic_data():
     assert _test_tutorial_nb('basic/data')
 
+def test_basic_reshape_transpose():
+    assert _test_tutorial_nb('basic/reshape_transpose')
+
 def test_gluon_customop():
     assert _test_tutorial_nb('gluon/customop')
 
diff --git a/tools/build/build_lib.sh b/tools/build/build_lib.sh
new file mode 100755
index 000000000000..032fcb92045f
--- /dev/null
+++ b/tools/build/build_lib.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script builds the libraries of mxnet.
+make_config=config/pip_${PLATFORM}_${VARIANT}.mk
+if [[ ! -f $make_config ]]; then
+    >&2 echo "Couldn't find make config $make_config for the current settings."
+    exit 1
+fi
+
+git clone --recursive /~https://github.com/apache/incubator-mxnet mxnet-build
+
+>&2 echo "Now building mxnet modules..."
+cp $make_config mxnet-build/config.mk
+
+cd mxnet-build
+
+make DEPS_PATH=$DEPS_PATH DMLCCORE
+make DEPS_PATH=$DEPS_PATH $PWD/3rdparty/tvm/nnvm/lib/libnnvm.a
+make DEPS_PATH=$DEPS_PATH PSLITE
+
+if [[ $VARIANT == *mkl ]]; then
+    MKLDNN_LICENSE='license.txt'
+    if [[ $PLATFORM == 'linux' ]]; then
+        IOMP_LIBFILE='libiomp5.so'
+        MKLML_LIBFILE='libmklml_intel.so'
+        MKLDNN_LIBFILE='libmkldnn.so.0'
+    else
+        IOMP_LIBFILE='libiomp5.dylib'
+        MKLML_LIBFILE='libmklml.dylib'
+        MKLDNN_LIBFILE='libmkldnn.0.dylib'
+    fi
+    make DEPS_PATH=$DEPS_PATH mkldnn
+
+    >&2 echo "Copying MKL license."
+    cp 3rdparty/mkldnn/LICENSE ./MKLML_LICENSE
+
+    # Both .so and .dylib names are listed; -f silences the ones missing on this platform.
+    rm -f lib/libmkldnn.{so,dylib}
+    rm -f lib/libmkldnn.0.*.dylib
+    rm -f lib/libmkldnn.so.0.*
+fi
+
+>&2 echo "Now building mxnet..."
+make DEPS_PATH=$DEPS_PATH || exit 1;
+
+if [[ $PLATFORM == 'linux' ]]; then
+    cp -L /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so lib/libgfortran.so.3
+    cp -L /usr/lib/x86_64-linux-gnu/libquadmath.so.0 lib/libquadmath.so.0
+fi
+
+# Print the linked objects on libmxnet.so
+>&2 echo "Checking linked objects on libmxnet.so..."
+if [[ ! -z $(command -v readelf) ]]; then
+    readelf -d lib/libmxnet.so
+    strip --strip-unneeded lib/libmxnet.so
+elif [[ ! -z $(command -v otool) ]]; then
+    otool -L lib/libmxnet.so
+    strip -u -r -x lib/libmxnet.so
+else
+    >&2 echo "Not available"
+fi
+
+cd ../
diff --git a/tools/build/build_wheel.sh b/tools/build/build_wheel.sh
new file mode 100755
index 000000000000..a79634117c21
--- /dev/null
+++ b/tools/build/build_wheel.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script builds the wheel for binary distribution and performs sanity check.
+
+cd mxnet-build
+echo $(git rev-parse HEAD) >> python/mxnet/COMMIT_HASH
+cd -
+
+# Make wheel for testing
+python setup.py bdist_wheel
+
+wheel_name=$(ls -t dist | head -n 1)
+pip install -U --user --force-reinstall dist/$wheel_name
+python sanity_test.py
diff --git a/tools/dependencies/make_shared_dependencies.sh b/tools/dependencies/make_shared_dependencies.sh
new file mode 100755
index 000000000000..d678fddcc02d
--- /dev/null
+++ b/tools/dependencies/make_shared_dependencies.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This is a convenience script for calling the build scripts of all dependency libraries.
+# Environment variables should be set beforehand.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
+
+
+if [[ ! $PLATFORM == 'darwin' ]]; then
+    source $DIR/openblas.sh
+fi
+source $DIR/libz.sh
+source $DIR/libturbojpeg.sh
+source $DIR/libpng.sh
+source $DIR/libtiff.sh
+source $DIR/openssl.sh
+source $DIR/curl.sh
+source $DIR/eigen.sh
+source $DIR/opencv.sh
+source $DIR/protobuf.sh
+source $DIR/cityhash.sh
+source $DIR/zmq.sh
+source $DIR/lz4.sh
diff --git a/tools/dependencies/opencv.sh b/tools/dependencies/opencv.sh
index 98ff115f1765..99d0ecb71c36 100755
--- a/tools/dependencies/opencv.sh
+++ b/tools/dependencies/opencv.sh
@@ -20,6 +20,7 @@
 # This script builds the static library of opencv that can be used as dependency of mxnet.
 # It expects openblas, libjpeg, libpng, libtiff, eigen, etc., to be in $DEPS_PATH.
 OPENCV_VERSION=3.4.2
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 if [[ $PLATFORM == 'linux' ]]; then
     OPENCV_LAPACK_OPTIONS=" \
           -D OpenBLAS_HOME=$DEPS_PATH \
@@ -181,7 +182,7 @@ if [[ ! -f $DEPS_PATH/lib/libopencv_core.a ]] || [[ ! -f $DEPS_PATH/lib/libopenc
           -D CMAKE_BUILD_TYPE=RELEASE \
           -D CMAKE_INSTALL_PREFIX=$DEPS_PATH ..
     if [[ $PLATFORM == 'linux' ]]; then
-        cp $DEPS_PATH/../patch/opencv_lapack.h ./
+        cp $DIR/patch/opencv_lapack.h ./
     fi
     make
     make install
diff --git a/tools/dependencies/patch/opencv_lapack.h b/tools/dependencies/patch/opencv_lapack.h
new file mode 100644
index 000000000000..97af9d67ea31
--- /dev/null
+++ b/tools/dependencies/patch/opencv_lapack.h
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+extern "C" {
+#include "cblas.h"
+#include "lapacke.h"
+}
diff --git a/tools/dependencies/protobuf.sh b/tools/dependencies/protobuf.sh
index dfa3d71f3750..1564701042af 100755
--- a/tools/dependencies/protobuf.sh
+++ b/tools/dependencies/protobuf.sh
@@ -39,3 +39,5 @@ if [[ ! -e $LIBPROTOBUF ]] || [[ ! -e $LIBPROTOC ]]; then
     make install
     cd -
 fi
+# Prefix the old value only when it is non-empty: a leading ':' is an empty entry, i.e. the cwd.
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib
diff --git a/tools/license_header.py b/tools/license_header.py
index 10ba8b909e70..199d56c7ee35 100755
--- a/tools/license_header.py
+++ b/tools/license_header.py
@@ -37,6 +37,7 @@
 from itertools import chain
 import logging
 import sys
+import subprocess
 
 # the default apache license
 _LICENSE = """Licensed to the Apache Software Foundation (ASF) under one
@@ -60,24 +61,35 @@
 _LICENSE_PATTERNS = ['Licensed to the Apache Software Foundation']
 
 # the folders or files that will be ignored
-_WHITE_LIST = ['R-package/',
+_WHITE_LIST = [
+               # Licensed under docker/Dockerfiles/License.md
                'docker/Dockerfiles',
+
+               # Git submodules under different licenses
                '3rdparty',
-               'src/operator/mkl/',
+
+               # Code shared with project by author - see file for details
                'src/operator/special_functions-inl.h',
+
+               # Code generated by scala-package, checked in, and verified
+               'scala-package/init-native/src/main/native/org_apache_mxnet_init_native_c_api.h',
+               'scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h',
+
+               # Licensed under Caffe header
                'src/operator/nn/pool.h',
                'src/operator/contrib/psroi_pooling-inl.h',
                'src/operator/contrib/nn/deformable_im2col.h',
                'src/operator/contrib/nn/deformable_im2col.cuh',
                'src/operator/nn/im2col.h',
                'src/operator/nn/im2col.cuh',
+
+               # Licenses in headers
+               'docs/_static/searchtools_custom.js',
+               'docs/_static/js/clipboard.js',
+               'docs/_static/js/clipboard.min.js',
+
+               # Licensed under 2-Clause BSD in header
                'example/ssd/dataset/pycocotools/coco.py',
-               'example/rcnn/rcnn/cython/setup.py',
-               'example/rcnn/rcnn/cython/nms_kernel.cu',
-               'prepare_mkl.sh',
-               'example/image-classification/predict-cpp/image-classification-predict.cc',
-               'src/operator/contrib/ctc_include/',
-               'julia/REQUIRE'
                ]
 
 # language extensions and the according commment mark
@@ -85,14 +97,28 @@
           '.pm':'#', '.scala':'*', '.cc':'*', '.sh':'#', '.cmake':'#',
           '.java':'*', '.sh':'#', '.cpp':'*', '.hpp':'*', '.c':'*',
           '.bat':'rem', '.pl':'#', '.m':'%', '.R':'#', '.mk':'#', '.cfg':'#',
-          '.t':'#', '.ps1':'#', '.jl':'#'}
+          '.t':'#', '.ps1':'#', '.jl':'#', '.clj':';;', '.pyx':'#', '.js':'*'}
 
 # Previous license header, which will be removed
 _OLD_LICENSE = re.compile('.*Copyright.*by Contributors')
 
-def _has_license(lines):
+
+def get_mxnet_root():
+    curpath = os.path.abspath(os.path.dirname(__file__))
+    def is_mxnet_root(path: str) -> bool:
+        return os.path.exists(os.path.join(path, ".mxnet_root"))
+    while not is_mxnet_root(curpath):
+        parent = os.path.abspath(os.path.join(curpath, os.pardir))
+        if parent == curpath:
+            raise RuntimeError("Got to the root and couldn't find a parent folder with .mxnet_root")
+        curpath = parent
+    return curpath
+
+
+def _lines_have_license(lines):
     return any([any([p in l for p in _LICENSE_PATTERNS]) for l in lines])
 
+
 def _get_license(comment_mark):
     if comment_mark == '*':
         body = '/*\n'
@@ -111,65 +137,88 @@ def _get_license(comment_mark):
     body += '\n'
     return body
 
-def _valid_file(fname, verbose=False):
+
+def should_have_license(fname):
     if any([l in fname for l in _WHITE_LIST]):
-        if verbose:
-            logging.info('skip ' + fname + ', it matches the white list')
+        logging.debug('skip ' + fname + ', it matches the white list')
         return False
     _, ext = os.path.splitext(fname)
     if ext not in _LANGS:
-        if verbose:
-            logging.info('skip ' + fname + ', unknown file extension')
+        logging.debug('skip ' + fname + ', unknown file extension')
         return False
     return True
 
-def process_file(fname, action, verbose=True):
-    if not _valid_file(fname, verbose):
+
+def file_has_license(fname):
+    if not should_have_license(fname):
         return True
     try:
         with open(fname, 'r', encoding="utf-8") as f:
             lines = f.readlines()
-        if not lines:
-            return True
-        if _has_license(lines):
+        if not lines or _lines_have_license(lines):
             return True
-        elif action == 'check':
+        else:
+            logging.error("File %s doesn't have a license", fname)
             return False
-        _, ext = os.path.splitext(fname)
-        with open(fname, 'w', encoding="utf-8") as f:
-            # shebang line
-            if lines[0].startswith('#!'):
-                f.write(lines[0].rstrip()+'\n\n')
-                del lines[0]
-            f.write(_get_license(_LANGS[ext]))
-            for l in lines:
-                f.write(l.rstrip()+'\n')
-        logging.info('added license header to ' + fname)
     except UnicodeError:
         return True
     return True
 
-def process_folder(root, action):
-    excepts = []
-    for root, _, files in os.walk(root):
-        for f in files:
-            fname = os.path.normpath(os.path.join(root, f))
-            if not process_file(fname, action):
-                excepts.append(fname)
-    if action == 'check' and excepts:
-        logging.warning('The following files do not contain a valid license, '+
-                        'you can use `tools/license_header.py add [file]` to add'+
-                        'them automatically: ')
-        for x in excepts:
-            logging.warning(x)
-        return False
-    return True
 
-if __name__ == '__main__':
-    logging.getLogger().setLevel(logging.INFO)
-    logging.basicConfig(format='%(asctime)-15s %(message)s')
+def file_add_license(fname):
+    """Prepend the license header to fname unless it is skipped, empty or already licensed."""
+    if not should_have_license(fname):
+        return
+    with open(fname, 'r', encoding="utf-8") as f:
+        lines = f.readlines()
+    if not lines or _lines_have_license(lines):  # empty file: lines[0] below would raise
+        return
+    _, ext = os.path.splitext(fname)
+    with open(fname, 'w', encoding="utf-8") as f:
+        # a shebang must stay the very first line, so it is written ahead of the license
+        if lines[0].startswith('#!'):
+            f.write(lines[0].rstrip()+'\n\n')
+            del lines[0]
+        f.write(_get_license(_LANGS[ext]))
+        for l in lines:
+            f.write(l.rstrip()+'\n')
+    logging.info('added license header to ' + fname)
+
+
+def under_git():
+    return subprocess.run(['git', 'rev-parse', 'HEAD'],
+        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode == 0
+
+
+def git_files():
+    return list(map(os.fsdecode,
+        subprocess.check_output('git ls-tree -r HEAD --name-only -z'.split()).split(b'\0')))
+
+
+def file_generator(path: str):
+    for (dirpath, dirnames, files) in os.walk(path):
+        for file in files:
+            yield os.path.abspath(os.path.join(dirpath, file))
+
+
+def foreach(fn, iterable):
+    for x in iterable:
+        fn(x)
+
+
+def script_name():
+    """:returns: script name with leading paths removed"""
+    return os.path.split(sys.argv[0])[1]
+
+
+def main():
+    logging.basicConfig(
+        format='{}: %(levelname)s %(message)s'.format(script_name()),
+        level=os.environ.get("LOGLEVEL", "INFO"))
+
     parser = argparse.ArgumentParser(
         description='Add or check source license header')
+
     parser.add_argument(
         'action', nargs=1, type=str,
         choices=['add', 'check'], default='add',
@@ -180,19 +229,26 @@ def process_folder(root, action):
         help='Files to add license header to')
 
     args = parser.parse_args()
-    files = list(chain(*args.file))
     action = args.action[0]
-    has_license = True
-    if len(files) > 0:
-        for file in files:
-            has_license = process_file(file, action)
-            if action == 'check' and not has_license:
-                logging.warn("{} doesn't have a license".format(file))
-                has_license = False
-    else:
-        has_license = process_folder(os.path.join(os.path.dirname(__file__), '..'), action)
-    if not has_license:
-        sys.exit(1)
+    files = list(chain(*args.file))
+    if not files and action =='check':
+        if under_git():
+            logging.info("Git detected: Using files under version control")
+            files = git_files()
+        else:
+            logging.info("Using files under mxnet sources root")
+            files = file_generator(get_mxnet_root())
+
+    if action == 'check':
+        if not all(map(file_has_license, files)):
+            return 1
+        else:
+            logging.info("All known and whitelisted files have license")
+            return 0
     else:
-        sys.exit(0)
+        assert action == 'add'
+        foreach(file_add_license, files)
+    return 0
 
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/tools/pip_package/MANIFEST.in b/tools/pip/MANIFEST.in
similarity index 72%
rename from tools/pip_package/MANIFEST.in
rename to tools/pip/MANIFEST.in
index 5c6a72377e9f..8037b6a1059c 100644
--- a/tools/pip_package/MANIFEST.in
+++ b/tools/pip/MANIFEST.in
@@ -16,5 +16,13 @@
 # under the License.
 
 include README
-recursive-include * *.py
-recursive-include * *.so
+include mxnet/COMMIT_HASH
+recursive-include mxnet/tools *
+recursive-include mxnet *.py
+recursive-include mxnet *.so
+recursive-include mxnet *.so.*
+recursive-include mxnet *.dylib
+recursive-include mxnet *_LICENSE
+recursive-include mxnet *.h
+recursive-include mxnet *.cuh
+recursive-include dmlc_tracker *.py
diff --git a/tools/pip/sanity_test.py b/tools/pip/sanity_test.py
new file mode 100644
index 000000000000..dc51e479906b
--- /dev/null
+++ b/tools/pip/sanity_test.py
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+"""Sanity test."""
+from __future__ import print_function
+import sys
+from base64 import b64decode
+
+try:  # smoke test: import the package and decode the embedded base64 payload as an image
+    import mxnet as mx
+    mx.img.imdecode(b64decode('iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==')).asnumpy()
+    print('Test succeeded')
+except Exception:  # let KeyboardInterrupt/SystemExit propagate; they are not test failures
+    import traceback
+    print('Test failed')
+    traceback.print_exc()
+    sys.exit(1)
diff --git a/tools/pip/setup.py b/tools/pip/setup.py
new file mode 100644
index 000000000000..d5db6d87fc1d
--- /dev/null
+++ b/tools/pip/setup.py
@@ -0,0 +1,195 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=invalid-name, exec-used
+"""Setup mxnet package for pip."""
+from __future__ import absolute_import
+from datetime import datetime
+import os
+import sys
+import shutil
+import platform
+
+if platform.system() == 'Linux':
+    sys.argv.append('--universal')
+    sys.argv.append('--plat-name=manylinux1_x86_64')
+
+from setuptools import setup, find_packages
+from setuptools.dist import Distribution
+
+# We cannot import `mxnet/libinfo.py` in setup.py directly since mxnet/__init__.py
+# would be invoked, which introduces dependencies
+CURRENT_DIR = os.path.dirname(__file__)
+libinfo_py = os.path.join(CURRENT_DIR, 'mxnet-build/python/mxnet/libinfo.py')
+libinfo = {'__file__': libinfo_py}
+exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo)
+
+LIB_PATH = libinfo['find_lib_path']()
+__version__ = libinfo['__version__']
+if 'TRAVIS_TAG' not in os.environ or not os.environ['TRAVIS_TAG'].strip():
+    __version__ += 'b{0}'.format(datetime.today().strftime('%Y%m%d'))
+elif 'TRAVIS_TAG' in os.environ and os.environ['TRAVIS_TAG'].startswith('patch-'):
+    __version__ = os.environ['TRAVIS_TAG'].split('-')[1]
+
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):
+        return platform.system() == 'Darwin'
+
+
+DEPENDENCIES = [
+    'numpy<1.15.0,>=1.8.2',
+    'requests>=2.20.0',
+    'graphviz<0.9.0,>=0.8.1'
+]
+
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet'), ignore_errors=True)
+shutil.rmtree(os.path.join(CURRENT_DIR, 'dmlc_tracker'), ignore_errors=True)
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/python/mxnet'),
+                os.path.join(CURRENT_DIR, 'mxnet'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dmlc-core/tracker/dmlc_tracker'),
+                os.path.join(CURRENT_DIR, 'dmlc_tracker'))
+shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'mxnet'))
+
+# copy tools to mxnet package
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet/tools'), ignore_errors=True)
+os.mkdir(os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/launch.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/im2rec.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/kill-mxnet.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/parse_log.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/tools/diagnose.py'), os.path.join(CURRENT_DIR, 'mxnet/tools'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/tools/caffe_converter'), os.path.join(CURRENT_DIR, 'mxnet/tools/caffe_converter'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/tools/bandwidth'), os.path.join(CURRENT_DIR, 'mxnet/tools/bandwidth'))
+
+# copy headers to mxnet package
+shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet/include'), ignore_errors=True)
+os.mkdir(os.path.join(CURRENT_DIR, 'mxnet/include'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/include/mxnet'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/mxnet'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dlpack/include/dlpack'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/dlpack'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/dmlc-core/include/dmlc'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/dmlc'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/mshadow/mshadow'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/mshadow'))
+shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/tvm/nnvm/include/nnvm'),
+                os.path.join(CURRENT_DIR, 'mxnet/include/nnvm'))
+
+package_name = 'mxnet'
+
+variant = os.environ['mxnet_variant'].upper()
+if variant != 'CPU':
+    package_name = 'mxnet_{0}'.format(variant.lower())
+
+with open('doc/PYPI_README.md') as readme_file:
+    long_description = readme_file.read()
+
+with open('doc/{0}_ADDITIONAL.md'.format(variant)) as variant_doc:
+    long_description = long_description + variant_doc.read()
+
+# pypi only supports rst, so use pandoc to convert
+import pypandoc
+if platform.system() == 'Darwin':
+    pypandoc.download_pandoc()
+long_description = pypandoc.convert_text(long_description, 'rst', 'md')
+short_description = 'MXNet is an ultra-scalable deep learning framework.'
+libraries = []
+# Variant prefix -> CUDA toolkit name shown in the package description.
+# CU100 is included because tools/setup_gpu_build_tools.sh accepts cu100
+# variants; without it such a build would report an empty library list.
+CUDA_LIBRARIES = (('CU100', 'CUDA-10.0'), ('CU92', 'CUDA-9.2'), ('CU91', 'CUDA-9.1'),
+                  ('CU90', 'CUDA-9.0'), ('CU80', 'CUDA-8.0'), ('CU75', 'CUDA-7.5'))
+if variant == 'CPU':
+    libraries.append('openblas')
+else:
+    for prefix, cuda_name in CUDA_LIBRARIES:
+        if variant.startswith(prefix):
+            libraries.append(cuda_name)
+            break
+    # Variants ending in MKL additionally list MKLDNN.
+    if variant.endswith('MKL'):
+        libraries.append('MKLDNN')
+
+short_description += ' This version uses {0}.'.format(' and '.join(libraries))
+
+package_data = {'mxnet': [os.path.join('mxnet', os.path.basename(LIB_PATH[0]))],
+                'dmlc_tracker': []}
+if variant.endswith('MKL'):
+    if platform.system() == 'Darwin':
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmklml.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libiomp5.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.0.dylib'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libmklml.dylib')
+        package_data['mxnet'].append('mxnet/libiomp5.dylib')
+        package_data['mxnet'].append('mxnet/libmkldnn.0.dylib')
+    else:
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmklml_intel.so'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libiomp5.so'), os.path.join(CURRENT_DIR, 'mxnet'))
+        shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libmkldnn.so.0'), os.path.join(CURRENT_DIR, 'mxnet'))
+        package_data['mxnet'].append('mxnet/libmklml_intel.so')
+        package_data['mxnet'].append('mxnet/libiomp5.so')
+        package_data['mxnet'].append('mxnet/libmkldnn.so.0')
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), '../MKLML_LICENSE'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/MKLML_LICENSE')
+if platform.system() == 'Linux':
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libgfortran.so.3'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/libgfortran.so.3')
+    shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libquadmath.so.0'), os.path.join(CURRENT_DIR, 'mxnet'))
+    package_data['mxnet'].append('mxnet/libquadmath.so.0')
+
+from mxnet.base import _generate_op_module_signature
+from mxnet.ndarray.register import _generate_ndarray_function_code
+from mxnet.symbol.register import _generate_symbol_function_code
+_generate_op_module_signature('mxnet', 'symbol', _generate_symbol_function_code)
+_generate_op_module_signature('mxnet', 'ndarray', _generate_ndarray_function_code)
+
+setup(name=package_name,
+      version=__version__,
+      long_description=long_description,
+      description=short_description,
+      zip_safe=False,
+      packages=find_packages(),
+      package_data=package_data,
+      include_package_data=True,
+      install_requires=DEPENDENCIES,
+      distclass=BinaryDistribution,
+      license='Apache 2.0',
+      classifiers=[ # https://pypi.org/pypi?%3Aaction=list_classifiers
+          'Development Status :: 5 - Production/Stable',
+          'Intended Audience :: Developers',
+          'Intended Audience :: Education',
+          'Intended Audience :: Science/Research',
+          'License :: OSI Approved :: Apache Software License',
+          'Programming Language :: C++',
+          'Programming Language :: Cython',
+          'Programming Language :: Other',  # R, Scala
+          'Programming Language :: Perl',
+          'Programming Language :: Python',
+          'Programming Language :: Python :: 2.7',
+          'Programming Language :: Python :: 3.4',
+          'Programming Language :: Python :: 3.5',
+          'Programming Language :: Python :: 3.6',
+          'Programming Language :: Python :: Implementation :: CPython',
+          'Topic :: Scientific/Engineering',
+          'Topic :: Scientific/Engineering :: Artificial Intelligence',
+          'Topic :: Scientific/Engineering :: Mathematics',
+          'Topic :: Software Development',
+          'Topic :: Software Development :: Libraries',
+          'Topic :: Software Development :: Libraries :: Python Modules',
+      ],
+      url='/~https://github.com/apache/incubator-mxnet')
diff --git a/tools/pip_package/README.md b/tools/pip_package/README.md
deleted file mode 100644
index f289c98b7155..000000000000
--- a/tools/pip_package/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-MXNet Python Package
-====================
-MXNet is a deep learning framework designed for both *efficiency* and *flexibility*.
-It allows you to mix the flavours of deep learning programs together to maximize the efficiency and your productivity.
-
-
-Installation
-------------
-To install, check [Build Instruction](http://mxnet.io/get_started/setup.html)
diff --git a/tools/pip_package/make_pip_package.sh b/tools/pip_package/make_pip_package.sh
deleted file mode 100755
index 46b4938b0785..000000000000
--- a/tools/pip_package/make_pip_package.sh
+++ /dev/null
@@ -1,179 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# Assuming the script is run at mxnet/tools/pip_package
-# This script builds from scratch the dependencies of mxnet into static
-# librareis and statically links them to produce a (mostly) standalone
-# libmxnet.so, then packages it into the python wheel.
-# It assumes the build environment to be a sandbox that doesn't have the .so
-# objects for the dependencies, i.e. zlib, openblas, libjpeg, libpng, libtiff
-# and opencv.
-
-# Install necessary build tools
-if [ -n "$(command -v apt-get)" ]; then
-    sudo apt-get update;
-    sudo apt-get install -y build-essential git python-pip zip pkg-config cmake
-elif [ -n "$(command -v yum)" ]; then
-    sudo yum install -y cmake
-    sudo yum groupinstall -y "Development Tools"
-    sudo yum install -y python27 python27-setuptools python27-tools python-pip
-else
-    echo "Need a package manager to install build tools, e.g. apt/yum"
-    exit 1
-fi
-sudo pip install -U pip setuptools wheel
-
-# Set up path as temporary working directory
-DEPS_PATH=$PWD/../../deps
-mkdir $DEPS_PATH
-
-# Dependencies can be updated here. Be sure to verify the download link before
-# changing. The dependencies are:
-ZLIB_VERSION=1.2.6
-OPENBLAS_VERSION=0.2.19
-JPEG_VERSION=8.4.0
-PNG_VERSION=1.5.10
-TIFF_VERSION=3.8.2
-OPENCV_VERSION=2.4.13
-
-# Setup path to dependencies
-export PKG_CONFIG_PATH=$DEPS_PATH/lib/pkgconfig:$DEPS_PATH/lib64/pkgconfig:$PKG_CONFIG_PATH
-export CPATH=$DEPS_PATH/include:$CPATH
-
-# Position Independent code must be turned on for statically linking .a
-export CC="gcc -fPIC"
-export CXX="g++ -fPIC"
-
-# Download and build zlib
-curl -L /~https://github.com/LuaDist/zlib/archive/$ZLIB_VERSION.zip -o $DEPS_PATH/zlib.zip
-unzip $DEPS_PATH/zlib.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/zlib-$ZLIB_VERSION/build
-cd $DEPS_PATH/zlib-$ZLIB_VERSION/build
-cmake -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \
-      -D BUILD_SHARED_LIBS=OFF ..
-make -j$(nproc)
-make install
-cd -
-
-# download and build openblas
-curl -L /~https://github.com/xianyi/OpenBLAS/archive/v$OPENBLAS_VERSION.zip -o $DEPS_PATH/openblas.zip
-unzip $DEPS_PATH/openblas.zip -d $DEPS_PATH
-cd $DEPS_PATH/OpenBLAS-$OPENBLAS_VERSION
-make FC=gfortran -j $(($(nproc) + 1))
-make PREFIX=$DEPS_PATH install
-cd -
-ln -s $DEPS_PATH/lib/libopenblas_haswellp-r0.2.19.a $DEPS_PATH/lib/libcblas.a
-
-# download and build libjpeg
-curl -L /~https://github.com/LuaDist/libjpeg/archive/$JPEG_VERSION.zip -o $DEPS_PATH/libjpeg.zip
-unzip $DEPS_PATH/libjpeg.zip -d $DEPS_PATH
-cd $DEPS_PATH/libjpeg-$JPEG_VERSION
-./configure --disable-shared --prefix=$DEPS_PATH
-make -j$(nproc)
-make test
-make install
-cd -
-
-# download and build libpng
-curl -L /~https://github.com/LuaDist/libpng/archive/$PNG_VERSION.zip -o $DEPS_PATH/libpng.zip
-unzip $DEPS_PATH/libpng.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/libpng-$PNG_VERSION/build
-cd $DEPS_PATH/libpng-$PNG_VERSION/build
-cmake -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \
-      -D PNG_CONFIGURE_LIBPNG=-fPIC \
-      -D BUILD_SHARED_LIBS=OFF ..
-make -j$(nproc)
-make install
-cd -
-
-# download and build libtiff
-curl -L /~https://github.com/LuaDist/libtiff/archive/$TIFF_VERSION.zip -o $DEPS_PATH/libtiff.zip
-unzip $DEPS_PATH/libtiff.zip -d $DEPS_PATH
-cd $DEPS_PATH/libtiff-$TIFF_VERSION
-./configure --disable-shared --prefix=$DEPS_PATH
-make -j$(nproc)
-make install
-cd -
-
-# download and build opencv since we need the static library
-curl -L /~https://github.com/Itseez/opencv/archive/$OPENCV_VERSION.zip -o $DEPS_PATH/opencv.zip
-unzip $DEPS_PATH/opencv.zip -d $DEPS_PATH
-mkdir $DEPS_PATH/opencv-$OPENCV_VERSION/build
-cd $DEPS_PATH/opencv-$OPENCV_VERSION/build
-cmake -D WITH_1394=OFF \
-      -D WITH_AVFOUNDATION=OFF \
-      -D WITH_CUDA=OFF \
-      -D WITH_VTK=OFF \
-      -D WITH_CUFFT=OFF \
-      -D WITH_CUBLAS=OFF \
-      -D WITH_NVCUVID=OFF \
-      -D WITH_EIGEN=ON \
-      -D WITH_VFW=OFF \
-      -D WITH_FFMPEG=OFF \
-      -D WITH_GSTREAMER=OFF \
-      -D WITH_GTK=OFF \
-      -D WITH_JASPER=OFF \
-      -D WITH_JPEG=ON \
-      -D WITH_PNG=ON \
-      -D WITH_QUICKTIME=OFF \
-      -D WITH_TBB=ON \
-      -D WITH_TIFF=OFF \
-      -D WITH_V4L=OFF \
-      -D WITH_LIBV4L=OFF \
-      -D WITH_DSHOW=OFF \
-      -D WITH_MSMF=OFF \
-      -D WITH_OPENCL=OFF \
-      -D WITH_OPENCLAMDFFT=OFF \
-      -D WITH_OPENCLAMDBLAS=OFF \
-      -D BUILD_SHARED_LIBS=OFF \
-      -D BUILD_opencv_apps=OFF \
-      -D BUILD_opencv_gpu=OFF \
-      -D BUILD_opencv_video=OFF \
-      -D BUILD_opencv_contrib=OFF \
-      -D BUILD_opencv_nonfree=OFF \
-      -D BUILD_opencv_flann=OFF \
-      -D BUILD_opencv_features2d=OFF \
-      -D BUILD_opencv_calib3d=OFF \
-      -D BUILD_opencv_objdetect=OFF \
-      -D BUILD_opencv_ml=OFF \
-      -D BUILD_opencv_photo=OFF \
-      -D BUILD_DOCS=OFF \
-      -D BUILD_PACKAGE=OFF \
-      -D CMAKE_BUILD_TYPE=RELEASE \
-      -D CMAKE_INSTALL_PREFIX=$DEPS_PATH ..
-make -j $(nproc)
-make install # user will always have access to home, so no sudo needed
-cd -
-
-# Although .so building is explicitly turned off for most libraries, sometimes
-# they still get created. So, remove them just to make sure they don't
-# interfere, or otherwise we might get libmxnet.so that is not self-contained.
-rm $DEPS_PATH/{lib,lib64}/*.{so,so.0}
-
-# Go to the parent path and build mxnet
-cd ../../
-cp make/pip_$(uname | tr '[:upper:]' '[:lower:]')_cpu.mk config.mk
-make -j $(nproc)
-
-# Generate wheel. The output is in the mxnet/tools/pip_package/dist path.
-cd tools/pip_package
-python setup.py bdist_wheel
diff --git a/tools/pip_package/setup.py b/tools/pip_package/setup.py
deleted file mode 100644
index e4bf48236bde..000000000000
--- a/tools/pip_package/setup.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=invalid-name, exec-used
-"""Setup mxnet package."""
-from __future__ import absolute_import
-import os
-import shutil
-
-from setuptools import setup, find_packages
-from setuptools.dist import Distribution
-
-# We can not import `mxnet.info.py` in setup.py directly since mxnet/__init__.py
-# Will be invoked which introduces dependences
-CURRENT_DIR = os.path.dirname(__file__)
-libinfo_py = os.path.join(CURRENT_DIR, '../../python/mxnet/libinfo.py')
-libinfo = {'__file__': libinfo_py}
-exec(compile(open(libinfo_py, "rb").read(), libinfo_py, 'exec'), libinfo, libinfo)
-
-LIB_PATH = libinfo['find_lib_path']()
-__version__ = libinfo['__version__']
-
-class BinaryDistribution(Distribution):
-    def has_ext_modules(self):
-        return True
-
-
-DEPENDENCIES = [
-    'numpy',
-]
-
-shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet'), ignore_errors=True)
-shutil.copytree(os.path.join(CURRENT_DIR, '../../python/mxnet'),
-                os.path.join(CURRENT_DIR, 'mxnet'))
-shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'mxnet'))
-
-setup(name='mxnet',
-      version=__version__,
-      description=open(os.path.join(CURRENT_DIR, 'README.md')).read(),
-      zip_safe=False,
-      packages=find_packages(),
-      package_data={'mxnet': [os.path.join('mxnet', os.path.basename(LIB_PATH[0]))]},
-      include_package_data=True,
-      install_requires=DEPENDENCIES,
-      distclass=BinaryDistribution,
-      url='/~https://github.com/dmlc/mxnet')
diff --git a/tools/setup_gpu_build_tools.sh b/tools/setup_gpu_build_tools.sh
new file mode 100755
index 000000000000..44b44c574114
--- /dev/null
+++ b/tools/setup_gpu_build_tools.sh
@@ -0,0 +1,254 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script installs the tools and libraries for CUDA GPU on Ubuntu.
+# Usage: VARIANT=cu92mkl; DEPS_PATH=$HOME; setup_gpu_build_tools.sh $VARIANT $DEPS_PATH;
+# It installs the tools into DEPS_PATH as specified by the second argument, and will set
+# the following environment variables:
+# PATH, CPLUS_INCLUDE_PATH, C_INCLUDE_PATH, LIBRARY_PATH, LD_LIBRARY_PATH, NVCC
+
+VARIANT=$1
+DEPS_PATH=$2
+
+>&2 echo "Setting CUDA versions for $VARIANT"
+if [[ $VARIANT == cu100* ]]; then  # select the pinned package versions by variant prefix
+    CUDA_VERSION='10.0.130-1'
+    CUDA_PATCH_VERSION='10.0.130-1'
+    LIBCUDA_VERSION='410.48-0ubuntu1'
+    LIBCUDNN_VERSION='7.3.1.20-1+cuda10.0'
+    LIBNCCL_VERSION='2.3.4-1+cuda9.2'  # NOTE(review): suffix is cuda9.2 in the cu100 branch - confirm this is intended
+elif [[ $VARIANT == cu92* ]]; then
+    CUDA_VERSION='9.2.148-1'
+    CUDA_PATCH_VERSION='9.2.148.1-1'
+    LIBCUDA_VERSION='396.44-0ubuntu1'
+    LIBCUDNN_VERSION='7.3.1.20-1+cuda9.2'
+    LIBNCCL_VERSION='2.3.4-1+cuda9.2'
+elif [[ $VARIANT == cu91* ]]; then
+    CUDA_VERSION='9.1.85-1'
+    CUDA_PATCH_VERSION='9.1.85.3-1'
+    LIBCUDA_VERSION='396.44-0ubuntu1'
+    LIBCUDNN_VERSION='7.1.3.16-1+cuda9.1'
+    LIBNCCL_VERSION='2.2.12-1+cuda9.1'
+elif [[ $VARIANT == cu90* ]]; then
+    CUDA_VERSION='9.0.176-1'
+    CUDA_PATCH_VERSION='9.0.176.3-1'
+    LIBCUDA_VERSION='384.145-0ubuntu1'
+    LIBCUDNN_VERSION='7.3.1.20-1+cuda9.0'
+    LIBNCCL_VERSION='2.3.4-1+cuda9.0'
+elif [[ $VARIANT == cu80* ]]; then
+    CUDA_VERSION='8.0.61-1'
+    CUDA_PATCH_VERSION='8.0.61.2-1'
+    LIBCUDA_VERSION='375.88-0ubuntu1'
+    LIBCUDNN_VERSION='7.2.1.38-1+cuda8.0'
+    LIBNCCL_VERSION='2.3.4-1+cuda8.0'
+elif [[ $VARIANT == cu75* ]]; then
+    CUDA_VERSION='7.5-18'
+    CUDA_PATCH_VERSION='7.5-18'
+    LIBCUDA_VERSION='375.88-0ubuntu1'
+    LIBCUDNN_VERSION='6.0.21-1+cuda7.5'
+    LIBNCCL_VERSION=''  # left empty for cu75
+fi
+if [[ $VARIANT == cu* ]]; then  # derive version fragments, then export tool and library paths under DEPS_PATH
+    CUDA_MAJOR_VERSION=$(echo $CUDA_VERSION | tr '-' '.' | cut -d. -f1,2)  # e.g. 9.2.148-1 -> 9.2
+    CUDA_MAJOR_DASH=$(echo $CUDA_VERSION | tr '-' '.' | cut -d. -f1,2 | tr '.' '-')  # e.g. 9.2.148-1 -> 9-2
+    NVIDIA_MAJOR_VERSION=$(echo $LIBCUDA_VERSION | cut -d. -f1)
+    LIBCUDA_MAJOR=$(echo $LIBCUDA_VERSION | cut -d. -f1)  # same value as NVIDIA_MAJOR_VERSION
+    LIBCUDNN_MAJOR=$(echo $LIBCUDNN_VERSION | cut -d. -f1)
+    os_name=$(cat /etc/*release | grep '^ID=' | sed 's/^.*=//g')
+    os_version=$(cat /etc/*release | grep VERSION_ID | sed 's/^.*"\([0-9]*\)\.\([0-9]*\)"/\1\2/g')
+    os_id="${os_name}${os_version}"
+    if [[ $CUDA_MAJOR_DASH == 9-* ]] || [[ $CUDA_MAJOR_DASH == 10-* ]]; then  # CUDA 9.x/10.x: override the detected os_id
+        os_id="ubuntu1604"
+    fi
+    export PATH=/usr/lib/binutils-2.26/bin/:${PATH}:$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/bin
+    export CPLUS_INCLUDE_PATH=${CPLUS_INCLUDE_PATH}:$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/include
+    export C_INCLUDE_PATH=${C_INCLUDE_PATH}:$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/include
+    export LIBRARY_PATH=${LIBRARY_PATH}:$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/lib64:$DEPS_PATH/usr/lib/x86_64-linux-gnu:$DEPS_PATH/usr/lib/nvidia-$NVIDIA_MAJOR_VERSION
+    export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/lib64:$DEPS_PATH/usr/lib/x86_64-linux-gnu:$DEPS_PATH/usr/lib/nvidia-$NVIDIA_MAJOR_VERSION
+    export NVCC=$DEPS_PATH/usr/local/cuda-$CUDA_MAJOR_VERSION/bin/nvcc
+fi
+
+# list of debs to download from nvidia
+if [[ $VARIANT == cu100* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvcc-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+      "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \
+    )
+elif [[ $VARIANT == cu92* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvcc-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+      "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \
+    )
+elif [[ $VARIANT == cu91* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvcc-${CUDA_MAJOR_DASH}_9.1.85.2-1_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+      "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \
+    )
+elif [[ $VARIANT == cu90* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_9.0.176.4-1_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_9.0.176.4-1_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+      "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \
+    )
+elif [[ $VARIANT == cu80* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+      "libnccl-dev_${LIBNCCL_VERSION}_amd64.deb" \
+    )
+elif [[ $VARIANT == cu75* ]]; then
+    cuda_files=( \
+      "cuda-core-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cublas-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cublas-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-cudart-dev-${CUDA_MAJOR_DASH}_${CUDA_PATCH_VERSION}_amd64.deb" \
+      "cuda-curand-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-curand-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cufft-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-nvrtc-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-cusolver-dev-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "cuda-misc-headers-${CUDA_MAJOR_DASH}_${CUDA_VERSION}_amd64.deb" \
+      "libcuda1-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+      "nvidia-${LIBCUDA_MAJOR}_${LIBCUDA_VERSION}_amd64.deb" \
+    )
+    ml_files=( \
+      "libcudnn${LIBCUDNN_MAJOR}-dev_${LIBCUDNN_VERSION}_amd64.deb" \
+    )
+fi
+
+
+if [[ ! -d "$DEPS_PATH/usr/local/cuda-${CUDA_MAJOR_VERSION}" ]]; then  # nothing to do once this toolkit version is unpacked
+    prefix=$DEPS_PATH
+    # Fetch each .deb over HTTPS and unpack it under ${prefix}; -f makes curl fail on an HTTP error instead of saving the error page.
+    for item in "${cuda_files[@]}"; do
+        echo "Installing $item"
+        curl -fsSL "https://developer.download.nvidia.com/compute/cuda/repos/${os_id}/x86_64/${item}" -o package.deb
+        dpkg -X package.deb "${prefix}"
+        rm package.deb
+    done
+    for item in "${ml_files[@]}"; do
+        echo "Installing $item"
+        curl -fsSL "https://developer.download.nvidia.com/compute/machine-learning/repos/${os_id}/x86_64/${item}" -o package.deb
+        dpkg -X package.deb "${prefix}"
+        rm package.deb
+    done
+    # Publish unversioned header/library names; ln -sf so a re-run into a prefix that already holds the link does not fail.
+    cp "${prefix}/usr/include/x86_64-linux-gnu/cudnn_v${LIBCUDNN_MAJOR}.h" "${prefix}/include/cudnn.h"
+    ln -sf "libcudnn_static_v${LIBCUDNN_MAJOR}.a" "${prefix}/usr/lib/x86_64-linux-gnu/libcudnn.a"
+    cp "${prefix}/usr/local/cuda-${CUDA_MAJOR_VERSION}/lib64/"*.a "${prefix}/lib/"
+    if [[ -f "${prefix}/usr/include/nccl.h" ]]; then  # the cu75 package list carries no NCCL .deb
+        cp "${prefix}/usr/include/nccl.h" "${prefix}/include/nccl.h"
+        ln -sf libnccl_static.a "${prefix}/usr/lib/x86_64-linux-gnu/libnccl.a"
+    fi
+fi