diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 55f3c7bc1d87..3ffea8694adf 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355 +Subproject commit 3ffea8694adf9c0363f9abbf162dc0e4a45b22c5 diff --git a/3rdparty/googletest b/3rdparty/googletest index ec44c6c1675c..eb9225ce361a 160000 --- a/3rdparty/googletest +++ b/3rdparty/googletest @@ -1 +1 @@ -Subproject commit ec44c6c1675c25b9827aacd08c02433cccde7780 +Subproject commit eb9225ce361affe561592e0912320b9db84985d0 diff --git a/3rdparty/mshadow b/3rdparty/mshadow index 95ebe0f109ae..6e94643bdf1d 160000 --- a/3rdparty/mshadow +++ b/3rdparty/mshadow @@ -1 +1 @@ -Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55 +Subproject commit 6e94643bdf1d51a505b147f28c358fb71070b8fd diff --git a/3rdparty/sparse-matrix/Makefile b/3rdparty/sparse-matrix/Makefile new file mode 100644 index 000000000000..214312f6586c --- /dev/null +++ b/3rdparty/sparse-matrix/Makefile @@ -0,0 +1,21 @@ +CC = g++ +C = gcc +MKLROOT = /opt/intel/mkl + +ifneq ($(USE_INTEL_PATH),) + MKLROOT = $(USE_INTEL_PATH)/mkl +endif + +CFLAGS = -fpic -O2 -I/opt/intel/mkl/include -c -Wall -Werror -DMKL_ILP64 -m64 -std=c++11 +LDFLAGS = -Wl,--start-group -L${MKLROOT}/../compiler/lib/intel64 ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl + +default: libsparse_matrix.so + +libsparse_matrix.so: sparse_matrix.o + $(CC) -shared -o libsparse_matrix.so sparse_matrix.o $(LDFLAGS) + +sparse_matrix.o: sparse_matrix.cc sparse_matrix.h + $(CC) $(CFLAGS) sparse_matrix.cc + +clean: + $(RM) libsparse_matrix.so *.o *~ diff --git a/3rdparty/sparse-matrix/sparse_matrix.cc b/3rdparty/sparse-matrix/sparse_matrix.cc new file mode 100644 index 000000000000..fa362f0f8a18 --- /dev/null +++ b/3rdparty/sparse-matrix/sparse_matrix.cc @@ -0,0 +1,45 @@ +#include <iostream> +#include <fstream> +#include <mkl_spblas.h> +#include <mkl.h> +#include "sparse_matrix.h" + + + +bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, + float* values, float* X, float* y, + int rows, int cols, int X_columns) +{ + + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_status_t status; + sparse_matrix_t A = NULL; + sparse_layout_t layout = SPARSE_LAYOUT_ROW_MAJOR; + float one, zero; + one = (float)1.0; + zero = (float)0.0; + + MKL_INT* rows_end = rows_start + 1; + status = mkl_sparse_s_create_csr(&A, indexing, rows, cols, rows_start, rows_end, col_indx, values); + + if (status != SPARSE_STATUS_SUCCESS) + { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + return false; + } + sparse_operation_t operation = SPARSE_OPERATION_NON_TRANSPOSE; + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + + status = mkl_sparse_s_mm(operation, one, A, descrA, layout, X, X_columns, X_columns, zero, y, X_columns); + if (status != SPARSE_STATUS_SUCCESS) + { + std::cout << "mkl_sparse_s_mm status :" << status << std::endl; + return false; + } + + mkl_sparse_destroy(A); + + return true; + +} diff --git a/3rdparty/sparse-matrix/sparse_matrix.h b/3rdparty/sparse-matrix/sparse_matrix.h new file mode 100644 index 000000000000..93054a80b374 --- /dev/null +++ b/3rdparty/sparse-matrix/sparse_matrix.h @@ -0,0 +1,48 @@ +#ifndef MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ +#define MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ + + +#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER) +#define SP_INT64 __int64 +#define SP_UINT64 unsigned __int64
+#else +#define SP_INT64 long long int +#define SP_UINT64 unsigned long long int +#endif + + +#if defined _WIN32 || defined __CYGWIN__ + #ifdef BUILDING_DLL + #ifdef __GNUC__ + #define SPM_API_PUBLIC __attribute__ ((dllexport)) + #else + #define SPM_API_PUBLIC __declspec(dllexport) // Note: gcc also seems to support this syntax. + #endif + #else + #ifdef __GNUC__ + #define SPM_API_PUBLIC __attribute__ ((dllimport)) + #else + #define SPM_API_PUBLIC __declspec(dllimport) // Note: gcc also seems to support this syntax. + #endif + #endif + #define SPM_API_LOCAL +#else + #if __GNUC__ >= 4 + #define SPM_API_PUBLIC __attribute__ ((visibility ("default"))) + #define SPM_API_LOCAL __attribute__ ((visibility ("hidden"))) + #else + #define SPM_API_PUBLIC + #define SPM_API_LOCAL + #endif +#endif + + + +extern "C" +{ + extern SPM_API_PUBLIC bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx, + float* values, float* X, float* y, int rows, int cols, int X_columns); + +} + +#endif //MXNET_OPERATOR_SPARSE_MATRIX_INL_H_ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e582bae5e95..d014e96ff77f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,10 +29,10 @@ mxnet_option(USE_SSE "Build with x86 SSE instruction support" ON IF mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON mxnet_option(USE_LAPACK "Build with lapack support" ON) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE)) +mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) ) mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) -mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) +mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" ON) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) @@ -50,6 +50,7 @@ mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." OFF) mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers."
OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -120,6 +121,7 @@ else(MSVC) # For cross compilation, we can't rely on the compiler which accepts the flag, but mshadow will # add platform specific includes not available in other arches if(USE_SSE) + check_cxx_compiler_flag("-msse3" SUPPORT_MSSE3) check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) else() set(SUPPORT_MSSE2 FALSE) @@ -133,14 +135,15 @@ else(MSVC) set(SUPPORT_F16C FALSE) endif() if(SUPPORT_F16C) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") + message(STATUS "F16C enabled") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c") else() add_definitions(-DMSHADOW_USE_F16C=0) endif() set(CMAKE_POSITION_INDEPENDENT_CODE ON) - set(CMAKE_C_FLAGS "-Wall -Wno-unknown-pragmas -Wno-sign-compare") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas -Wno-sign-compare") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$") - set(CMAKE_C_FLAGS "-Wno-braced-scalar-init") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-braced-scalar-init") endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O0 -g") @@ -151,7 +154,9 @@ else(MSVC) add_definitions(-DNDEBUG=1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") endif() - if(SUPPORT_MSSE2) + if(SUPPORT_MSSE3) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + elseif(SUPPORT_MSSE2) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") endif() set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS}) @@ -295,6 +300,13 @@ else() add_definitions(-DMXNET_USE_NCCL=0) endif() +if (USE_INT64_TENSOR_SIZE) + message(STATUS "Using 64-bit integer for tensor size") + add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1) +else() + add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=0) +endif() + include(cmake/ChooseBlas.cmake) if(USE_CUDA AND FIRST_CUDA) include(3rdparty/mshadow/cmake/Utils.cmake) @@ -395,8 +407,8 @@ if(USE_OPENCV) endif() include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS}) - message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}") message(STATUS "OpenCV ${OpenCV_VERSION} found (${OpenCV_CONFIG_PATH})") + message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}") add_definitions(-DMXNET_USE_OPENCV=1) else(USE_OPENCV) message(STATUS "OpenCV Disabled") diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6c3300b445b0..53c0334f832c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -237,6 +237,8 @@ List of Contributors * [Zhiyuan Huang](/~https://github.com/huangzhiyuan) * [Zak Jost](/~https://github.com/zjost) * [Nick Guletskii](/~https://github.com/nickguletskii) +* [Shoubhik Bhattacharya](/~https://github.com/shoubhik) +* [Zach Kimberg](/~https://github.com/zachgk) Label Bot --------- diff --git a/LICENSE b/LICENSE index 72fe08f4e316..0673e0e40f7b 100644 --- a/LICENSE +++ b/LICENSE @@ -207,7 +207,7 @@ The Apache MXNET (incubating) project contains subcomponents with separate copyright notices and license terms. Your use of the source code for the these subcomponents is subject to the terms and conditions of the following - licenses. + licenses (full text in Appendix). 
======================================================================================= Apache-2.0 licenses @@ -706,3 +706,43 @@ FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + + ======================================================================================= + Appendix + ======================================================================================= + + MIT License + + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ======================================================================================= + + 3-Clause BSD License + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + ======================================================================================= + + 2-Clause BSD License + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + + 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/Makefile b/Makefile index 43d212e5f938..29cfd573665c 100644 --- a/Makefile +++ b/Makefile @@ -144,6 +144,7 @@ ifeq ($(USE_MKLDNN), 1) LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif + # setup opencv ifeq ($(USE_OPENCV), 1) CFLAGS += -DMXNET_USE_OPENCV=1 @@ -188,6 +189,11 @@ ifeq ($(USE_OPERATOR_TUNING), 1) CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1 endif +ifeq ($(USE_INT64_TENSOR_SIZE), 1) + CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=1 +else + CFLAGS += -DMSHADOW_INT64_TENSOR_SIZE=0 +endif # verify existence of separate lapack library when using blas/openblas/atlas # switch off lapack support in case it can't be found # issue covered with this @@ -410,6 +416,14 @@ ifeq ($(USE_DIST_KVSTORE), 1) LDFLAGS += $(PS_LDFLAGS_A) endif +#sparse-matrix +ifeq ($(USE_BLAS), mkl) + SPARSE_MATRIX_DIR = $(ROOTDIR)/3rdparty/sparse-matrix + LIB_DEP += $(SPARSE_MATRIX_DIR)/libsparse_matrix.so + CFLAGS += -I$(SPARSE_MATRIX_DIR) + LDFLAGS += -L$(SPARSE_MATRIX_DIR) -lsparse_matrix -Wl,-rpath,'$${ORIGIN}' +endif + .PHONY: clean all extra-packages test lint docs clean_all rcpplint rcppexport roxygen\ cython2 cython3 cython cyclean @@ -547,11 +561,30 @@ ifeq ($(UNAME_S), Darwin) endif endif +ifeq ($(USE_BLAS), mkl) +ifeq ($(UNAME_S), Darwin) + install_name_tool -change '@rpath/libsparse_matrix.dylib' '@loader_path/libsparse_matrix.dylib' $@ +endif +endif + $(PS_PATH)/build/libps.a: PSLITE PSLITE: $(MAKE) CXX="$(CXX)" DEPS_PATH="$(DEPS_PATH)" -C $(PS_PATH) ps +ifeq ($(USE_BLAS), mkl) +$(SPARSE_MATRIX_DIR)/libsparse_matrix.so: SPARSE_MATRIX + +SPARSE_MATRIX: +ifeq ($(USE_INTEL_PATH), NONE) + $(MAKE) -C $(SPARSE_MATRIX_DIR) +else + $(MAKE) -C $(SPARSE_MATRIX_DIR) USE_INTEL_PATH=$(USE_INTEL_PATH) +endif + mkdir -p $(ROOTDIR)/lib + cp $(SPARSE_MATRIX_DIR)/libsparse_matrix.so $(ROOTDIR)/lib/ +endif + $(DMLC_CORE)/libdmlc.a: DMLCCORE DMLCCORE: @@ -628,6 +661,10 @@ rpkg: cp -rf lib/libmklml_intel.so R-package/inst/libs; \ fi + if [ -e "lib/libsparse_matrix.so" ]; then \ + cp -rf lib/libsparse_matrix.so R-package/inst/libs; \ + fi + mkdir -p R-package/inst/include cp -rl include/* R-package/inst/include Rscript -e "if(!require(devtools)){install.packages('devtools', repo = 'https://cloud.r-project.org/')}" @@ -673,6 +710,7 @@ clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - + cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - $(RM) -r 
$(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) @@ -683,6 +721,7 @@ clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN) (cd scala-package && mvn clean) || true cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - + cd $(SPARSE_MATRIX_DIR); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - cd $(AMALGAMATION_PATH); $(MAKE) clean; cd - endif diff --git a/R-package/src/ndarray.cc b/R-package/src/ndarray.cc index 94d24f3fb46b..0409d3ba8887 100644 --- a/R-package/src/ndarray.cc +++ b/R-package/src/ndarray.cc @@ -179,9 +179,9 @@ Rcpp::RObject NDArrayPacker::CreateNDArrayPacker() { } Rcpp::Dimension NDArray::dim() const { - mx_uint ndim; - const mx_uint *pshape; - MX_CALL(MXNDArrayGetShape( + int ndim; + const int *pshape; + MX_CALL(MXNDArrayGetShapeEx( ptr_->handle, &ndim, &pshape)); Rcpp::IntegerVector dat(pshape, pshape + ndim); std::reverse(dat.begin(), dat.end()); diff --git a/R-package/src/symbol.cc b/R-package/src/symbol.cc index 031c9a254019..317e82568012 100644 --- a/R-package/src/symbol.cc +++ b/R-package/src/symbol.cc @@ -167,8 +167,8 @@ Symbol::RObjectType Symbol::GetOutput(mx_uint index) const { // helper function to convert shape into Rcpp vector inline Rcpp::List BuildShapeData(mx_uint shape_size, - const mx_uint *shape_ndim, - const mx_uint **shape_data, + const int *shape_ndim, + const int **shape_data, const std::vector<std::string> &names) { Rcpp::List ret(shape_size); for (mx_uint i = 0; i < shape_size; ++i) { @@ -185,7 +185,7 @@ SEXP Symbol::InferShape(const Rcpp::List& kwargs) const { << "Need to pass parameters in key=value style.\n"; std::vector<std::string> keys = kwargs.names(); std::vector<mx_uint> arg_ind_ptr(1, 0); - std::vector<mx_uint> arg_shape_data; + std::vector<int> arg_shape_data; for (size_t i = 0; i < kwargs.size(); ++i) { RCHECK(keys[i].length() != 0) @@ -197,17 +197,17 @@ std::vector<const char*> c_keys = CKeys(keys); mx_uint in_shape_size; - const mx_uint *in_shape_ndim; - const mx_uint **in_shape_data; + const int *in_shape_ndim; + const int **in_shape_data; mx_uint out_shape_size; - const mx_uint *out_shape_ndim; - const mx_uint **out_shape_data; + const int *out_shape_ndim; + const int **out_shape_data; mx_uint aux_shape_size; - const mx_uint *aux_shape_ndim; - const mx_uint **aux_shape_data; + const int *aux_shape_ndim; + const int **aux_shape_data; int complete; - MX_CALL(MXSymbolInferShape( + MX_CALL(MXSymbolInferShapeEx( handle_, static_cast<mx_uint>(kwargs.size()), dmlc::BeginPtr(c_keys), dmlc::BeginPtr(arg_ind_ptr), dmlc::BeginPtr(arg_shape_data), &in_shape_size, &in_shape_ndim, &in_shape_data, diff --git a/README.md b/README.md index cab2c6deebc7..3eea2e78fa54 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ Features * Mix and match imperative and symbolic programming to maximize flexibility and efficiency * Lightweight, memory efficient and portable to smart devices * Scales up to multi GPUs and distributed setting with auto parallelism -* Support for Python, Scala, C++, Java, Clojure, R and Julia +* Support for [Python](/~https://github.com/apache/incubator-mxnet/tree/master/python), [Scala](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package), [C++](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package), [Java](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package), [Clojure](/~https://github.com/apache/incubator-mxnet/tree/master/contrib/clojure-package),
[R](/~https://github.com/apache/incubator-mxnet/tree/master/R-package), [Go](/~https://github.com/jdeng/gomxnet/), [Javascript](/~https://github.com/dmlc/mxnet.js/), [Perl](/~https://github.com/apache/incubator-mxnet/tree/master/perl-package), [Matlab](/~https://github.com/apache/incubator-mxnet/tree/master/matlab), and [Julia](/~https://github.com/apache/incubator-mxnet/tree/master/julia) * Cloud-friendly and directly compatible with S3, HDFS, and Azure License diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 0a4be02b8ff9..e47ab6b0e22e 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -46,6 +46,14 @@ if platform.system() != 'Windows': blacklist.append('windows.h') blacklist.append('process.h') + blacklist.append('Shlwapi.h') + +if platform.system() == 'Windows': + blacklist.append('unistd.h') + +if 'freebsd' not in sys.platform: + blacklist.append('sys/endian.h') + def get_sources(def_file): @@ -94,6 +102,7 @@ def find_source(name, start, stage): re1 = re.compile('<([./a-zA-Z0-9_-]*)>') re2 = re.compile('"([./a-zA-Z0-9_-]*)"') +re3 = re.compile('DMLC_EXECINFO_H') sysheaders = [] history = set([]) @@ -129,6 +138,9 @@ def expand(x, pending, stage): with open(x, 'rb') as x_h: for line in x_h.readlines(): uline = line.decode('utf-8') + if '#define DMLC_LOG_STACK_TRACE 1' in uline.strip(): + # Do not enable stacktrace logging + continue if uline.find('#include') < 0: out.write(line) continue @@ -138,10 +150,15 @@ def expand(x, pending, stage): m = re1.search(uline) if not m: m = re2.search(uline) - if not m: - print(uline + ' not found') - continue - path = m.groups()[0] + if m: + path = m.groups()[0] + else: + m = re3.search(uline) + if m: + path = 'execinfo.h' + else: + print(uline + ' not found') + continue h = path.strip('./') if "../3rdparty/" not in path else path if h.endswith('complex.h') and x.endswith('openblas_config.h'): source = '' diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 61a4637830da..3cb806e0aadd 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -45,6 +45,7 @@ apt-get install -y \ software-properties-common \ sudo \ unzip \ + vim-nox \ wget # Use libturbojpeg package as it is correctly compiled with -fPIC flag diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index a89c51de0d8e..c3610d2452e0 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -755,6 +755,53 @@ build_ubuntu_gpu_cmake() { ninja -v } +build_ubuntu_cpu_large_tensor() { + set -ex + cd /work/build + build_ccache_wrappers + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DUSE_SIGNAL_HANDLER=ON \ + -DENABLE_TESTCOVERAGE=ON \ + -DUSE_CUDA=OFF \ + -DUSE_CUDNN=OFF \ + -DUSE_MKLDNN=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DUSE_INT64_TENSOR_SIZE=ON \ + -G Ninja \ + /work/mxnet + + ninja -v +} + +build_ubuntu_gpu_large_tensor() { + set -ex + cd /work/build + build_ccache_wrappers + cmake \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DUSE_SIGNAL_HANDLER=ON \ + -DENABLE_TESTCOVERAGE=ON \ + -DUSE_CUDA=ON \ + -DUSE_CUDNN=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DUSE_MKLML_MKL=OFF \ + -DUSE_MKLDNN=OFF \ + -DUSE_DIST_KVSTORE=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DUSE_INT64_TENSOR_SIZE=ON \ + -G Ninja 
\ + /work/mxnet + + ninja -v +} + build_ubuntu_blc() { echo "pass" } @@ -1183,6 +1230,13 @@ nightly_test_KVStore_singleNode() { python tests/nightly/test_kvstore.py } +#Test Large Tensor Size +nightly_test_large_tensor() { + set -ex + export PYTHONPATH=./python/ + nosetests-3.4 tests/nightly/test_large_array.py +} + #Tests Amalgamation Build with 5 different sets of flags nightly_test_amalgamation() { set -ex @@ -1327,7 +1381,7 @@ deploy_jl_docs() { # ... } -build_scala_static_mkl() { +build_static_scala_mkl() { set -ex pushd . scala_prepare @@ -1345,6 +1399,14 @@ build_static_python_mkl() { popd } +build_static_python_cu100mkl() { + set -ex + pushd . + export mxnet_variant=cu100mkl + ./ci/publish/python/build.sh + popd +} + publish_scala_build() { set -ex pushd . diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index e34b25d6d355..23230ac0442f 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -33,7 +33,7 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, lib/libsparse_matrix.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' mx_tensorrt_lib = 'build/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*' mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*' @@ -119,6 +119,34 @@ def compile_unix_openblas_debug_cpu() { }] } +def compile_unix_int64_cpu() { + return ['CPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cpu-int64') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_cpu', 'build_ubuntu_cpu_large_tensor', false) + utils.pack_lib('ubuntu_cpu_int64', mx_cmake_lib, true) + } + } + } + }] +} + +def compile_unix_int64_gpu() { + return ['GPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_GPU) { + ws('workspace/build-gpu-int64') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run('ubuntu_gpu', 'build_ubuntu_gpu_large_tensor', false) + utils.pack_lib('ubuntu_gpu_int64', mx_cmake_lib, true) + } + } + } + }] +} + def compile_unix_mkl_cpu() { return ['CPU: MKL': { node(NODE_LINUX_CPU) { @@ -521,7 +549,7 @@ def test_static_scala_cpu() { ws('workspace/ut-publish-scala-cpu') { timeout(time: max_time, unit: 'MINUTES') { utils.init_git() - utils.docker_run("publish.ubuntu1404_cpu", 'build_scala_static_mkl', false) + utils.docker_run("publish.ubuntu1404_cpu", 'build_static_scala_mkl', false) } } } @@ -541,6 +569,19 @@ def test_static_python_cpu() { }] } +def 
test_static_python_gpu() { + return ['Static build GPU 14.04 Python' : { + node(NODE_LINUX_GPU) { + ws('workspace/ut-publish-python-gpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run("publish.ubuntu1404_gpu", 'build_static_python_cu100mkl', true) + } + } + } + }] +} + def test_unix_python2_cpu() { return ['Python2: CPU': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu index 919381ebccd4..fa0942988d9c 100644 --- a/ci/jenkins/Jenkinsfile_unix_cpu +++ b/ci/jenkins/Jenkinsfile_unix_cpu @@ -38,7 +38,8 @@ core_logic: { custom_steps.compile_unix_openblas_debug_cpu(), custom_steps.compile_unix_mkl_cpu(), custom_steps.compile_unix_mkldnn_cpu(), - custom_steps.compile_unix_mkldnn_mkl_cpu() + custom_steps.compile_unix_mkldnn_mkl_cpu(), + custom_steps.compile_unix_int64_cpu() ]) utils.parallel_stage('Tests', [ diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index f6191deb7a68..e2a089b0469b 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -40,6 +40,7 @@ core_logic: { custom_steps.compile_unix_cmake_mkldnn_gpu(), custom_steps.compile_unix_cmake_gpu(), custom_steps.compile_unix_tensorrt_gpu(), + custom_steps.compile_unix_int64_gpu() ]) utils.parallel_stage('Tests', [ @@ -50,8 +51,7 @@ core_logic: { custom_steps.test_unix_python2_mkldnn_gpu(), custom_steps.test_unix_python3_mkldnn_gpu(), custom_steps.test_unix_python3_mkldnn_nocudnn_gpu(), -// Disabled temporarily for /~https://github.com/apache/incubator-mxnet/issues/14626 -// custom_steps.test_unix_python3_tensorrt_gpu(), + custom_steps.test_unix_python3_tensorrt_gpu(), custom_steps.test_unix_perl_gpu(), custom_steps.test_unix_r_gpu(), custom_steps.test_unix_cpp_gpu(), @@ -59,7 +59,8 @@ core_logic: { custom_steps.test_unix_python3_integration_gpu(), custom_steps.test_unix_cpp_package_gpu(), custom_steps.test_unix_scala_gpu(), - custom_steps.test_unix_distributed_kvstore_gpu() + custom_steps.test_unix_distributed_kvstore_gpu(), + custom_steps.test_static_python_gpu() // Disabled due to: /~https://github.com/apache/incubator-mxnet/issues/11407 //custom_steps.test_unix_caffe_gpu() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index ac6ce3926c37..6b427db85ec9 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -216,7 +216,7 @@ function(mxnet_option variable description value) option(${variable} "${description}" ${__value}) endif() else() - unset(${variable} CACHE) + option(${variable} "${description}" OFF) endif() endfunction() diff --git a/contrib/clojure-package/.gitignore b/contrib/clojure-package/.gitignore index f5d81ddc7620..71d812e56ecd 100644 --- a/contrib/clojure-package/.gitignore +++ b/contrib/clojure-package/.gitignore @@ -39,6 +39,8 @@ examples/visualization/test-vis.pdf src/.DS_Store src/org/.DS_Store test/test-ndarray.clj +test/test-ndarray-api.clj test/test-symbol.clj +test/test-symbol-api.clj src/org/apache/clojure_mxnet/gen/* diff --git a/contrib/clojure-package/examples/bert-qa/.gitignore b/contrib/clojure-package/examples/bert-qa/.gitignore new file mode 100644 index 000000000000..d18f225992a9 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/.gitignore @@ -0,0 +1,12 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +.hgignore +.hg/ diff --git a/contrib/clojure-package/examples/bert-qa/README.md b/contrib/clojure-package/examples/bert-qa/README.md new file mode 100644 index 
000000000000..9a21bcdfd66b --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/README.md @@ -0,0 +1,91 @@ + + + + + + + + + + + + + + + + + + +# bert-qa + +**This example is based on the Java API one. It shows how to do inference with a pre-trained BERT network that is trained on Questions and Answers using the [SQuAD Dataset](https://rajpurkar.github.io/SQuAD-explorer/)** + +The pretrained model was created using GluonNLP and then exported to the MXNet symbol format. You can find more information in the background section below. + +In this tutorial, we will walk through the BERT QA model trained by MXNet. +Users can provide a question and a paragraph containing the answer to the model, and +the model will find the best answer within that paragraph. + +Example: + +``` +{:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler." + :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?" + :ground-truth-answers ["solar" + "solar power" + "solar power, nuclear power or geothermal energysolar"]} +``` + +The prediction in this case would be `solar power` + +## Setup Guide + +### Step 1: Download the model + +For this tutorial, you can get the model and vocabulary by running the following bash script. This script will use `wget` to download these artifacts from AWS S3. + +From the example directory: + +```bash +./get_bert_data.sh +``` + +Some sample questions and answers are provided in the `squad-samples.edn` file. Some are taken directly from the SQuAD dataset and one was just made up. Feel free to edit the file and add your own! + + +## To run + +* `lein install` in the root of the main project directory +* cd into this project directory and do `lein run`. This will execute the cpu version. + +`lein run :cpu` - to run with cpu +`lein run :gpu` - to run with gpu + +## Background + +To learn more about how BERT works in MXNet, please follow this [MXNet Gluon tutorial on NLP using BERT](https://medium.com/apache-mxnet/gluon-nlp-bert-6a489bdd3340). + +The model was extracted from MXNet GluonNLP with static length settings. + +[Download link for the script](https://gluon-nlp.mxnet.io/_downloads/bert.zip) + +The original description can be found in the [MXNet GluonNLP model zoo](https://gluon-nlp.mxnet.io/model_zoo/bert/index.html#bert-base-on-squad-1-1). +```bash +python static_finetune_squad.py --optimizer adam --accumulate 2 --batch_size 6 --lr 3e-5 --epochs 2 --gpu 0 --export + +``` +This script will generate `json` and `param` files that are the standard MXNet model files. +By default, this model uses the `bert_12_768_12` model with extra layers for QA jobs. + +After that, to be able to use it in Java, we need to export the dictionary from the script in order to parse the text +into actual indexes. Please add the following lines after [this line](/~https://github.com/dmlc/gluon-nlp/blob/master/scripts/bert/staticbert/static_finetune_squad.py#L262).
+```python +import json +json_str = vocab.to_json() +f = open("vocab.json", "w") +f.write(json_str) +f.close() +``` +This would export the token vocabulary in json format. +Once you have these three files, you will be able to run this example without problems. + diff --git a/contrib/clojure-package/examples/bert-qa/get_bert_data.sh b/contrib/clojure-package/examples/bert-qa/get_bert_data.sh new file mode 100755 index 000000000000..603194a03c05 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/get_bert_data.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +data_path=model + +if [ ! -d "$data_path" ]; then + mkdir -p "$data_path" + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/vocab.json -o $data_path/vocab.json + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/static_bert_qa-0002.params -o $data_path/static_bert_qa-0002.params + curl https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA/static_bert_qa-symbol.json -o $data_path/static_bert_qa-symbol.json +fi diff --git a/contrib/clojure-package/examples/bert-qa/project.clj b/contrib/clojure-package/examples/bert-qa/project.clj new file mode 100644 index 000000000000..d256d44d0798 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/project.clj @@ -0,0 +1,28 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + +(defproject bert-qa "0.1.0-SNAPSHOT" + :description "BERT QA Example" + :plugins [[lein-cljfmt "0.5.7"]] + :dependencies [[org.clojure/clojure "1.9.0"] + [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] + [cheshire "5.8.1"]] + :pedantic? 
:skip + :java-source-paths ["src/java"] + :main bert-qa.infer + :repl-options {:init-ns bert-qa.infer}) diff --git a/contrib/clojure-package/examples/bert-qa/squad-samples.edn b/contrib/clojure-package/examples/bert-qa/squad-samples.edn new file mode 100644 index 000000000000..e99a181f7d17 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/squad-samples.edn @@ -0,0 +1,39 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + +[{:input-answer "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm." + :input-question "By what main attribute are computational problems classified utilizing computational complexity theory?" + :ground-truth-answers ["Computational complexity theory" + "Computational complexity theory" + "complexity theory"]} + {:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler." + :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?" + :ground-truth-answers ["solar" + "solar power" + "solar power, nuclear power or geothermal energysolar"]} + {:input-answer "In the 1960s, a series of discoveries, the most important of which was seafloor spreading, showed that the Earth's lithosphere, which includes the crust and rigid uppermost portion of the upper mantle, is separated into a number of tectonic plates that move across the plastically deforming, solid, upper mantle, which is called the asthenosphere. There is an intimate coupling between the movement of the plates on the surface and the convection of the mantle: oceanic plate motions and mantle convection currents always move in the same direction, because the oceanic lithosphere is the rigid upper thermal boundary layer of the convecting mantle. This coupling between rigid plates moving on the surface of the Earth and the convecting mantle is called plate tectonics." 
+ :input-question "What was the most important discovery that led to the understanding that Earth's lithosphere is separated into tectonic plates?" + :ground-truth-answers ["seafloor spreading"]} + ;;; totally made up + {:input-answer "Susan had a cat named Sammy when she lived in the green house." + :input-question "What was Susan's cat named?" + :ground-truth-answers ["Sammy" "sammy"]} + ;;; more or less from wikipedia on clojure + {:input-answer "Rich Hickey is the creator of the Clojure language. Before Clojure, he developed dotLisp, a similar project based on the .NET platform, and three earlier attempts to provide interoperability between Lisp and Java: a Java foreign language interface for Common Lisp, A Foreign Object Interface for Lisp, and a Lisp-friendly interface to Java Servlets." + :input-question "Who created Clojure?" + :ground-truth-answers ["rich" "hickey"]}] diff --git a/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj new file mode 100644 index 000000000000..836684e04977 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/src/bert_qa/infer.clj @@ -0,0 +1,159 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + + +(ns bert-qa.infer + (:require [clojure.string :as string] + [clojure.reflect :as r] + [cheshire.core :as json] + [clojure.java.io :as io] + [clojure.set :as set] + [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.context :as context] + [org.apache.clojure-mxnet.layout :as layout] + [org.apache.clojure-mxnet.ndarray :as ndarray] + [org.apache.clojure-mxnet.infer :as infer] + [clojure.pprint :as pprint])) + +(def model-path-prefix "model/static_bert_qa") +;; epoch number of the model +(def epoch 2) +;; the vocabulary used in the model +(def model-vocab "model/vocab.json") +;; the input question +;; the maximum length of the sequence +(def seq-length 384) + +;;; data helpers + +(defn break-out-punctuation [s str-match] + (->> (string/split (str s "") (re-pattern (str "\\" str-match))) + (map #(string/replace % "" str-match)))) + +(defn break-out-punctuations [s] + (if-let [target-char (first (re-seq #"[.,?!]" s))] + (break-out-punctuation s target-char) + [s])) + +(defn tokenize [s] + (->> (string/split s #"\s+") + (mapcat break-out-punctuations) + (into []))) + +(defn pad [tokens pad-item num] + (if (>= (count tokens) num) + tokens + (into tokens (repeat (- num (count tokens)) pad-item)))) + +(defn get-vocab [] + (let [vocab (json/parse-stream (clojure.java.io/reader "model/vocab.json"))] + {:idx->token (get vocab "idx_to_token") + :token->idx (get vocab "token_to_idx")})) + +(defn tokens->idxs [token->idx tokens] + (let [unk-idx (get token->idx "[UNK]")] + (mapv #(get token->idx % unk-idx) tokens))) + +(defn idxs->tokens [idx->token idxs] + (mapv #(get idx->token %) idxs)) + +(defn post-processing [result tokens] + (let [output1 (ndarray/slice-axis result 2 0 1) + output2 (ndarray/slice-axis result 2 1 2) + ;;; get the formatted logits result + start-logits (ndarray/reshape output1 [0 -3]) + end-logits (ndarray/reshape output2 [0 -3]) + start-prob (ndarray/softmax start-logits) + end-prob (ndarray/softmax end-logits) + start-idx (-> (ndarray/argmax start-prob 1) + (ndarray/->vec) + (first)) + end-idx (-> (ndarray/argmax end-prob 1) + (ndarray/->vec) + (first))] + (if (> end-idx start-idx) + (subvec tokens start-idx (inc end-idx)) + (subvec tokens end-idx (inc end-idx))))) + +(defn make-predictor [ctx] + (let [input-descs [{:name "data0" + :shape [1 seq-length] + :dtype dtype/FLOAT32 + :layout layout/NT} + {:name "data1" + :shape [1 seq-length] + :dtype dtype/FLOAT32 + :layout layout/NT} + {:name "data2" + :shape [1] + :dtype dtype/FLOAT32 + :layout layout/N}] + factory (infer/model-factory model-path-prefix input-descs)] + (infer/create-predictor + factory + {:contexts [ctx] + :epoch 2}))) + +(defn pre-processing [ctx idx->token token->idx qa-map] + (let [{:keys [input-question input-answer ground-truth-answers]} qa-map + ;;; pre-processing tokenize sentence + token-q (tokenize (string/lower-case input-question)) + token-a (tokenize (string/lower-case input-answer)) + valid-length (+ (count token-q) (count token-a)) + ;;; generate token types [0000...1111...0000] + qa-embedded (into (pad [] 0 (count token-q)) + (pad [] 1 (count token-a))) + token-types (pad qa-embedded 0 seq-length) + ;;; make BERT pre-processing standard + token-a (conj token-a "[SEP]") + token-q (into [] (concat ["[CLS]"] token-q ["[SEP]"] token-a)) + tokens (pad token-q "[PAD]" seq-length) + ;;; pre-processing - token to index translation + + indexes (tokens->idxs token->idx tokens)] + {:input-batch [(ndarray/array indexes [1 seq-length] {:context ctx}) + (ndarray/array token-types [1 
seq-length] {:context ctx}) + (ndarray/array [valid-length] [1] {:context ctx})] + :tokens tokens + :qa-map qa-map})) + +(defn infer [ctx] + (let [predictor (make-predictor ctx) + {:keys [idx->token token->idx]} (get-vocab) + ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/ + question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] + (doseq [qa-map question-answers] + (let [{:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) + result (first (infer/predict-with-ndarray predictor input-batch)) + answer (post-processing result tokens)] + (println "===============================") + (println " Question Answer Data") + (pprint/pprint qa-map) + (println) + (println " Predicted Answer: " answer) + (println "==============================="))))) + +(defn -main [& args] + (let [[dev] args] + (if (= dev ":gpu") + (infer (context/gpu)) + (infer (context/cpu))))) + +(comment + + (infer (context/cpu))) diff --git a/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj b/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj new file mode 100644 index 000000000000..767fb089f284 --- /dev/null +++ b/contrib/clojure-package/examples/bert-qa/test/bert_qa/infer_test.clj @@ -0,0 +1,42 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License.
+;; + + +(ns bert-qa.infer-test + (:require [bert-qa.infer :refer :all] + [clojure.java.io :as io] + [clojure.java.shell :refer [sh]] + [clojure.test :refer :all] + [org.apache.clojure-mxnet.context :as context] + [org.apache.clojure-mxnet.infer :as infer])) + +(def model-dir "model/") + +(when-not (.exists (io/file (str model-dir "static_bert_qa-0002.params"))) + (println "Downloading bert qa data") + (sh "./get_bert_data.sh")) + +(deftest infer-test + (let [ctx (context/default-context) + predictor (make-predictor ctx) + {:keys [idx->token token->idx]} (get-vocab) + ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/ + question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))] + (let [qa-map (last question-answers) + {:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map) + result (first (infer/predict-with-ndarray predictor input-batch))] + (is (= ["rich" "hickey"] (post-processing result tokens)))))) diff --git a/contrib/clojure-package/src/dev/generator.clj b/contrib/clojure-package/src/dev/generator.clj index ca93c3421d2a..d1f59dc5082e 100644 --- a/contrib/clojure-package/src/dev/generator.clj +++ b/contrib/clojure-package/src/dev/generator.clj @@ -17,14 +17,19 @@ (ns dev.generator (:require [t6.from-scala.core :as scala] + [t6.from-scala.core :refer [$ $$] :as $] [clojure.reflect :as r] - [org.apache.clojure-mxnet.util :as util] - [clojure.pprint]) - (:import (org.apache.mxnet NDArray Symbol)) + [clojure.pprint] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArray NDArrayAPI + Symbol SymbolAPI + Base Base$RefInt Base$RefLong Base$RefFloat Base$RefString) + (scala.collection.mutable ListBuffer ArrayBuffer)) (:gen-class)) (defn clojure-case + "Transforms a scala string (function name) to clojure case" [string] (-> string (clojure.string/replace #"(\s+)([A-Z][a-z]+)" "$1-$2") @@ -34,17 +39,17 @@ (clojure.string/replace #"\_" "-") (clojure.string/replace #"\/" "div"))) -(defn symbol-transform-param-name [parameter-types] +(defn transform-param-names [coerce-fn parameter-types] (->> parameter-types (map str) - (map (fn [x] (or (util/symbol-param-coerce x) x))) + (map (fn [x] (or (coerce-fn x) x))) (map (fn [x] (last (clojure.string/split x #"\.")))))) +(defn symbol-transform-param-name [parameter-types] + (transform-param-names util/symbol-param-coerce parameter-types)) + (defn ndarray-transform-param-name [parameter-types] - (->> parameter-types - (map str) - (map (fn [x] (or (util/ndarray-param-coerce x) x))) - (map (fn [x] (last (clojure.string/split x #"\.")))))) + (transform-param-names util/ndarray-param-coerce parameter-types)) (defn has-variadic? [params] (->> params @@ -53,40 +58,160 @@ count pos?)) - (defn increment-param-name [pname] (if-let [num-str (re-find #"-\d" pname)] - (str (first (clojure.string/split pname #"-")) "-" (inc (Integer/parseInt (last (clojure.string/split num-str #"-"))))) + (str + (first (clojure.string/split pname #"-")) + "-" + (inc (Integer/parseInt (last (clojure.string/split num-str #"-"))))) (str pname "-" 1))) -(defn rename-duplicate-params [params] - (reduce (fn [known-names n] (conj known-names (if (contains? 
(set known-names) n) - (increment-param-name n) - n))) - [] - params)) +(defn rename-duplicate-params [pnames] + (->> (reduce + (fn [pname-counts n] + (let [rn (if (pname-counts n) (str n "-" (pname-counts n)) n) + inc-pname-counts (update-in pname-counts [n] (fnil inc 0))] + (update-in inc-pname-counts [:params] conj rn))) + {:params []} + pnames) + :params)) + +(defn get-public-no-default-methods [obj] + (->> (r/reflect obj) + :members + (map #(into {} %)) + (filter #(-> % :flags :public)) + (remove #(re-find #"org\$apache\$mxnet" (str (:name %)))) + (remove #(re-find #"\$default" (str (:name %)))))) + +(defn get-public-to-gen-methods [public-to-hand-gen public-no-default] + (let [public-to-hand-gen-names + (into #{} (mapv (comp str :name) public-to-hand-gen))] + (remove #(-> % :name str public-to-hand-gen-names) public-no-default))) +(defn public-by-name-and-param-count [public-reflect-info] + (->> public-reflect-info + (group-by :name) + (map (fn [[k v]] [k (group-by #(count (:parameter-types %)) v)])) + (into {}))) -;;;;;;; symbol - -(def symbol-reflect-info (->> (:members (r/reflect Symbol)) - (map #(into {} %)))) +(def license + (str + ";; Licensed to the Apache Software Foundation (ASF) under one or more\n" + ";; contributor license agreements. See the NOTICE file distributed with\n" + ";; this work for additional information regarding copyright ownership.\n" + ";; The ASF licenses this file to You under the Apache License, Version 2.0\n" + ";; (the \"License\"); you may not use this file except in compliance with\n" + ";; the License. You may obtain a copy of the License at\n" + ";;\n" + ";; http://www.apache.org/licenses/LICENSE-2.0\n" + ";;\n" + ";; Unless required by applicable law or agreed to in writing, software\n" + ";; distributed under the License is distributed on an \"AS IS\" BASIS,\n" + ";; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + ";; See the License for the specific language governing permissions and\n" + ";; limitations under the License.\n" + ";;\n")) -(def symbol-public (filter (fn [x] (-> x :flags :public)) symbol-reflect-info)) +(defn write-to-file [functions ns-gen fname] + (with-open [w (clojure.java.io/writer fname)] + (.write w ns-gen) + (.write w "\n\n") + (.write w ";; Do not edit - this is auto-generated") + (.write w "\n\n") + (.write w license) + (.write w "\n\n") + (.write w "\n\n") + (doseq [f functions] + (let [fstr (-> f + clojure.pprint/pprint + with-out-str + (clojure.string/replace #"\\n\\n" "\n"))] + (.write w fstr)) + (.write w "\n")))) -(def symbol-public-no-default (->> symbol-public - (filter #(not (re-find #"org\$apache\$mxnet" (str (:name %))))) - (filter #(not (re-find #"\$default" (str (:name %))))))) +(defn remove-prefix + [prefix s] + (let [regex (re-pattern (str prefix "(.*)")) + replacement "$1"] + (clojure.string/replace s regex replacement))) + +(defn in-namespace-random? [op-name] + (or (clojure.string/includes? op-name "random_") + (clojure.string/includes? op-name "sample_"))) + +(defn op-name->namespace-type [op-name] + (cond + (#{"uniform" "normal"} op-name) :deprecated + (clojure.string/includes? op-name "random_") :random + (clojure.string/includes? 
op-name "sample_") :random + :else :core)) + +;;;;;;; Common operations + +(def libinfo (Base/_LIB)) + +(def op-names + (let [l ($ ListBuffer/empty)] + (.mxListAllOpNames libinfo l) + (->> l + (util/buffer->vec) + (remove #(or (= "Custom" %) (re-matches #"^_.*" %)))))) + +(defn- parse-arg-type [s] + (let [[_ var-arg-type _ set-arg-type arg-spec _ type-req _ default-val] (re-find #"(([\w-\[\]\s]+)|\{([^}]+)\})\s*(\([^)]+\))?(,\s*(optional|required)(,\s*default=(.*))?)?" s)] + {:type (clojure.string/trim (or set-arg-type var-arg-type)) + :spec arg-spec + :optional? (or (= "optional" type-req) + (= "boolean" var-arg-type)) + :default default-val + :orig s})) + +(defn- get-op-handle [op-name] + (let [ref (new Base$RefLong 0)] + (do (.nnGetOpHandle libinfo op-name ref) + (.value ref)))) + +(defn gen-op-info [op-name] + (let [handle (get-op-handle op-name) + name (new Base$RefString nil) + desc (new Base$RefString nil) + key-var-num-args (new Base$RefString nil) + num-args (new Base$RefInt 0) + arg-names ($ ListBuffer/empty) + arg-types ($ ListBuffer/empty) + arg-descs ($ ListBuffer/empty)] + (do (.mxSymbolGetAtomicSymbolInfo libinfo + handle + name + desc + num-args + arg-names + arg-types + arg-descs + key-var-num-args) + {:fn-name (clojure-case (.value name)) + :fn-description (.value desc) + :args (mapv (fn [t n d] (assoc t :name n :description d)) + (mapv parse-arg-type (util/buffer->vec arg-types)) + (mapv clojure-case (util/buffer->vec arg-names)) + (util/buffer->vec arg-descs)) + :key-var-num-args (clojure-case (.value key-var-num-args))}))) + +;;;;;;; Symbol + +(def symbol-public-no-default + (get-public-no-default-methods Symbol)) (into #{} (mapcat :parameter-types symbol-public-no-default)) - ;#{java.lang.Object scala.collection.Seq scala.Option long double scala.collection.immutable.Map int ml.dmlc.mxnet.Executor float ml.dmlc.mxnet.Context java.lang.String scala.Enumeration$Value ml.dmlc.mxnet.Symbol int<> ml.dmlc.mxnet.Symbol<> ml.dmlc.mxnet.Shape java.lang.String<>} +;; #{java.lang.Object scala.collection.Seq scala.Option long double scala.collection.immutable.Map int ml.dmlc.mxnet.Executor float ml.dmlc.mxnet.Context java.lang.String scala.Enumeration$Value ml.dmlc.mxnet.Symbol int<> ml.dmlc.mxnet.Symbol<> ml.dmlc.mxnet.Shape java.lang.String<>} -(def symbol-hand-gen-set #{"scala.Option" - "int org.apache.mxnet.Executor" - "scala.Enumeration$Value" - "org.apache.mxnet.Context" - "scala.Tuple2" - "scala.collection.Traversable"} ) +(def symbol-hand-gen-set + #{"scala.Option" + "scala.Enumeration$Value" + "org.apache.mxnet.Context" + "scala.Tuple2" + "scala.collection.Traversable"}) ;;; min and max have a conflicting arity of 2 with the auto gen signatures (def symbol-filter-name-set #{"max" "min"}) @@ -102,34 +227,35 @@ count pos?))) -(def symbol-public-to-hand-gen (filter is-symbol-hand-gen? symbol-public-no-default)) -(def symbol-public-to-gen (->> (remove #(contains?(->> symbol-public-to-hand-gen - (mapv :name) - (mapv str) - (set)) (str (:name %))) symbol-public-no-default))) +(def symbol-public-to-hand-gen + (filter is-symbol-hand-gen? symbol-public-no-default)) +(def symbol-public-to-gen + (get-public-to-gen-methods symbol-public-to-hand-gen + symbol-public-no-default)) (count symbol-public-to-hand-gen) ;=> 35 mostly bind! 
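To make the behavior of the de-duplication helpers above concrete, here is a hand-traced REPL sketch; the expected values are derived by reading `increment-param-name` and `rename-duplicate-params` as defined above, not captured from a live session:

```clojure
;; Hand-traced REPL sketch for the duplicate-param helpers above.
(increment-param-name "data")    ;=> "data-1"
(increment-param-name "data-1")  ;=> "data-2"

;; The first occurrence keeps its bare name; later duplicates get a
;; numeric suffix, so a generated arglist never repeats a name.
(rename-duplicate-params ["data" "data" "label"])
;;=> ["data" "data-1" "label"]
```

This is why the hand-written symbol arities further down refer to both `kwargs-map` and `kwargs-map-1`: Scala overloads can coerce to the same Clojure-cased parameter name, and the suffix keeps the generated arglists valid.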
(count symbol-public-to-gen) ;=> 307 -(into #{} (map :name symbol-public-to-hand-gen));=> #{arange bind ones zeros simpleBind Variable} +(into #{} (map :name symbol-public-to-hand-gen)) +;;=> #{arange bind ones zeros simpleBind Variable} -(defn public-by-name-and-param-count [public-reflect-info] - (->> public-reflect-info - (group-by :name) - (map (fn [[k v]] [k (group-by #(count (:parameter-types %)) v)])) - (into {}))) (defn symbol-vector-args [] - `(if (map? ~'kwargs-map-or-vec-or-sym) (~'util/empty-list) (~'util/coerce-param ~'kwargs-map-or-vec-or-sym #{"scala.collection.Seq"}))) + `(if (map? ~'kwargs-map-or-vec-or-sym) + (~'util/empty-list) + (~'util/coerce-param ~'kwargs-map-or-vec-or-sym #{"scala.collection.Seq"}))) (defn symbol-map-args [] - `(if (map? ~'kwargs-map-or-vec-or-sym) (util/convert-symbol-map ~'kwargs-map-or-vec-or-sym) nil)) + `(if (map? ~'kwargs-map-or-vec-or-sym) + (util/convert-symbol-map ~'kwargs-map-or-vec-or-sym) + nil)) (defn add-symbol-arities [params function-name] - (if (= ["sym-name" "kwargs-map" "symbol-list" "kwargs-map-1"] (mapv str params)) + (if (= ["sym-name" "kwargs-map" "symbol-list" "kwargs-map-1"] + (mapv str params)) [`([~'sym-name ~'attr-map ~'kwargs-map] (~function-name ~'sym-name (~'util/convert-symbol-map ~'attr-map) (~'util/empty-list) (~'util/convert-symbol-map ~'kwargs-map))) `([~'sym-name ~'kwargs-map-or-vec-or-sym] @@ -180,37 +306,6 @@ `(~'defn ~function-name ~@(remove nil? (gen-symbol-function-arity op-name op-values function-name)))))) -(def license - (str - ";; Licensed to the Apache Software Foundation (ASF) under one or more\n" - ";; contributor license agreements. See the NOTICE file distributed with\n" - ";; this work for additional information regarding copyright ownership.\n" - ";; The ASF licenses this file to You under the Apache License, Version 2.0\n" - ";; (the \"License\"); you may not use this file except in compliance with\n" - ";; the License. 
You may obtain a copy of the License at\n" - ";;\n" - ";; http://www.apache.org/licenses/LICENSE-2.0\n" - ";;\n" - ";; Unless required by applicable law or agreed to in writing, software\n" - ";; distributed under the License is distributed on an \"AS IS\" BASIS,\n" - ";; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" - ";; See the License for the specific language governing permissions and\n" - ";; limitations under the License.\n" - ";;\n")) - -(defn write-to-file [functions ns-gen fname] - (with-open [w (clojure.java.io/writer fname)] - (.write w ns-gen) - (.write w "\n\n") - (.write w ";; Do not edit - this is auto-generated") - (.write w "\n\n") - (.write w license) - (.write w "\n\n") - (.write w "\n\n") - (doseq [f functions] - (clojure.pprint/pprint f w) - (.write w "\n")))) - (def symbol-gen-ns "(ns org.apache.clojure-mxnet.symbol (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max min repeat reverse set sort take to-array empty sin @@ -221,27 +316,22 @@ (defn generate-symbol-file [] (println "Generating symbol file") - (write-to-file all-symbol-functions symbol-gen-ns "src/org/apache/clojure_mxnet/gen/symbol.clj")) - - -;;;;;;;;NDARRAY - - -(def ndarray-reflect-info (->> (:members (r/reflect NDArray)) - (map #(into {} %)))) + (write-to-file all-symbol-functions + symbol-gen-ns + "src/org/apache/clojure_mxnet/gen/symbol.clj")) +;;;;;;; NDArray -(def ndarray-public (filter (fn [x] (-> x :flags :public)) ndarray-reflect-info)) -(def ndarray-public-no-default (->> ndarray-public - (filter #(not (re-find #"org\$apache\$mxnet" (str (:name %))))) - (filter #(not (re-find #"\$default" (str (:name %))))))) +(def ndarray-public-no-default + (get-public-no-default-methods NDArray)) -(def ndarray-hand-gen-set #{"org.apache.mxnet.NDArrayFuncReturn" - "org.apache.mxnet.Context" - "scala.Enumeration$Value" - "scala.Tuple2" - "scala.collection.Traversable"} ) +(def ndarray-hand-gen-set + #{"org.apache.mxnet.NDArrayFuncReturn" + "org.apache.mxnet.Context" + "scala.Enumeration$Value" + "scala.Tuple2" + "scala.collection.Traversable"}) (defn is-ndarray-hand-gen? [info] (->> (map str (:parameter-types info)) @@ -250,20 +340,16 @@ count pos?)) - -(def ndarray-public-to-hand-gen (filter is-ndarray-hand-gen? ndarray-public-no-default)) -(def ndarray-public-to-gen (->> (remove #(contains?(->> ndarray-public-to-hand-gen - (mapv :name) - (mapv str) - (set)) (str (:name %))) ndarray-public-no-default))) - +(def ndarray-public-to-hand-gen + (filter is-ndarray-hand-gen? ndarray-public-no-default)) +(def ndarray-public-to-gen + (get-public-to-gen-methods ndarray-public-to-hand-gen + ndarray-public-no-default)) (count ndarray-public-to-hand-gen) ;=> 15 (count ndarray-public-to-gen) ;=> 486 -(map :name ndarray-public-to-hand-gen) - - +(->> ndarray-public-to-hand-gen (map :name) (into #{})) (defn gen-ndarray-function-arity [op-name op-values] (for [[param-count info] op-values] @@ -294,18 +380,22 @@ ))))) +(defn gen-ndarray-functions [public-to-gen-methods] + (for [operation (sort (public-by-name-and-param-count public-to-gen-methods))] + (let [[op-name op-values] operation + function-name (-> op-name + str + scala/decode-scala-symbol + clojure-case + symbol)] + `(~'defn ~function-name + ~@(remove nil? 
(gen-ndarray-function-arity op-name op-values)))))) + (def all-ndarray-functions - (for [operation (sort (public-by-name-and-param-count ndarray-public-to-gen))] - (let [[op-name op-values] operation - function-name (-> op-name - str - scala/decode-scala-symbol - clojure-case - symbol)] - `(~'defn ~function-name - ~@(remove nil? (gen-ndarray-function-arity op-name op-values)))))) + (gen-ndarray-functions ndarray-public-to-gen)) -(def ndarray-gen-ns "(ns org.apache.clojure-mxnet.ndarray +(def ndarray-gen-ns + "(ns org.apache.clojure-mxnet.ndarray (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max min repeat reverse set sort take to-array empty shuffle ref]) @@ -314,18 +404,286 @@ (defn generate-ndarray-file [] (println "Generating ndarray file") - (write-to-file all-ndarray-functions ndarray-gen-ns "src/org/apache/clojure_mxnet/gen/ndarray.clj")) + (write-to-file all-ndarray-functions + ndarray-gen-ns + "src/org/apache/clojure_mxnet/gen/ndarray.clj")) + +;;;;;;; SymbolAPI + +(defn fn-name->random-fn-name + [fn-name] + (cond + (clojure.string/starts-with? fn-name "-random-") + (remove-prefix "-random-" fn-name) + + (clojure.string/starts-with? fn-name "-sample-") + (str (remove-prefix "-sample-" fn-name) "-like") + + :else fn-name)) + +(defn symbol-api-coerce-param + [{:keys [name sym type optional?]}] + (let [coerced-param (case type + "Shape" `(when ~sym (~'mx-shape/->shape ~sym)) + "NDArray-or-Symbol[]" `(~'clojure.core/into-array ~sym) + "Map[String, String]" + `(when ~sym + (->> ~sym + (mapv (fn [[~'k ~'v]] [~'k (str ~'v)])) + (into {}) + ~'util/convert-map)) + sym) + nil-param-allowed? (#{"name" "attr"} name)] + (if (and optional? (not nil-param-allowed?)) + `(~'util/->option ~coerced-param) + coerced-param))) + +(defn gen-symbol-api-doc [fn-description params] + (let [param-descriptions (mapv (fn [{:keys [name description optional?]}] + (str "`" name "`: " + description + (when optional? " (optional)") + "\n")) + params)] + (str fn-description "\n\n" + (apply str param-descriptions)))) + +(defn gen-symbol-api-default-arity [op-name params] + (let [opt-params (filter :optional? params) + coerced-params (mapv symbol-api-coerce-param params) + default-args (array-map :keys (mapv :sym params) + :or (into {} + (mapv (fn [{:keys [sym]}] [sym nil]) + opt-params)) + :as 'opts)] + `([~default-args] + (~'util/coerce-return + (~(symbol (str "SymbolAPI/" op-name)) + ~@coerced-params))))) + +(defn symbol-api-gen-ns + [random-namespace?] + (str + "(ns\n" + " ^{:doc \"Experimental\"}\n" + (if random-namespace? + " org.apache.clojure-mxnet.symbol-random-api\n" + " org.apache.clojure-mxnet.symbol-api\n") + " (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max\n" + " min repeat reverse set sort take to-array empty sin\n" + " get apply shuffle ref])\n" + " (:require [org.apache.clojure-mxnet.util :as util]\n" + " [org.apache.clojure-mxnet.shape :as mx-shape])\n" + " (:import (org.apache.mxnet SymbolAPI)))")) + +(defn make-gen-symbol-api-function + [{:keys [fn-name->fn-name] :or {fn-name->fn-name identity}}] + (fn [op-name] + (let [{:keys [fn-name fn-description args]} + (-> op-name (gen-op-info) (update :fn-name fn-name->fn-name)) + params (mapv (fn [{:keys [name type optional?] :as opts}] + (assoc opts + :sym (symbol name) + :optional? (or optional? + (= "NDArray-or-Symbol" type)))) + (conj args + {:name "name" + :type "String" + :optional? true + :description "Name of the symbol"} + {:name "attr" + :type "Map[String, String]" + :optional? 
true + :description "Attributes of the symbol"})) + doc (clojure.string/join + "\n\n " + (-> (gen-symbol-api-doc fn-description params) + (clojure.string/split #"\n"))) + default-call (gen-symbol-api-default-arity op-name params)] + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call)))) + +(def gen-symbol-api-function + (make-gen-symbol-api-function {})) + +(def gen-symbol-random-api-function + (make-gen-symbol-api-function {:fn-name->fn-name fn-name->random-fn-name})) + +(defn all-symbol-api-functions [op-names] + (->> op-names + (filter #(= :core (op-name->namespace-type %))) + (mapv gen-symbol-api-function))) + +(count (all-symbol-api-functions op-names)) ;215 + +(defn all-symbol-random-api-functions [op-names] + (->> op-names + (filter #(= :random (op-name->namespace-type %))) + (mapv gen-symbol-random-api-function))) + +(count (all-symbol-random-api-functions op-names)) ;16 + +(defn generate-symbol-api-file [op-names] + (println "Generating symbol-api file") + (write-to-file (all-symbol-api-functions op-names) + (symbol-api-gen-ns false) + "src/org/apache/clojure_mxnet/gen/symbol_api.clj")) + +(defn generate-symbol-random-api-file [op-names] + (println "Generating symbol-random-api file") + (write-to-file (all-symbol-random-api-functions op-names) + (symbol-api-gen-ns true) + "src/org/apache/clojure_mxnet/gen/symbol_random_api.clj")) + +;;;;;;; NDArrayAPI + +(defn ndarray-api-coerce-param + [{:keys [sym type optional?]}] + (let [coerced-param (case type + "Shape" `(when ~sym (~'mx-shape/->shape ~sym)) + "NDArray-or-Symbol[]" `(~'clojure.core/into-array ~sym) + sym)] + (if optional? + `(~'util/->option ~coerced-param) + coerced-param))) + +(defn gen-ndarray-api-doc [fn-description params] + (let [param-descriptions (mapv (fn [{:keys [name description optional?]}] + (str "`" name "`: " + description + (when optional? " (optional)") + "\n")) + params)] + (str fn-description "\n\n" + (apply str param-descriptions)))) + +(defn gen-ndarray-api-default-arity [op-name params] + (let [opt-params (filter :optional? params) + coerced-params (mapv ndarray-api-coerce-param params) + default-args (array-map :keys (mapv :sym params) + :or (into {} + (mapv (fn [{:keys [sym]}] [sym nil]) + opt-params)) + :as 'opts)] + `([~default-args] + (~'util/coerce-return + (~(symbol (str "NDArrayAPI/" op-name)) + ~@coerced-params))))) + +(defn gen-ndarray-api-required-arity [fn-name req-params] + (let [req-args (->> req-params + (mapv (fn [{:keys [sym]}] [(keyword sym) sym])) + (into {}))] + `(~(mapv :sym req-params) + (~(symbol fn-name) ~req-args)))) + +(defn make-gen-ndarray-api-function + [{:keys [fn-name->fn-name] :or {fn-name->fn-name identity}}] + (fn [op-name] + (let [{:keys [fn-name fn-description args]} + (-> op-name (gen-op-info) (update :fn-name fn-name->fn-name)) + params (mapv (fn [{:keys [name] :as opts}] + (assoc opts :sym (symbol name))) + (conj args {:name "out" + :type "NDArray-or-Symbol" + :optional? true + :description "Output array."})) + doc (clojure.string/join + "\n\n " + (-> (gen-ndarray-api-doc fn-description params) + (clojure.string/split #"\n"))) + opt-params (filter :optional? params) + req-params (remove :optional? 
params) + req-call (gen-ndarray-api-required-arity fn-name req-params) + default-call (gen-ndarray-api-default-arity op-name params)] + (if (= 1 (count req-params)) + `(~'defn ~(symbol fn-name) + ~doc + ~@default-call) + `(~'defn ~(symbol fn-name) + ~doc + ~req-call + ~default-call))))) + +(def gen-ndarray-api-function + (make-gen-ndarray-api-function {})) + +(def gen-ndarray-random-api-function + (make-gen-ndarray-api-function {:fn-name->fn-name fn-name->random-fn-name})) + +(defn all-ndarray-api-functions [op-names] + (->> op-names + (filter #(= :core (op-name->namespace-type %))) + (mapv gen-ndarray-api-function))) + +(count (all-ndarray-api-functions op-names)) ; 213 + +(defn all-ndarray-random-api-functions [op-names] + (->> op-names + (filter #(= :random (op-name->namespace-type %))) + (mapv gen-ndarray-random-api-function))) + +(count (all-ndarray-random-api-functions op-names)) ;16 + +(defn ndarray-api-gen-ns [random-namespace?] + (str + "(ns\n" + " ^{:doc \"Experimental\"}\n" + (if random-namespace? + " org.apache.clojure-mxnet.ndarray-random-api\n" + " org.apache.clojure-mxnet.ndarray-api\n") + " (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max\n" + " min repeat reverse set sort take to-array empty shuffle\n" + " ref])\n" + " (:require [org.apache.clojure-mxnet.shape :as mx-shape]\n" + " [org.apache.clojure-mxnet.util :as util])\n" + " (:import (org.apache.mxnet NDArrayAPI)))")) + +(defn generate-ndarray-api-file [op-names] + (println "Generating ndarray-api file") + (write-to-file (all-ndarray-api-functions op-names) + (ndarray-api-gen-ns false) + "src/org/apache/clojure_mxnet/gen/ndarray_api.clj")) + +(defn generate-ndarray-random-api-file [op-names] + (println "Generating ndarray-random-api file") + (write-to-file (all-ndarray-random-api-functions op-names) + (ndarray-api-gen-ns true) + "src/org/apache/clojure_mxnet/gen/ndarray_random_api.clj")) + ;;; autogen the files (do (generate-ndarray-file) - (generate-symbol-file)) + + ;; NDArrayAPI + (generate-ndarray-api-file op-names) + (generate-ndarray-random-api-file op-names) + + (generate-symbol-file) + + ;; SymbolAPI + (generate-symbol-api-file op-names) + (generate-symbol-random-api-file op-names)) (comment + (gen-op-info "ElementWiseSum") + + (gen-ndarray-api-function "Activation") + + (gen-symbol-api-function "Activation") + + (gen-ndarray-random-api-function "random_randint") + + (gen-ndarray-random-api-function "sample_normal") + + (gen-symbol-random-api-function "random_poisson") + ;; This generates a file with the bulk of the nd-array functions (generate-ndarray-file) ;; This generates a file with the bulk of the symbol functions - (generate-symbol-file) ) + (generate-symbol-file)) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj index aa5ce39f7a80..09f17e5d81f4 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/module.clj @@ -16,6 +16,7 @@ ;; (ns org.apache.clojure-mxnet.module + "Module API for Clojure package." 
(:refer-clojure :exclude [update symbol])
   (:require [org.apache.clojure-mxnet.callback :as callback]
             [org.apache.clojure-mxnet.context :as context]
@@ -31,18 +32,29 @@
   (:import (org.apache.mxnet.module Module FitParams BaseModule)
            (org.apache.mxnet.io MXDataIter NDArrayIter)
            (org.apache.mxnet Initializer Optimizer NDArray DataBatch
-                             Context EvalMetric Monitor Callback$Speedometer DataDesc)))
+                             Context EvalMetric Monitor Callback$Speedometer
+                             DataDesc)))

 (defn module
-  "Module is a basic module that wrap a symbol.
-   sym : Symbol definition.
-   map of options
-   :data-names - Input data names.
-   :label-names - Input label names
-   :contexts - Default is cpu().
-   :workload-list - Default nil, indicating uniform workload.
-   :fixed-param-names Default nil, indicating no network parameters are fixed."
-  ([sym {:keys [data-names label-names contexts workload-list fixed-param-names] :as opts
+  "Module is a basic module that wraps a `symbol`.
+   `sym`: Symbol definition.
+   `opts-map` {
+     `data-names`: vector of strings - Default is [\"data\"]
+       Input data names
+     `label-names`: vector of strings - Default is [\"softmax_label\"]
+       Input label names
+     `contexts`: Context - Default is `context/cpu`.
+     `workload-list`: Default nil
+       Indicating uniform workload.
+     `fixed-param-names`: Default nil
+       Indicating no network parameters are fixed.
+   }
+   Ex:
+     (module sym)
+     (module sym {:data-names [\"data\"]
+                  :label-names [\"linear_regression_label\"]})"
+  ([sym {:keys [data-names label-names contexts
+                workload-list fixed-param-names] :as opts
        :or {data-names ["data"]
            label-names ["softmax_label"]
            contexts [(context/default-context)]}}]
@@ -80,31 +92,41 @@
 (s/def ::force-rebind boolean?)
 (s/def ::shared-module #(instance? Module))
 (s/def ::grad-req string?)
-(s/def ::bind-opts (s/keys :req-un [::data-shapes] :opt-un [::label-shapes ::for-training ::inputs-need-grad
-                                                            ::force-rebind ::shared-module ::grad-req]))
+(s/def ::bind-opts
+  (s/keys :req-un [::data-shapes]
+          :opt-un [::label-shapes ::for-training ::inputs-need-grad
+                   ::force-rebind ::shared-module ::grad-req]))

 (defn bind
   "Bind the symbols to construct executors. This is necessary before one
   can perform computation with the module.
-   mod : module
-   map of opts:
-   :data-shapes Typically is (provide-data-desc data-iter). Data shape must be in the form of io/data-desc with is a map of :name :shape :dtype and :layout
-   :label-shapes Typically is (provide-label-desc data-iter). map of :name :shape :dtype and :layout
-   :for-training Default is `true`. Whether the executors should be bind for training.
-   :inputs-need-grad Default is `false`.
-                     Whether the gradients to the input data need to be computed.
-                     Typically this is not needed.
-                     But this might be needed when implementing composition of modules.
-   :force-rebind Default is `false`.
-                 This function does nothing if the executors are already binded.
-                 But with this `true`, the executors will be forced to rebind.
-   :shared-module Default is nil. This is used in bucketing.
-                  When not `None`, the shared module essentially corresponds to
-                  a different bucket -- a module with different symbol
-                  but with the same sets of parameters
-                  (e.g. unrolled RNNs with different lengths). "
-  [mod {:keys [data-shapes label-shapes for-training inputs-need-grad force-rebind
-               shared-module grad-req] :as opts
+   `mod`: module
+   `opts-map` {
+     `data-shapes`: map of `:name`, `:shape`, `:dtype`, and `:layout`
+       Typically is `(provide-data-desc data-iter)`. Data shape must be in the
+       form of `io/data-desc`
+     `label-shapes`: map of `:name` `:shape` `:dtype` and `:layout`
+       Typically is `(provide-label-desc data-iter)`.
+     `for-training`: boolean - Default is `true`
+       Whether the executors should be bound for training.
+     `inputs-need-grad`: boolean - Default is `false`.
+       Whether the gradients to the input data need to be computed.
+       Typically this is not needed. But this might be needed when
+       implementing composition of modules.
+     `force-rebind`: boolean - Default is `false`.
+       This function does nothing if the executors are already bound. But
+       with this `true`, the executors will be forced to rebind.
+     `shared-module`: Default is nil.
+       This is used in bucketing. When not `nil`, the shared module
+       essentially corresponds to a different bucket -- a module with
+       different symbol but with the same sets of parameters (e.g. unrolled
+       RNNs with different lengths).
+   }
+   Ex:
+     (bind mod {:data-shapes (mx-io/provide-data train-iter)
+                :label-shapes (mx-io/provide-label test-iter)})"
+  [mod {:keys [data-shapes label-shapes for-training inputs-need-grad
+               force-rebind shared-module grad-req] :as opts
       :or {for-training true
           inputs-need-grad false
          force-rebind false
@@ -129,24 +151,36 @@
 (s/def ::aux-params map?)
 (s/def ::force-init boolean?)
 (s/def ::allow-extra boolean?)
-(s/def ::init-params-opts (s/keys :opt-un [::initializer ::arg-params ::aux-params
-                                           ::force-init ::allow-extra]))
+(s/def ::init-params-opts
+  (s/keys :opt-un [::initializer ::arg-params ::aux-params
+                   ::force-init ::allow-extra]))
 (defn init-params
-  " Initialize the parameters and auxiliary states.
-  options map
-  :initializer - Called to initialize parameters if needed.
-  :arg-params - If not nil, should be a map of existing arg-params.
-                Initialization will be copied from that.
-  :auxParams - If not nil, should be a map of existing aux-params.
-               Initialization will be copied from that.
-  :allow-missing - If true, params could contain missing values,
-                   and the initializer will be called to fill those missing params.
-  :force-init - If true, will force re-initialize even if already initialized.
-  :allow-extra - Whether allow extra parameters that are not needed by symbol.
-                 If this is True, no error will be thrown when argParams or auxParams
-                 contain extra parameters that is not needed by the executor."
-  ([mod {:keys [initializer arg-params aux-params allow-missing force-init allow-extra] :as opts
+  "Initialize the parameters and auxiliary states.
+   `opts-map` {
+     `initializer`: Initializer - Default is `uniform`
+       Called to initialize parameters if needed.
+     `arg-params`: map
+       If not nil, should be a map of existing arg-params. Initialization
+       will be copied from that.
+     `aux-params`: map
+       If not nil, should be a map of existing aux-params. Initialization
+       will be copied from that.
+     `allow-missing`: boolean - Default is `false`
+       If true, params could contain missing values, and the initializer will
+       be called to fill those missing params.
+     `force-init`: boolean - Default is `false`
+       If true, will force re-initialize even if already initialized.
+     `allow-extra`: boolean - Default is `false`
+       Whether to allow extra parameters that are not needed by symbol.
+       If this is `true`, no error will be thrown when `arg-params` or
+       `aux-params` contain extra parameters that are not needed by the
+       executor.
+   }
+   Ex:
+     (init-params mod {:initializer (initializer/xavier)})
+     (init-params mod {:force-init true :allow-extra true})"
  ([mod {:keys [initializer arg-params aux-params allow-missing force-init
               allow-extra] :as opts
       :or {initializer (initializer/uniform 0.01)
          allow-missing false
          force-init false
@@ -167,17 +201,23 @@
 (s/def ::kvstore string?)
 (s/def ::reset-optimizer boolean?)
 (s/def ::force-init boolean?)
-(s/def ::init-optimizer-opts (s/keys :opt-un [::optimizer ::kvstore ::reset-optimizer ::force-init]))
+(s/def ::init-optimizer-opts
+  (s/keys :opt-un [::optimizer ::kvstore ::reset-optimizer ::force-init]))

 (defn init-optimizer
-  " Install and initialize optimizers.
-   - mod Module
-   - options map of
-     - kvstore
-     - reset-optimizer Default `True`, indicating whether we should set
-       `rescaleGrad` & `idx2name` for optimizer according to executorGroup
-     - force-init Default `False`, indicating whether we should force
-       re-initializing the optimizer in the case an optimizer is already installed."
+  "Install and initialize optimizers.
+   `mod`: Module
+   `opts-map` {
+     `kvstore`: string - Default is \"local\"
+     `optimizer`: Optimizer - Default is `sgd`
+     `reset-optimizer`: boolean - Default is `true`
+       Indicating whether we should set `rescaleGrad` & `idx2name` for
+       optimizer according to executorGroup.
+     `force-init`: boolean - Default is `false`
+       Indicating whether we should force re-initializing the optimizer
+       in the case an optimizer is already installed.
+   }
+   Ex:
+     (init-optimizer mod {:optimizer (optimizer/sgd {:learning-rate 0.1})})"
  ([mod {:keys [kvstore optimizer reset-optimizer force-init] :as opts
      :or {kvstore "local"
          optimizer (optimizer/sgd)
@@ -191,8 +231,10 @@

 (defn forward
   "Forward computation.
-   data-batch - input data of form io/data-batch either map or DataBatch
-   is-train - Default is nil, which means `is_train` takes the value of `for_training`."
+   `data-batch`: Either map or DataBatch
+     Input data of form `io/data-batch`.
+   `is-train`: Default is nil, which means `is_train` takes the value
+     of `for_training`."
  ([mod data-batch is-train]
   (util/validate! ::mx-io/data-batch data-batch "Invalid data batch")
   (doto mod
@@ -209,9 +251,9 @@

 (defn backward
   "Backward computation.
-   out-grads - Gradient on the outputs to be propagated back.
-               This parameter is only needed when bind is called
-               on outputs that are not a loss function."
+   `out-grads`: collection of NDArrays
+     Gradient on the outputs to be propagated back. This parameter is only
+     needed when bind is called on outputs that are not a loss function."
  ([mod out-grads]
   (util/validate! ::out-grads out-grads "Invalid out-grads")
   (doto mod
@@ -227,50 +269,48 @@
     (.forwardBackward data-batch)))
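;; A minimal sketch tying the calls above together (assumes `net` is a
;; Symbol, `train-iter` an existing DataIter, and `batch` an `io/data-batch`;
;; `bind` consumes the `io/data-desc` style maps a DataIter provides, and
;; `fit` further below wraps this loop for you):
(comment
  (-> (module net)
      (bind {:data-shapes (mx-io/provide-data-desc train-iter)
             :label-shapes (mx-io/provide-label-desc train-iter)})
      (init-params {})
      (init-optimizer {})
      (forward-backward batch)
      (update)))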
 (defn outputs
-  " Get outputs of the previous forward computation.
-    In the case when data-parallelism is used,
-    the outputs will be collected from multiple devices.
-    The results will look like `[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]`,
-    those `NDArray` might live on different devices."
+  "Get outputs of the previous forward computation.
+   In the case when data-parallelism is used, the outputs will be collected from
+   multiple devices. The results will look like
+   `[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]`.
+   Those `NDArray`s might live on different devices."
  [mod]
  (->> (.getOutputs mod)
       (util/scala-vector->vec)
       (mapv util/scala-vector->vec)))

 (defn update
-  "Update parameters according to the installed optimizer and the gradients computed
-   in the previous forward-backward batch."
+  "Update parameters according to the installed optimizer and the gradients
+   computed in the previous forward-backward batch."
  [mod]
  (doto mod
    (.update)))

 (defn outputs-merged
-  " Get outputs of the previous forward computation.
-    return In the case when data-parallelism is used,
-    the outputs will be merged from multiple devices,
-    as they look like from a single executor.
-    The results will look like `[out1, out2]`"
+  "Get outputs of the previous forward computation.
+   In the case when data-parallelism is used, the outputs will be merged from
+   multiple devices, as if they came from a single executor.
+   The results will look like `[out1, out2]`."
  [mod]
  (->> (.getOutputsMerged mod)
       (util/scala-vector->vec)))

 (defn input-grads
-  " Get the gradients to the inputs, computed in the previous backward computation.
-    In the case when data-parallelism is used,
-    the outputs will be collected from multiple devices.
-    The results will look like `[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]`
-    those `NDArray` might live on different devices."
+  "Get the gradients to the inputs, computed in the previous backward computation.
+   In the case when data-parallelism is used, the outputs will be collected from
+   multiple devices. The results will look like
+   `[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]`.
+   Those `NDArray`s might live on different devices."
  [mod]
  (->> (.getInputGrads mod)
       (util/scala-vector->vec)
       (mapv util/scala-vector->vec)))

 (defn input-grads-merged
-  " Get the gradients to the inputs, computed in the previous backward computation.
-    return In the case when data-parallelism is used,
-    the outputs will be merged from multiple devices,
-    as they look like from a single executor.
-    The results will look like `[grad1, grad2]`"
+  "Get the gradients to the inputs, computed in the previous backward computation.
+   In the case when data-parallelism is used, the outputs will be merged from
+   multiple devices, as if they came from a single executor.
+   The results will look like `[grad1, grad2]`."
  [mod]
  (->> (.getInputGradsMerged mod)
       (util/scala-vector->vec)))
@@ -278,16 +318,25 @@
 (s/def ::prefix string?)
 (s/def ::epoch int?)
 (s/def ::save-opt-states boolean?)
-(s/def ::save-checkpoint-opts (s/keys :req-un [::prefix ::epoch] :opt-un [::save-opt-states ::save-checkpoint]))
+(s/def ::save-checkpoint-opts
+  (s/keys :req-un [::prefix ::epoch]
+          :opt-un [::save-opt-states ::save-checkpoint]))

 (defn save-checkpoint
-  " Save current progress to checkpoint.
-    Use mx.callback.module_checkpoint as epoch_end_callback to save during training.
-    - mod Module
-    - opt-map with
-      :prefix The file prefix to checkpoint to
-      :epoch The current epoch number
-      :save-opt-states Whether to save optimizer states for continue training "
+  "Save current progress to checkpoint.
+   Use mx.callback.module_checkpoint as epoch_end_callback to save during
+   training.
+   `mod`: Module
+   `opts-map` {
+     `prefix`: string
+       The file prefix to checkpoint to
+     `epoch`: int
+       The current epoch number
+     `save-opt-states`: boolean - Default is `false`
+       Whether to save optimizer states to continue training
+   }
+   Ex:
+     (save-checkpoint mod {:prefix \"saved_model\" :epoch 0 :save-opt-states true})"
  ([mod {:keys [prefix epoch save-opt-states] :as opts
       :or {save-opt-states false}}]
   (util/validate! ::save-checkpoint-opts opts "Invalid save checkpoint opts")
@@ -303,24 +352,34 @@
 (s/def ::contexts (s/coll-of ::context :kind vector?))
 (s/def ::workload-list (s/coll-of number? :kind vector?))
 (s/def ::fixed-params-names (s/coll-of string? :kind vector?))
-(s/def ::load-checkpoint-opts (s/keys :req-un [::prefix ::epoch]
-                                      :opt-un [::load-optimizer-states ::data-names ::label-names
-                                               ::contexts ::workload-list ::fixed-param-names]))
+(s/def ::load-checkpoint-opts
+  (s/keys :req-un [::prefix ::epoch]
+          :opt-un [::load-optimizer-states ::data-names ::label-names
+                   ::contexts ::workload-list ::fixed-param-names]))

 (defn load-checkpoint
   "Create a model from previously saved checkpoint.
-   - opts map of
-     - prefix Path prefix of saved model files. You should have prefix-symbol.json,
-              prefix-xxxx.params, and optionally prefix-xxxx.states,
-              where xxxx is the epoch number.
-     - epoch Epoch to load.
-     - load-optimizer-states Whether to load optimizer states.
-              Checkpoint needs to have been made with save-optimizer-states=True
-     - dataNames Input data names.
-     - labelNames Input label names
-     - contexts Default is cpu().
-     - workload-list Default nil, indicating uniform workload.
-     - fixed-param-names Default nil, indicating no network parameters are fixed."
+   `opts-map` {
+     `prefix`: string
+       Path prefix of saved model files. You should have prefix-symbol.json,
+       prefix-xxxx.params, and optionally prefix-xxxx.states, where xxxx is
+       the epoch number.
+     `epoch`: int
+       Epoch to load.
+     `load-optimizer-states`: boolean - Default is false
+       Whether to load optimizer states. Checkpoint needs to have been made
+       with `save-optimizer-states` = `true`.
+     `data-names`: vector of strings - Default is [\"data\"]
+       Input data names.
+     `label-names`: vector of strings - Default is [\"softmax_label\"]
+       Input label names.
+     `contexts`: Context - Default is `context/cpu`
+     `workload-list`: Default nil
+       Indicating uniform workload.
+     `fixed-param-names`: Default nil
+       Indicating no network parameters are fixed.
+   }
+   Ex:
+     (load-checkpoint {:prefix \"my-model\" :epoch 1 :load-optimizer-states true})"
  ([{:keys [prefix epoch load-optimizer-states data-names label-names contexts
          workload-list fixed-param-names] :as opts
    :or {load-optimizer-states false
@@ -358,10 +417,10 @@
   (util/scala-map->map (.auxParams mod)))

 (defn reshape
-  " Reshapes the module for new input shapes.
-   - mod module
-   - data-shapes Typically is `(provide-data data-iter)
-   - param label-shapes Typically is `(provide-label data-tier)`. "
+  "Reshapes the module for new input shapes.
+   `mod`: Module
+   `data-shapes`: Typically is `(provide-data data-iter)`
+   `label-shapes`: Typically is `(provide-label data-iter)`"
  ([mod data-shapes label-shapes]
   (util/validate! ::data-shapes data-shapes "Invalid data-shapes")
   (util/validate! (s/nilable ::label-shapes) label-shapes "Invalid label-shapes")
@@ -376,28 +435,35 @@
  ([mod data-shapes]
   (reshape mod data-shapes nil)))

-(s/def ::set-param-opts (s/keys :opt-un [::arg-params ::aux-params ::allow-missing ::force-init ::allow-extra]))
+(s/def ::set-param-opts
+  (s/keys :opt-un [::arg-params ::aux-params ::allow-missing
+                   ::force-init ::allow-extra]))

 (defn get-params [mod]
  (.getParams mod))

 (defn set-params
-  " Assign parameter and aux state values.
-    - mod module
-    - arg-params : map
-                   map of name to value (`NDArray`) mapping.
-    - aux-params : map
-                   map of name to value (`NDArray`) mapping.
-    - allow-missing : bool
-                      If true, params could contain missing values, and the initializer will be
-                      called to fill those missing params.
-    - force-init : bool
-                   If true, will force re-initialize even if already initialized.
-    - allow-extra : bool
-                    Whether allow extra parameters that are not needed by symbol.
-                    If this is True, no error will be thrown when arg-params or aux-params
-                    contain extra parameters that is not needed by the executor."
-  [mod {:keys [arg-params aux-params allow-missing force-init allow-extra] :as opts
+  "Assign parameters and aux state values.
+   `mod`: Module
+   `opts-map` {
+     `arg-params`: map - map of name to value (`NDArray`) mapping.
+     `aux-params`: map - map of name to value (`NDArray`) mapping.
+     `allow-missing`: boolean
+       If true, params could contain missing values, and the initializer will
+       be called to fill those missing params.
+     `force-init`: boolean - Default is `false`
+       If true, will force re-initialize even if already initialized.
+     `allow-extra`: boolean - Default is `false`
+       Whether to allow extra parameters that are not needed by symbol. If this
+       is `true`, no error will be thrown when arg-params or aux-params
+       contain extra parameters that are not needed by the executor.
+   }
+   Ex:
+     (set-params mod
+       {:arg-params {\"fc_0_weight\" (ndarray/array [0.15 0.2 0.25 0.3] [2 2])}
+        :allow-missing true})"
+  [mod {:keys [arg-params aux-params allow-missing force-init
+               allow-extra] :as opts
      :or {allow-missing false force-init true allow-extra false}}]
  (util/validate! ::set-param-opts opts "Invalid set-params")
  (doto mod
@@ -409,33 +475,32 @@
    allow-extra)))

 (defn install-monitor
-  "Install monitor on all executors"
+  "Install monitor on all executors."
  [mod monitor]
  (doto mod
    (.installMonitor monitor)))

 (defn borrow-optimizer
-  "Borrow optimizer from a shared module. Used in bucketing, where exactly the same
-  optimizer (esp. kvstore) is used.
-   - mod module
-   - shared-module"
+  "Borrow optimizer from a shared module. Used in bucketing, where exactly the
+   same optimizer (esp. kvstore) is used.
+   `mod`: Module
+   `shared-module`"
  [mod shared-module]
  (doto mod
    (.borrowOptimizer shared-module)))

 (defn save-optimizer-states
-  "Save optimizer (updater) state to file
-   - mod module
-   - fname Path to output states file."
+  "Save optimizer (updater) state to file.
+   `mod`: Module
+   `fname`: string - Path to output states file."
  [mod fname]
  (doto mod
    (.saveOptimizerStates mod fname)))

 (defn load-optimizer-states
-  "Load optimizer (updater) state from file
-   - mod module
-   - fname Path to input states file.
-  "
+  "Load optimizer (updater) state from file.
+   `mod`: Module
+   `fname`: string - Path to input states file."
  [mod fname]
  (doto mod
    (.loadOptimzerStates fname)))
@@ -444,10 +509,13 @@
 (s/def ::labels (s/coll-of ::ndarray :kind vector?))

 (defn update-metric
-  "Evaluate and accumulate evaluation metric on outputs of the last forward computation.
-   - mod module
-   - eval-metric
-   - labels"
+  "Evaluate and accumulate evaluation metric on outputs of the last forward
+   computation.
+   `mod`: module
+   `eval-metric`: EvalMetric
+   `labels`: collection of NDArrays
+   Ex:
+     (update-metric mod (eval-metric/mse) labels)"
  [mod eval-metric labels]
  (util/validate! ::eval-metric eval-metric "Invalid eval metric")
  (util/validate! ::labels labels "Invalid labels")
@@ -458,18 +526,48 @@
 (s/def ::validation-metric ::eval-metric)
 (s/def ::monitor #(instance? Monitor %))
(s/def ::batch-end-callback #(instance? Callback$Speedometer %))
-(s/def ::fit-params-opts (s/keys :opt-un [::eval-metric ::kvstore ::optimizer ::initializer
-                                          ::arg-params ::aux-params ::allow-missing ::force-rebind
-                                          ::force-init ::begin-epoch ::validation-metric ::monitor
-                                          ::batch-end-callback]))
+(s/def ::fit-params-opts
+  (s/keys :opt-un [::eval-metric ::kvstore ::optimizer ::initializer
+                   ::arg-params ::aux-params ::allow-missing ::force-rebind
+                   ::force-init ::begin-epoch ::validation-metric ::monitor
+                   ::batch-end-callback]))

 ;; callbacks are not supported for now
 (defn fit-params
-  "Fit Params"
+  "Initialize FitParams with provided parameters.
+   `eval-metric`: EvalMetric - Default is `accuracy`
+   `kvstore`: String - Default is \"local\"
+   `optimizer`: Optimizer - Default is `sgd`
+   `initializer`: Initializer - Default is `uniform`
+     Called to initialize parameters if needed.
+   `arg-params`: map
+     If not nil, should be a map of existing `arg-params`. Initialization
+     will be copied from that.
+   `aux-params`: map
+     If not nil, should be a map of existing `aux-params`. Initialization
+     will be copied from that.
+   `allow-missing`: boolean - Default is `false`
+     If `true`, params could contain missing values, and the initializer will
+     be called to fill those missing params.
+   `force-rebind`: boolean - Default is `false`
+     This function does nothing if the executors are already bound. But with
+     this `true`, the executors will be forced to rebind.
+   `force-init`: boolean - Default is `false`
+     If `true`, will force re-initialize even if already initialized.
+   `begin-epoch`: int - Default is 0
+   `validation-metric`: EvalMetric
+   `monitor`: Monitor
+   Ex:
+     (fit-params {:force-init true :force-rebind true :allow-missing true})
+     (fit-params
+       {:batch-end-callback (callback/speedometer batch-size 100)
+        :initializer (initializer/xavier)
+        :optimizer (optimizer/sgd {:learning-rate 0.01})
+        :eval-metric (eval-metric/mse)})"
  ([{:keys [eval-metric kvstore optimizer initializer arg-params aux-params
-           allow-missing force-rebind force-init begin-epoch validation-metric monitor
-           batch-end-callback] :as opts
+           allow-missing force-rebind force-init begin-epoch
+           validation-metric monitor batch-end-callback] :as opts
    :or {eval-metric (eval-metric/accuracy)
        kvstore "local"
        optimizer (optimizer/sgd)
@@ -500,25 +598,36 @@
 (s/def ::ndarray-iter #(instance? NDArrayIter %))
 (s/def ::train-data (s/or :mx-iter ::mx-data-iter :ndarry-iter ::ndarray-iter))
 (s/def ::eval-data ::train-data)
-(s/def ::num-epoch int?)
+(s/def ::num-epoch (s/and int? pos?))
 (s/def ::fit-params #(instance? FitParams %))
-(s/def ::fit-options (s/keys :req-un [::train-data] :opt-un [::eval-data ::num-epoch ::fit-params]))
+(s/def ::fit-options
+  (s/keys :req-un [::train-data]
+          :opt-un [::eval-data ::num-epoch ::fit-params]))

 ;;; High Level API
 (defn score
-  " Run prediction on `eval-data` and evaluate the performance according to `eval-metric`.
-    - mod module
-    - option map with
-      :eval-data : DataIter
-      :eval-metric : EvalMetric
-      :num-batch Number of batches to run. Default is `Integer.MAX_VALUE`,
-                 indicating run until the `DataIter` finishes.
-      :batch-end-callback -not supported yet
-      :reset Default `True`,
-             indicating whether we should reset `eval-data` before starting evaluating.
-      :epoch Default 0. For compatibility, this will be passed to callbacks (if any).
-             During training, this will correspond to the training epoch number."
+  "Run prediction on `eval-data` and evaluate the performance according to
+   `eval-metric`.
+   `mod`: module
+   `opts-map` {
+     `eval-data`: DataIter
+     `eval-metric`: EvalMetric
+     `num-batch`: int - Default is `Integer.MAX_VALUE`
+       Number of batches to run. Indicating run until the `DataIter`
+       finishes.
+     `batch-end-callback`: not supported yet.
+     `reset`: boolean - Default is `true`
+       Indicating whether we should reset `eval-data` before starting
+       evaluating.
+     `epoch`: int - Default is 0
+       For compatibility, this will be passed to callbacks (if any). During
+       training, this will correspond to the training epoch number.
+   }
+   Ex:
+     (score mod {:eval-data data-iter :eval-metric (eval-metric/accuracy)})
+     (score mod {:eval-data data-iter
+                 :eval-metric (eval-metric/mse) :num-batch 10})"
  [mod {:keys [eval-data eval-metric num-batch reset epoch] :as opts
      :or {num-batch Integer/MAX_VALUE
          reset true
@@ -537,15 +646,30 @@

 (defn fit
   "Train the module parameters.
-   - mod module
-   - train-data (data-iterator)
-   - eval-data (data-iterator)If not nil, will be used as validation set and evaluate
-                the performance after each epoch.
-   - num-epoch Number of epochs to run training.
-   - f-params Extra parameters for training (See fit-params)."
+   `mod`: Module
+   `opts-map` {
+     `train-data`: DataIter
+     `eval-data`: DataIter
+       If not nil, will be used as validation set and evaluate the
+       performance after each epoch.
+     `num-epoch`: int
+       Number of epochs to run training.
+     `fit-params`: FitParams
+       Extra parameters for training (see fit-params).
+   }
+   Ex:
+     (fit mod {:train-data train-iter :eval-data test-iter :num-epoch 100})
+     (fit mod {:train-data train-iter
+               :eval-data test-iter
+               :num-epoch 5
+               :fit-params
+               (fit-params {:batch-end-callback (callback/speedometer 128 100)
+                            :initializer (initializer/xavier)
+                            :optimizer (optimizer/sgd {:learning-rate 0.01})
+                            :eval-metric (eval-metric/mse)})})"
  [mod {:keys [train-data eval-data num-epoch fit-params] :as opts
-      `:or {num-epoch 1
-            fit-params (new FitParams)}}]
+      :or {num-epoch 1
+           fit-params (new FitParams)}}]
  (util/validate! ::fit-options opts "Invalid options for fit")
  (doto mod
    (.fit
@@ -557,12 +681,13 @@
 (s/def ::eval-data ::train-data)
 (s/def ::num-batch integer?)
 (s/def ::reset boolean?)
-(s/def ::predict-opts (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset]))
+(s/def ::predict-opts
+  (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset]))

 (defn predict-batch
-  "Run the predication on a data batch
-   - mod module
-   - data-batch data-batch"
+  "Run the prediction on a data batch.
+   `mod`: Module
+   `data-batch`: data-batch"
  [mod data-batch]
  (util/validate! ::mx-io/data-batch data-batch "Invalid data batch")
  (util/coerce-return (.predict mod (if (map? data-batch)
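;; Putting the high-level API together -- a minimal end-to-end sketch built
;; from the Ex: forms above, assuming `net`, `train-iter` and `test-iter`
;; exist; `predict` (below) then returns the concatenated output NDArrays:
(comment
  (-> (module net)
      (fit {:train-data train-iter :eval-data test-iter :num-epoch 10})
      (predict {:eval-data test-iter})))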
@@ -571,41 +696,60 @@
 (defn predict
   "Run prediction and collect the outputs.
-   - mod module
-   - option map with
-     - :eval-data
-     - :num-batch Default is -1, indicating running all the batches in the data iterator.
-     - :reset Default is `True`, indicating whether we should reset the data iter before start
-       doing prediction.
-   The return value will be a vector of NDArrays `[out1, out2, out3]`.
-   Where each element is concatenation of the outputs for all the mini-batches."
+   `mod`: Module
+   `opts-map` {
+     `eval-data`: DataIter
+     `num-batch`: int - Default is `-1`
+       Indicating running all the batches in the data iterator.
+     `reset`: boolean - Default is `true`
+       Indicating whether we should reset the data iter before starting
+       prediction.
+   }
+   returns: vector of NDArrays `[out1, out2, out3]` where each element is the
+   concatenation of the outputs for all the mini-batches.
+   Ex:
+     (predict mod {:eval-data test-iter})
+     (predict mod {:eval-data test-iter :num-batch 10 :reset false})"
  [mod {:keys [eval-data num-batch reset] :as opts
      :or {num-batch -1
          reset true}}]
  (util/validate! ::predict-opts opts "Invalid opts for predict")
  (util/scala-vector->vec (.predict mod eval-data (int num-batch) reset)))

-(s/def ::predict-every-batch-opts (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset]))
+(s/def ::predict-every-batch-opts
+  (s/keys :req-un [::eval-data] :opt-un [::num-batch ::reset]))

 (defn predict-every-batch
-  " Run prediction and collect the outputs.
-   - module
-   - option map with
-     :eval-data
-     :num-batch Default is -1, indicating running all the batches in the data iterator.
-     :reset Default is `True`, indicating whether we should reset the data iter before start
-       doing prediction.
-    The return value will be a nested list like
-    [[out1_batch1, out2_batch1, ...], [out1_batch2, out2_batch2, ...]]`
-    This mode is useful because in some cases (e.g. bucketing),
-    the module does not necessarily produce the same number of outputs."
+  "Run prediction and collect the outputs.
+   `mod`: Module
+   `opts-map` {
+     `eval-data`: DataIter
+     `num-batch`: int - Default is `-1`
+       Indicating running all the batches in the data iterator.
+     `reset`: boolean - Default is `true`
+       Indicating whether we should reset the data iter before starting
+       prediction.
+   }
+   returns: nested list like this
+   `[[out1_batch1, out2_batch1, ...], [out1_batch2, out2_batch2, ...]]`
+
+   Note: This mode is useful because in some cases (e.g. bucketing), the module
+   does not necessarily produce the same number of outputs.
+   Ex:
+     (predict-every-batch mod {:eval-data test-iter})"
  [mod {:keys [eval-data num-batch reset] :as opts
      :or {num-batch -1
          reset true}}]
-  (util/validate! ::predict-every-batch-opts opts "Invalid opts for predict-every-batch")
-  (mapv util/scala-vector->vec (util/scala-vector->vec (.predictEveryBatch mod eval-data (int num-batch) reset))))
-
-(s/def ::score-opts (s/keys :req-un [::eval-data ::eval-metric] :opt-un [::num-batch ::reset ::epoch]))
+  (util/validate! ::predict-every-batch-opts
+                  opts
+                  "Invalid opts for predict-every-batch")
+  (mapv util/scala-vector->vec
+        (util/scala-vector->vec
+         (.predictEveryBatch mod eval-data (int num-batch) reset))))
+
+(s/def ::score-opts
+  (s/keys :req-un [::eval-data ::eval-metric]
+          :opt-un [::num-batch ::reset ::epoch]))

 (defn exec-group [mod]
  (.execGroup mod))
diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj
new file mode 100644
index 000000000000..e222775c60f6
--- /dev/null
+++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_api.clj
@@ -0,0 +1,32 @@
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements. See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License. You may obtain a copy of the License at
+;;
+;; http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.ndarray-api + "Experimental NDArray API" + (:refer-clojure + :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [clojure.reflect :as r] + [t6.from-scala.core :refer [$] :as $]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/ndarray_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj new file mode 100644 index 000000000000..1f45b6d4d646 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/ndarray_random_api.clj @@ -0,0 +1,28 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.ndarray-random-api + "Experimental NDArray Random API" + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [clojure.reflect :as r] + [t6.from-scala.core :refer [$] :as $]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/ndarray_random_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj new file mode 100644 index 000000000000..69cc8136d500 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_api.clj @@ -0,0 +1,32 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. 
+;; + +(ns org.apache.clojure-mxnet.symbol-api + "Experimental Symbol API" + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.executor :as ex] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [t6.from-scala.core :refer [$] :as $] + [org.apache.clojure-mxnet.ndarray :as ndarray]) + (:import (org.apache.mxnet SymbolAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/symbol_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj new file mode 100644 index 000000000000..76f6fdefc334 --- /dev/null +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/symbol_random_api.clj @@ -0,0 +1,32 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.symbol-random-api + "Experimental Symbol Random API" + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as mx-context] + [org.apache.clojure-mxnet.executor :as ex] + [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util] + [t6.from-scala.core :refer [$] :as $] + [org.apache.clojure-mxnet.ndarray :as ndarray]) + (:import (org.apache.mxnet SymbolAPI))) + +;; loads the generated functions into the namespace +(do (clojure.core/load "gen/symbol_random_api")) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj index 89ac1cd66a57..9dc6c8f88ddd 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj @@ -35,7 +35,6 @@ "int<>" "vec-of-ints" "float<>" "vec-of-floats" "byte<>" "byte-array" - "java.lang.String<>" "vec-or-strings" "org.apache.mxnet.NDArray" "ndarray" "org.apache.mxnet.Symbol" "sym" "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE" "double-or-float"}) @@ -49,7 +48,7 @@ "int<>" "vec-of-ints" "float<>" "vec-of-floats" "byte<>" "byte-array" - "java.lang.String<>" "vec-or-strings" + "java.lang.String<>" "vec-of-strings" "org.apache.mxnet.Symbol" "sym" "java.lang.Object" "object"}) @@ -152,9 +151,12 @@ (and (get targets "scala.collection.Seq") (instance? 
org.apache.mxnet.Symbol param)) ($/immutable-list param) (and (get targets "scala.collection.Seq") (and (or (vector? param) (seq? param)) (empty? param))) (empty-list) (and (get targets "scala.collection.Seq") (or (vector? param) (seq? param))) (apply $/immutable-list param) + (and (get targets "org.apache.mxnet.Shape") (or (vector? param) (seq? param) (empty? param))) (mx-shape/->shape param) (and (get targets "int<>") (vector? param)) (int-array param) (and (get targets "float<>") (vector? param)) (float-array param) (and (get targets "java.lang.String<>") (vector? param)) (into-array param) + (and (get targets "org.apache.mxnet.NDArray<>") (vector? param)) (into-array param) + (and (get targets "org.apache.mxnet.Symbol<>") (vector? param)) (into-array param) (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (instance? Float param)) (primitives/mx-float param) (and (get targets "org.apache.mxnet.MX_PRIMITIVES$MX_PRIMITIVE_TYPE") (number? param)) (primitives/mx-double param) :else param)) @@ -248,7 +250,7 @@ shape))) (defn map->scala-tuple-seq - "* Convert a map to a scala-Seq of scala-Tubple. + "* Convert a map to a scala-Seq of scala-Tuple. * Should also work if a seq of seq of 2 things passed. * Otherwise passed through unchanged." [map-or-tuple-seq] diff --git a/contrib/clojure-package/test/dev/generator_test.clj b/contrib/clojure-package/test/dev/generator_test.clj index 05b4a741bc7c..acc81afcbcd5 100644 --- a/contrib/clojure-package/test/dev/generator_test.clj +++ b/contrib/clojure-package/test/dev/generator_test.clj @@ -27,6 +27,20 @@ (is (= "foo-bar" (gen/clojure-case "Foo_Bar"))) (is (= "div+" (gen/clojure-case "/+")))) +(deftest fn-name->random-fn-name + (is (= "poisson" (gen/fn-name->random-fn-name "-random-poisson"))) + (is (= "poisson-like" (gen/fn-name->random-fn-name "-sample-poisson")))) + +(deftest remove-prefix + (is (= "randint" (gen/remove-prefix "-random-" "-random-randint"))) + (is (= "exponential" (gen/remove-prefix "-sample-" "-sample-exponential")))) + +(deftest in-namespace-random? + (is (gen/in-namespace-random? "random_randint")) + (is (gen/in-namespace-random? "sample_poisson")) + (is (not (gen/in-namespace-random? "rnn"))) + (is (not (gen/in-namespace-random? "activation")))) + (defn ndarray-reflect-info [name] (->> gen/ndarray-public-no-default (filter #(= name (str (:name %)))) @@ -50,6 +64,127 @@ (is (= transformed-params (gen/symbol-transform-param-name (:parameter-types (symbol-reflect-info "floor"))))))) +(deftest test-gen-op-info + (testing "activation" + (let [activation-info (gen/gen-op-info "Activation")] + (is (= "activation" (:fn-name activation-info))) + (is (string? (:fn-description activation-info))) + (is (= 2 (-> activation-info :args count))) + (is (= "" (:key-var-num-args activation-info))) + + (is (= "data" (-> activation-info :args first :name))) + (is (= "NDArray-or-Symbol" (-> activation-info :args first :type))) + (is (false? (-> activation-info :args first :optional?))) + (is (nil? (-> activation-info :args first :default))) + (is (string? (-> activation-info :args first :description))) + + (is (= "act-type" (-> activation-info :args second :name))) + (is (= "'relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'" (-> activation-info :args second :type))) + (is (false? (-> activation-info :args second :optional?))) + (is (nil? (-> activation-info :args second :default))) + (is (string? 
(-> activation-info :args second :description))))) + + (testing "argmin" + (let [argmin-info (gen/gen-op-info "argmin")] + (is (= "argmin" (:fn-name argmin-info))) + (is (= 3 (-> argmin-info :args count))) + + (is (= "data" (-> argmin-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol" (-> argmin-info :args (nth 0) :type))) + (is (false? (-> argmin-info :args (nth 0) :optional?))) + + (is (= "axis" (-> argmin-info :args (nth 1) :name))) + (is (= "int or None" (-> argmin-info :args (nth 1) :type))) + (is (= "'None'" (-> argmin-info :args (nth 1) :default))) + (is (true? (-> argmin-info :args (nth 1) :optional?))) + + (is (= "keepdims" (-> argmin-info :args (nth 2) :name))) + (is (= "boolean" (-> argmin-info :args (nth 2) :type))) + (is (= "0" (-> argmin-info :args (nth 2) :default))) + (is (true? (-> argmin-info :args (nth 2) :optional?))))) + + (testing "concat" + (let [concat-info (gen/gen-op-info "Concat")] + (is (= "concat" (:fn-name concat-info))) + (is (= 3 (-> concat-info :args count))) + (is (= "num-args" (:key-var-num-args concat-info))) + + (is (= "data" (-> concat-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol[]" (-> concat-info :args (nth 0) :type))) + (is (false? (-> concat-info :args (nth 0) :optional?))) + + (is (= "num-args" (-> concat-info :args (nth 1) :name))) + (is (= "int" (-> concat-info :args (nth 1) :type))) + (is (false? (-> concat-info :args (nth 1) :optional?))) + + (is (= "dim" (-> concat-info :args (nth 2) :name))) + (is (= "int" (-> concat-info :args (nth 2) :type))) + (is (= "'1'" (-> concat-info :args (nth 2) :default))) + (is (true? (-> concat-info :args (nth 2) :optional?))))) + + (testing "convolution" + (let [convolution-info (gen/gen-op-info "Convolution")] + + (is (= "convolution" (:fn-name convolution-info))) + (is (= 14 (-> convolution-info :args count))) + (is (= "" (:key-var-num-args convolution-info))) + + (is (= "data" (-> convolution-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol" (-> convolution-info :args (nth 0) :type))) + (is (false? (-> convolution-info :args (nth 0) :optional?))) + + (is (= "weight" (-> convolution-info :args (nth 1) :name))) + (is (= "NDArray-or-Symbol" (-> convolution-info :args (nth 1) :type))) + (is (false? (-> convolution-info :args (nth 1) :optional?))) + + (is (= "kernel" (-> convolution-info :args (nth 3) :name))) + (is (= "Shape" (-> convolution-info :args (nth 3) :type))) + (is (= "(tuple)" (-> convolution-info :args (nth 3) :spec))) + (is (false? (-> convolution-info :args (nth 3) :optional?))) + + (is (= "stride" (-> convolution-info :args (nth 4) :name))) + (is (= "Shape" (-> convolution-info :args (nth 4) :type))) + (is (= "(tuple)" (-> convolution-info :args (nth 4) :spec))) + (is (= "[]" (-> convolution-info :args (nth 4) :default))) + (is (true? (-> convolution-info :args (nth 4) :optional?))) + + (is (= "num-filter" (-> convolution-info :args (nth 7) :name))) + (is (= "int" (-> convolution-info :args (nth 7) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 7) :spec))) + (is (false? (-> convolution-info :args (nth 7) :optional?))) + + (is (= "num-group" (-> convolution-info :args (nth 8) :name))) + (is (= "int" (-> convolution-info :args (nth 8) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 8) :spec))) + (is (= "1" (-> convolution-info :args (nth 8) :default))) + (is (true? 
(-> convolution-info :args (nth 8) :optional?))) + + (is (= "workspace" (-> convolution-info :args (nth 9) :name))) + (is (= "long" (-> convolution-info :args (nth 9) :type))) + (is (= "(non-negative)" (-> convolution-info :args (nth 9) :spec))) + (is (= "1024" (-> convolution-info :args (nth 9) :default))) + (is (true? (-> convolution-info :args (nth 9) :optional?))) + + (is (= "no-bias" (-> convolution-info :args (nth 10) :name))) + (is (= "boolean" (-> convolution-info :args (nth 10) :type))) + (is (= "0" (-> convolution-info :args (nth 10) :default))) + (is (true? (-> convolution-info :args (nth 10) :optional?))) + + (is (= "layout" (-> convolution-info :args (nth 13) :name))) + (is (= "None, 'NCDHW', 'NCHW', 'NCW', 'NDHWC', 'NHWC'" (-> convolution-info :args (nth 13) :type))) + (is (= "'None'" (-> convolution-info :args (nth 13) :default))) + (is (true? (-> convolution-info :args (nth 13) :optional?))))) + + (testing "element wise sum" + (let [element-wise-sum-info (gen/gen-op-info "ElementWiseSum")] + (is (= "add-n" (:fn-name element-wise-sum-info))) + (is (= 1 (-> element-wise-sum-info :args count))) + (is (= "num-args" (:key-var-num-args element-wise-sum-info))) + + (is (= "args" (-> element-wise-sum-info :args (nth 0) :name))) + (is (= "NDArray-or-Symbol[]" (-> element-wise-sum-info :args (nth 0) :type))) + (is (false? (-> element-wise-sum-info :args (nth 0) :optional?)))))) + (deftest test-ndarray-transform-param-name (let [params ["scala.collection.immutable.Map" "scala.collection.Seq"] @@ -68,7 +203,10 @@ (deftest test-rename-duplicate-params (is (= ["foo" "bar" "baz"] (gen/rename-duplicate-params ["foo" "bar" "baz"]))) - (is (= ["foo" "bar" "bar-1"] (gen/rename-duplicate-params ["foo" "bar" "bar"])))) + (is (= ["foo" "bar" "bar-1"] (gen/rename-duplicate-params ["foo" "bar" "bar"]))) + (is (= ["foo" "bar" "bar-1" "foo-1"] (gen/rename-duplicate-params ["foo" "bar" "bar" "foo"]))) + (is (= ["foo" "bar" "bar-1" "bar-2"] (gen/rename-duplicate-params ["foo" "bar" "bar" "bar"]))) + (is (= ["foo" "bar" "bar-1" "bar-2" "foo-1" "baz"] (gen/rename-duplicate-params ["foo" "bar" "bar" "bar" "foo" "baz"])))) (deftest test-is-symbol-hand-gen? (is (not (false? (gen/is-symbol-hand-gen? 
(symbol-reflect-info "max"))))) @@ -191,7 +329,28 @@ (gen/gen-ndarray-function-arity op-name op-values))))) (deftest test-write-to-file - (testing "symbol" + (testing "symbol-api" + (let [fname "test/test-symbol-api.clj" + fns (gen/all-symbol-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/symbol-api-gen-ns false) + fname) + good-contents (slurp "test/good-test-symbol-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + + (testing "symbol-random-api" + (let [fname "test/test-symbol-random-api.clj" + fns (gen/all-symbol-random-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/symbol-api-gen-ns true) + fname) + good-contents (slurp "test/good-test-symbol-random-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + + + (testing "symbol" (let [fname "test/test-symbol.clj" _ (gen/write-to-file [(first gen/all-symbol-functions)] gen/symbol-gen-ns @@ -200,6 +359,26 @@ contents (slurp fname)] (is (= good-contents contents)))) + (testing "ndarray-api" + (let [fname "test/test-ndarray-api.clj" + fns (gen/all-ndarray-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/ndarray-api-gen-ns false) + fname) + good-contents (slurp "test/good-test-ndarray-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + + (testing "ndarray-random-api" + (let [fname "test/test-ndarray-random-api.clj" + fns (gen/all-ndarray-random-api-functions gen/op-names) + _ (gen/write-to-file [(first fns) (second fns)] + (gen/ndarray-api-gen-ns true) + fname) + good-contents (slurp "test/good-test-ndarray-random-api.clj") + contents (slurp fname)] + (is (= good-contents contents)))) + (testing "ndarray" (let [fname "test/test-ndarray.clj" _ (gen/write-to-file [(first gen/all-ndarray-functions)] diff --git a/contrib/clojure-package/test/good-test-ndarray-api.clj b/contrib/clojure-package/test/good-test-ndarray-api.clj new file mode 100644 index 000000000000..7554089d0ba0 --- /dev/null +++ b/contrib/clojure-package/test/good-test-ndarray-api.clj @@ -0,0 +1,170 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.ndarray-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + activation + "Applies an activation function element-wise to the input. 
+ + The following activation functions are supported: + + - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` + - `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}` + - `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` + - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` + - `softsign`: :math:`y = \\frac{x}{1 + abs(x)}` + + + + Defined in src/operator/nn/activation.cc:L167 + + `data`: The input array. + `act-type`: Activation function to be applied. + `out`: Output array. (optional)" + ([data act-type] (activation {:data data, :act-type act-type})) + ([{:keys [data act-type out], :or {out nil}, :as opts}] + (util/coerce-return + (NDArrayAPI/Activation data act-type (util/->option out))))) + +(defn + batch-norm + "Batch normalization. + + Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as + well as offset ``beta``. + + Assume the input has more than one dimension and we normalize along axis 1. + We first compute the mean and variance along this axis: + + .. math:: + + data\\_mean[i] = mean(data[:,i,:,...]) \\\\ + data\\_var[i] = var(data[:,i,:,...]) + + Then compute the normalized output, which has the same shape as input, as following: + + .. math:: + + out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i] + + Both *mean* and *var* returns a scalar by treating the input as a vector. + + Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` + have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and + the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these + two outputs are blocked. + + Besides the inputs and the outputs, this operator accepts two auxiliary + states, ``moving_mean`` and ``moving_var``, which are *k*-length + vectors. They are global statistics for the whole dataset, which are updated + by:: + + moving_mean = moving_mean * momentum + data_mean * (1 - momentum) + moving_var = moving_var * momentum + data_var * (1 - momentum) + + If ``use_global_stats`` is set to be true, then ``moving_mean`` and + ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute + the output. It is often used during inference. + + The parameter ``axis`` specifies which axis of the input shape denotes + the 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel + axis to be the last item in the input shape. + + Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true, + then set ``gamma`` to 1 and its gradient to 0. + + .. Note:: + When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False, + the sparse tensors will fallback. + + + + Defined in src/operator/nn/batch_norm.cc:L574 + + `data`: Input data to batch normalization + `gamma`: gamma array + `beta`: beta array + `moving-mean`: running mean of input + `moving-var`: running variance of input + `eps`: Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional) + `momentum`: Momentum for moving average (optional) + `fix-gamma`: Fix gamma while training (optional) + `use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. 
(optional) + `output-mean-var`: Output the mean and inverse std (optional) + `axis`: Specify which shape axis the channel is specified (optional) + `cudnn-off`: Do not select CUDNN operator, if available (optional) + `out`: Output array. (optional)" + ([data gamma beta moving-mean moving-var] + (batch-norm + {:data data, + :gamma gamma, + :beta beta, + :moving-mean moving-mean, + :moving-var moving-var})) + ([{:keys + [data + gamma + beta + moving-mean + moving-var + eps + momentum + fix-gamma + use-global-stats + output-mean-var + axis + cudnn-off + out], + :or + {eps nil, + momentum nil, + fix-gamma nil, + use-global-stats nil, + output-mean-var nil, + axis nil, + cudnn-off nil, + out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/BatchNorm + data + gamma + beta + moving-mean + moving-var + (util/->option eps) + (util/->option momentum) + (util/->option fix-gamma) + (util/->option use-global-stats) + (util/->option output-mean-var) + (util/->option axis) + (util/->option cudnn-off) + (util/->option out))))) + diff --git a/contrib/clojure-package/test/good-test-ndarray-random-api.clj b/contrib/clojure-package/test/good-test-ndarray-random-api.clj new file mode 100644 index 000000000000..230e1033c008 --- /dev/null +++ b/contrib/clojure-package/test/good-test-ndarray-random-api.clj @@ -0,0 +1,95 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.ndarray-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). + + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. 
(optional)" + ([] (exponential {})) + ([{:keys [lam shape ctx dtype out], + :or {lam nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). + + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. (optional)" + ([] (gamma {})) + ([{:keys [alpha beta shape ctx dtype out], + :or {alpha nil, beta nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + diff --git a/contrib/clojure-package/test/good-test-symbol-api.clj b/contrib/clojure-package/test/good-test-symbol-api.clj new file mode 100644 index 000000000000..c7450f8eb5c1 --- /dev/null +++ b/contrib/clojure-package/test/good-test-symbol-api.clj @@ -0,0 +1,192 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.symbol-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + (:import (org.apache.mxnet SymbolAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + activation + "Applies an activation function element-wise to the input. 
+ + The following activation functions are supported: + + - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` + - `sigmoid`: :math:`y = \\frac{1}{1 + exp(-x)}` + - `tanh`: Hyperbolic tangent, :math:`y = \\frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` + - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` + - `softsign`: :math:`y = \\frac{x}{1 + abs(x)}` + + + + Defined in src/operator/nn/activation.cc:L167 + + `data`: The input array. (optional) + `act-type`: Activation function to be applied. + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [data act-type name attr], + :or {data nil, name nil, attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/Activation + (util/->option data) + act-type + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + +(defn + batch-norm + "Batch normalization. + + Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as + well as offset ``beta``. + + Assume the input has more than one dimension and we normalize along axis 1. + We first compute the mean and variance along this axis: + + .. math:: + + data\\_mean[i] = mean(data[:,i,:,...]) \\\\ + data\\_var[i] = var(data[:,i,:,...]) + + Then compute the normalized output, which has the same shape as input, as following: + + .. math:: + + out[:,i,:,...] = \\frac{data[:,i,:,...] - data\\_mean[i]}{\\sqrt{data\\_var[i]+\\epsilon}} * gamma[i] + beta[i] + + Both *mean* and *var* returns a scalar by treating the input as a vector. + + Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` + have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and + the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these + two outputs are blocked. + + Besides the inputs and the outputs, this operator accepts two auxiliary + states, ``moving_mean`` and ``moving_var``, which are *k*-length + vectors. They are global statistics for the whole dataset, which are updated + by:: + + moving_mean = moving_mean * momentum + data_mean * (1 - momentum) + moving_var = moving_var * momentum + data_var * (1 - momentum) + + If ``use_global_stats`` is set to be true, then ``moving_mean`` and + ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute + the output. It is often used during inference. + + The parameter ``axis`` specifies which axis of the input shape denotes + the 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel + axis to be the last item in the input shape. + + Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true, + then set ``gamma`` to 1 and its gradient to 0. + + .. Note:: + When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False, + the sparse tensors will fallback. + + + + Defined in src/operator/nn/batch_norm.cc:L574 + + `data`: Input data to batch normalization (optional) + `gamma`: gamma array (optional) + `beta`: beta array (optional) + `moving-mean`: running mean of input (optional) + `moving-var`: running variance of input (optional) + `eps`: Epsilon to prevent div 0. 
Must be no less than CUDNN_BN_MIN_EPSILON defined in cudnn.h when using cudnn (usually 1e-5) (optional) + `momentum`: Momentum for moving average (optional) + `fix-gamma`: Fix gamma while training (optional) + `use-global-stats`: Whether use global moving statistics instead of local batch-norm. This will force change batch-norm into a scale shift operator. (optional) + `output-mean-var`: Output the mean and inverse std (optional) + `axis`: Specify which shape axis the channel is specified (optional) + `cudnn-off`: Do not select CUDNN operator, if available (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys + [data + gamma + beta + moving-mean + moving-var + eps + momentum + fix-gamma + use-global-stats + output-mean-var + axis + cudnn-off + name + attr], + :or + {output-mean-var nil, + axis nil, + cudnn-off nil, + fix-gamma nil, + eps nil, + data nil, + attr nil, + beta nil, + name nil, + use-global-stats nil, + moving-mean nil, + moving-var nil, + momentum nil, + gamma nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/BatchNorm + (util/->option data) + (util/->option gamma) + (util/->option beta) + (util/->option moving-mean) + (util/->option moving-var) + (util/->option eps) + (util/->option momentum) + (util/->option fix-gamma) + (util/->option use-global-stats) + (util/->option output-mean-var) + (util/->option axis) + (util/->option cudnn-off) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + diff --git a/contrib/clojure-package/test/good-test-symbol-random-api.clj b/contrib/clojure-package/test/good-test-symbol-random-api.clj new file mode 100644 index 000000000000..7202d2e27d12 --- /dev/null +++ b/contrib/clojure-package/test/good-test-symbol-random-api.clj @@ -0,0 +1,118 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.symbol-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + (:import (org.apache.mxnet SymbolAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). 
+ + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [lam shape ctx dtype name attr], + :or {lam nil, shape nil, ctx nil, dtype nil, name nil, attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). + + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). 
(optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [alpha beta shape ctx dtype name attr], + :or + {alpha nil, + beta nil, + shape nil, + ctx nil, + dtype nil, + name nil, + attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj index feda45b9d027..ca9d4bc93986 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj @@ -24,6 +24,8 @@ [org.apache.clojure-mxnet.module :as m] [org.apache.clojure-mxnet.optimizer :as optimizer] [org.apache.clojure-mxnet.symbol :as sym] + [org.apache.clojure-mxnet.symbol-api :as sym-api] + [org.apache.clojure-mxnet.util :as util] [clojure.reflect :as r])) (def data-dir "data/") @@ -54,17 +56,19 @@ (defn get-symbol [] (as-> (sym/variable "data") data - (sym/convolution "conv1" {:data data :kernel [3 3] :num-filter 32 :stride [2 2]}) - (sym/batch-norm "bn1" {:data data}) - (sym/activation "relu1" {:data data :act-type "relu"}) - (sym/pooling "mp1" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]}) (sym/convolution "conv2" {:data data :kernel [3 3] :num-filter 32 :stride [2 2]}) - (sym/batch-norm "bn2" {:data data}) - (sym/activation "relu2" {:data data :act-type "relu"}) - (sym/pooling "mp2" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]}) + (sym-api/convolution {:name "conv1" :data data :kernel [3 3] :num-filter 32 :stride [2 2]}) + (sym-api/batch-norm {:name "bn1" :data data}) + (sym-api/activation {:name "relu1" :data data :act-type "relu"}) + (sym-api/pooling {:name "mp1" :data data :kernel [2 2] :pool-type "max" :stride [2 2]}) - (sym/flatten "fl" {:data data}) - (sym/fully-connected "fc2" {:data data :num-hidden 10}) - (sym/softmax-output "softmax" {:data data}))) + (sym-api/convolution {:name "conv2" :data data :kernel [3 3] :num-filter 32 :stride [2 2]}) + (sym-api/batch-norm {:name "bn2" :data data}) + (sym-api/activation {:name "relu2" :data data :act-type "relu"}) + (sym-api/pooling {:name "mp2" :data data :kernel [2 2] :pool-type "max" :stride [2 2]}) + + (sym-api/flatten {:name "fl" :data data}) + (sym-api/fully-connected {:name "fc2" :data data :num-hidden 10}) + (sym-api/softmax-output {:name "softmax" :data data}))) (deftest test-conv [] (let [mod (m/module (get-symbol))] diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj new file mode 100644 index 000000000000..18b8b78f19d1 --- /dev/null +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj @@ -0,0 +1,415 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. 
+;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.ndarray-api-test + (:require [org.apache.clojure-mxnet.base :as base] + [org.apache.clojure-mxnet.context :as ctx] + [org.apache.clojure-mxnet.dtype :as dtype] + [org.apache.clojure-mxnet.ndarray :as ndarray :refer [->vec zeros ones += -= *= full shape shape-vec]] + [org.apache.clojure-mxnet.ndarray-api :as ndarray-api] + [org.apache.clojure-mxnet.shape :as mx-shape :refer [->shape]] + [org.apache.clojure-mxnet.test-util :as test-util :refer [approx=]] + [org.apache.clojure-mxnet.util :as util :refer [->option]] + [clojure.test :refer :all])) + +(deftest test-activation + (let [data (ndarray/array [2 1 0 -1 -2] [1 5]) + relu (ndarray-api/activation data "relu") + sigmoid (ndarray-api/activation data "sigmoid") + softsign (ndarray-api/activation data "softsign") + out (ndarray/zeros [1 5]) + _ (ndarray-api/activation {:data data :act-type "relu" :out out})] + (is (= [2.0 1.0 0.0 0.0 0.0] (->vec relu))) + (is (approx= 1e-3 [0.881 0.731 0.5 0.269 0.119] (->vec sigmoid))) + (is (approx= 1e-3 [0.666 0.5 0.0 -0.5 -0.666] (->vec softsign))) + (is (= [2.0 1.0 0.0 0.0 0.0] (->vec out))))) + +(deftest test-bilinear-sampler + (let [data (ndarray/array [1 4 3 6 + 1 8 8 9 + 0 4 1 5 + 1 0 1 3] + [1 1 4 4]) + affine (ndarray/array [2 0 0 + 0 2 0] + [1 6]) + grid (ndarray-api/grid-generator {:data affine :transform-type "affine" :target-shape [4 4]}) + out (ndarray-api/bilinear-sampler data grid)] + (is (approx= 1e-3 + [0.0 0.0 0.0 0.0 + 0.0 3.5 6.5 0.0 + 0.0 1.25 2.5 0.0 + 0.0 0.0 0.0 0.0] + (->vec out))))) + +(deftest test-cast + (let [nda1 (ndarray/array [0.9 1.3] [2]) + nda2 (ndarray/array [1e20 11.1] [2]) + nda3 (ndarray/array [300 11.1 10.9 -1 -3] [5]) + out (ndarray/zeros [2] {:dtype dtype/INT32}) + _ (ndarray-api/cast {:data nda1 :dtype (str dtype/INT32) :out out})] + (is (= [0.0 1.0] (->vec (ndarray-api/cast nda1 (str dtype/INT32))))) + (is (= [(float 1e20) (float 11.1)] (->vec (ndarray-api/cast nda2 (str dtype/FLOAT32))))) + ;; uint8 gets converted to native types after ->vec + (is (= [44.0 11.0 10.0 -1.0 -3.0] (->vec (ndarray-api/cast nda3 "uint8")))))) + +(deftest test-concat + (let [nda1 (ndarray/zeros [1 2]) + nda2 (ndarray/ones [1 2]) + out (ndarray/zeros [1 4]) + res1 (ndarray-api/concat [nda1 nda2] 2) ;; num_args=2, dim=1 (default) + res2 (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 0}) ;; num_args=2, dim=0 + res3 (ndarray-api/concat {:data [nda1 nda2 nda1] :num-args 3 :dim 1}) ;; num_args=3, dim=1 + _ (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 1 :out out}) ;; store result in out + ] + (is (= [0.0 0.0 1.0 1.0] (->vec res1))) + (is (= [1 4] (shape-vec res1))) + (is (= [0.0 0.0 1.0 1.0] (->vec res2))) + (is (= [2 2] (shape-vec res2))) + (is (= [0.0 0.0 1.0 1.0 0.0 0.0] (->vec res3))) + (is (= [1 6] (shape-vec res3))) + (is (= [0.0 0.0 1.0 1.0] (->vec out))) + (is (= [1 4] (shape-vec out))))) + +(deftest test-embedding + (let [input-dim 4 + output-dim 5 + w 
(ndarray/array [0. 1. 2. 3. 4. + 5. 6. 7. 8. 9. + 10. 11. 12. 13. 14. + 15. 16. 17. 18. 19.] + [4 5]) + x (ndarray/array [1. 3. + 0. 2.] + [2 2]) + out (ndarray-api/embedding x w input-dim output-dim)] + (is (= [5. 6. 7. 8. 9. + 15. 16. 17. 18. 19. + 0. 1. 2. 3. 4. + 10. 11. 12. 13. 14.] + (->vec out))) + (is (= [2 2 5] (shape-vec out))))) + +(deftest test-flatten + (let [nda (ndarray/array [1 2 3 + 4 5 6 + 7 8 9 + 1 2 3 + 4 5 6 + 7 8 9] + [2 3 3]) + out (ndarray/zeros [2 9]) + res (ndarray-api/flatten {:data nda}) + _ (ndarray-api/flatten {:data nda :out out})] + (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9. + 1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec res))) + (is (= [2 9] (shape-vec res))) + (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9. + 1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec out))) + (is (= [2 9] (shape-vec out))))) + +(deftest test-instance-norm + (let [x (ndarray/array [1.1 2.2 3.3 4.4] [2 1 2]) + gamma (ndarray/array [1.5] [1]) + beta (ndarray/array [0.5] [1]) + res (ndarray-api/instance-norm x gamma beta)] + (is (approx= 1e-4 [-0.9975 1.9975 + -0.9975 1.9975] (->vec res))) + (is (= [2 1 2] (shape-vec res))))) + +(deftest test-l2-normalization + (let [x (ndarray/array [1 2 3 4 2 2 5 6] [2 2 2]) + res1 (ndarray-api/l2-normalization {:data x}) ;; instance-wise + res2 (ndarray-api/l2-normalization {:data x :mode "instance"}) + res3 (ndarray-api/l2-normalization {:data x :mode "channel"}) + res4 (ndarray-api/l2-normalization {:data x :mode "spatial"})] + (is (approx= 1e-4 [0.1825 0.3651 + 0.5477 0.7303 + 0.2407 0.2407 + 0.6019 0.7223] (->vec res1))) + (is (approx= 1e-4 [0.1825 0.3651 + 0.5477 0.7303 + 0.2407 0.2407 + 0.6019 0.7223] (->vec res2))) + (is (approx= 1e-4 [0.3162 0.4472 + 0.9486 0.8944 + 0.3714 0.3162 + 0.9284 0.9486] (->vec res3))) + (is (approx= 1e-4 [0.4472 0.8944 + 0.6 0.8 + 0.7071 0.7071 + 0.6402 0.7682] (->vec res4))))) + +(deftest test-pad + (let [x (ndarray/array [1 2 3 + 4 5 6 + 7 8 9 + 10 11 12 + 11 12 13 + 14 15 16 + 17 18 19 + 20 21 22] + [2 2 2 3]) + res1 (ndarray-api/pad x "edge" [0,0,0,0,1,1,1,1]) + res2 (ndarray-api/pad {:data x :mode "constant" :pad-width [0,0,0,0,1,1,1,1] :constant-value 0})] + (is (= [1. 1. 2. 3. 3. + 1. 1. 2. 3. 3. + 4. 4. 5. 6. 6. + 4. 4. 5. 6. 6. + 7. 7. 8. 9. 9. + 7. 7. 8. 9. 9. + 10. 10. 11. 12. 12. + 10. 10. 11. 12. 12. + 11. 11. 12. 13. 13. + 11. 11. 12. 13. 13. + 14. 14. 15. 16. 16. + 14. 14. 15. 16. 16. + 17. 17. 18. 19. 19. + 17. 17. 18. 19. 19. + 20. 20. 21. 22. 22. + 20. 20. 21. 22. 22.] (->vec res1))) + (is (= [2 2 4 5] (shape-vec res1))) + (is (= [0. 0. 0. 0. 0. + 0. 1. 2. 3. 0. + 0. 4. 5. 6. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 0. + 0. 7. 8. 9. 0. + 0. 10. 11. 12. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 0. + 0. 11. 12. 13. 0. + 0. 14. 15. 16. 0. + 0. 0. 0. 0. 0. + + 0. 0. 0. 0. 0. + 0. 17. 18. 19. 0. + 0. 20. 21. 22. 0. + 0. 0. 0. 0. 0.] (->vec res2))) + (is (= [2 2 4 5] (shape-vec res2))))) + +(deftest test-roi-pooling + (let [xi [[[[ 0., 1., 2., 3., 4., 5.], + [ 6., 7., 8., 9., 10., 11.], + [ 12., 13., 14., 15., 16., 17.], + [ 18., 19., 20., 21., 22., 23.], + [ 24., 25., 26., 27., 28., 29.], + [ 30., 31., 32., 33., 34., 35.], + [ 36., 37., 38., 39., 40., 41.], + [ 42., 43., 44., 45., 46., 47.]]]] + x (ndarray/array (-> xi flatten vec) [1 1 8 6]) + y (ndarray/array [0 0 0 4 4] [1 5]) + res1 (ndarray-api/roi-pooling x y [2 2] 1.0) + res2 (ndarray-api/roi-pooling x y [2 2] 0.7)] + (is (= [14. 16. 26. 28.] (->vec res1))) + (is (= [1 1 2 2] (shape-vec res1))) + (is (= [7. 9. 19. 21.] 
(->vec res2))) + (is (= [1 1 2 2] (shape-vec res2))))) + +(deftest test-reshape + (let [x (ndarray/array (vec (range 4)) [4]) + y (ndarray/array (vec (range 24)) [2 3 4]) + z (ndarray/array (vec (range 120)) [2 3 4 5]) + res1 (ndarray-api/reshape {:data x :shape [2 2]})] + (is (= [0. 1. 2. 3.] (->vec res1))) + (is (= [2 2] (shape-vec res1))) + (is (= (map float (range 24)) (->vec (ndarray-api/reshape {:data y :shape [4 0 2]})))) + (is (= [4 3 2] (shape-vec (ndarray-api/reshape {:data y :shape [4 0 2]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 0 0]})))) + (is (= [6 1 4] (shape-vec (ndarray-api/reshape {:data y :shape [6 1 -1]})))) + (is (= [3 1 8] (shape-vec (ndarray-api/reshape {:data y :shape [3 -1 8]})))) + (is (= [24] (shape-vec (ndarray-api/reshape {:data y :shape [-1]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-2]})))) + (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -2]})))) + (is (= [2 3 4 1 1] (shape-vec (ndarray-api/reshape {:data y :shape [-2 1 1]})))) + (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 4]})))) + (is (= [6 20] (shape-vec (ndarray-api/reshape {:data z :shape [-3 -3]})))) + (is (= [2 12] (shape-vec (ndarray-api/reshape {:data y :shape [0 -3]})))) + (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 -2]})))) + (is (= [1 2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-4 1 2 -2]})))) + (is (= [2 1 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -4 -1 3 -2]})))))) + +(deftest test-sequence-last + (let [xi [[[ 1., 2., 3.], + [ 4., 5., 6.], + [ 7., 8., 9.]], + + [[ 10., 11., 12.], + [ 13., 14., 15.], + [ 16., 17., 18.]], + + [[ 19., 20., 21.], + [ 22., 23., 24.], + [ 25., 26., 27.]]] + x (ndarray/array (-> xi flatten vec) [3 3 3]) + seq-len1 (ndarray/array [1 1 1] [3]) + seq-len2 (ndarray/array [1 2 3] [3]) + ;; This test is failing with an exception + ;; (most likely a scala generation issue) + ;; res1 (ndarray-api/sequence-last x nil) + ] + ;; (is (= [] (->vec res1))) +)) + +(deftest test-sequence-mask + (let [xi [[[ 1., 2., 3.], + [ 4., 5., 6.]], + + [[ 7., 8., 9.], + [ 10., 11., 12.]], + + [[ 13., 14., 15.], + [ 16., 17., 18.]]] + x (ndarray/array (-> xi flatten vec) [3 2 3]) + seq-len1 (ndarray/array [1 1] [2]) + seq-len2 (ndarray/array [2 3] [2]) + ;; Same issue as previous test + ;; res1 (ndarray-api/sequence-mask x seq-len1) + ] + ;; (is (= [] (->vec res1))) +)) + +(deftest test-slice-channel + (let [xi [[[ 1.] [ 2.]] + [[ 3.] [ 4.]] + [[ 5.] [ 6.]]] + x (ndarray/array (-> xi flatten vec) [3 2 1]) + res1 (ndarray-api/slice-channel {:data x :num-outputs 2 :axis 1}) + res2 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0}) + res3 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0 :squeeze-axis 1})] + (is (= [1. 3. 5.] (->vec res1))) + (is (= [3 1 1] (shape-vec res1))) + (is (= [1. 2.] (->vec res2))) + (is (= [1 2 1] (shape-vec res2))) + (is (= [1. 2.] 
(->vec res3)))
+    (is (= [2 1] (shape-vec res3)))))
+
+(deftest test-softmax-activation
+  (let [x (ndarray/array [1 1 1 1 1 1] [2 3])
+        res1 (ndarray-api/softmax-activation {:data x :mode "instance"})]
+    (is (approx= 1e-3 [0.333 0.333 0.333
+                       0.333 0.333 0.333] (->vec res1)))
+    (is (= [2 3] (shape-vec res1)))))
+
+(deftest test-softmax-output
+  (let [datai [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
+        data (ndarray/array (-> datai flatten vec) [4 4])
+        label (ndarray/array [1,0,2,3] [4])
+        res1 (ndarray-api/softmax-output data label)]
+    (is (approx= 1e-4 [0.0321 0.0871 0.2369 0.6439
+                       0.25 0.25 0.25 0.25
+                       0.25 0.25 0.25 0.25
+                       0.25 0.25 0.25 0.25] (->vec res1)))
+    (is (= [4 4] (shape-vec res1)))))
+
+(deftest test-swap-axis
+  (let [x (ndarray/array (range 3) [1 3])
+        y (ndarray/array (range 8) [2 2 2])
+        res1 (ndarray-api/swap-axis {:data x :dim1 0 :dim2 1})
+        res2 (ndarray-api/swap-axis {:data y :dim1 0 :dim2 2})]
+    (is (= [0. 1. 2.] (->vec res1)))
+    (is (= [3 1] (shape-vec res1)))
+    (is (= [0. 4. 2. 6. 1. 5. 3. 7.] (->vec res2)))
+    (is (= [2 2 2] (shape-vec res2)))))
+
+(deftest test-abs
+  (let [x (ndarray/array [-2 0 3] [3])
+        res1 (ndarray-api/abs {:data x})]
+    (is (= [2. 0. 3.] (->vec res1)))
+    (is (= [3] (shape-vec res1)))))
+
+(deftest test-arccos
+  (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5])
+        pi Math/PI
+        res1 (ndarray-api/arccos {:data x})]
+    (is (approx= 1e-3 [pi (* 0.75 pi) (* 0.5 pi) (* 0.25 pi) 0.] (->vec res1)))))
+
+(deftest test-arcsin
+  (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5])
+        pi Math/PI
+        res1 (ndarray-api/arcsin {:data x})]
+    (is (approx= 1e-3 [(- (* 0.5 pi)) (- (* 0.25 pi)) 0 (* 0.25 pi) (* 0.5 pi)] (->vec res1)))))
+
+(deftest test-argmax
+  (let [x (ndarray/array (range 6) [2 3])
+        res1 (ndarray-api/argmax {:data x :axis 0})
+        res2 (ndarray-api/argmax {:data x :axis 1})
+        res3 (ndarray-api/argmax {:data x :axis 0 :keepdims true})
+        res4 (ndarray-api/argmax {:data x :axis 1 :keepdims true})]
+    (is (= [1. 1. 1.] (->vec res1)))
+    (is (= [3] (shape-vec res1)))
+    (is (= [2. 2.] (->vec res2)))
+    (is (= [2] (shape-vec res2)))
+    (is (= [1. 1. 1.] (->vec res3)))
+    (is (= [1 3] (shape-vec res3)))
+    (is (= [2. 2.] (->vec res4)))
+    (is (= [2 1] (shape-vec res4)))))
+
+(deftest test-argmax-channel
+  (let [x (ndarray/array (range 6) [2 3])
+        res1 (ndarray-api/argmax-channel {:data x})]
+    (is (= [2. 2.] (->vec res1)))
+    (is (= [2] (shape-vec res1)))))
+
+(deftest test-argmin
+  (let [x (ndarray/array (reverse (range 6)) [2 3])
+        res1 (ndarray-api/argmin {:data x :axis 0})
+        res2 (ndarray-api/argmin {:data x :axis 1})
+        res3 (ndarray-api/argmin {:data x :axis 0 :keepdims true})
+        res4 (ndarray-api/argmin {:data x :axis 1 :keepdims true})]
+    (is (= [1. 1. 1.] (->vec res1)))
+    (is (= [3] (shape-vec res1)))
+    (is (= [2. 2.] (->vec res2)))
+    (is (= [2] (shape-vec res2)))
+    (is (= [1. 1. 1.] (->vec res3)))
+    (is (= [1 3] (shape-vec res3)))
+    (is (= [2. 2.] (->vec res4)))
+    (is (= [2 1] (shape-vec res4)))))
+
+(deftest test-argsort
+  (let [x (ndarray/array [0.3 0.2 0.4
+                          0.1 0.3 0.2]
+                         [2 3])
+        y (ndarray/array [0.3 0.2 0.4 0.1 0.3 0.2] [6])
+        res1 (ndarray-api/argsort {:data x})
+        res2 (ndarray-api/argsort {:data x :axis 0})
+        res3 (ndarray-api/argsort {:data y})]
+    (is (= [1. 0. 2.
+            0. 2. 1.]
+           (->vec res1)))
+    (is (= [2 3] (shape-vec res1)))
+    (is (= [1. 0. 1.
+            0. 1. 0.]
+           (->vec res2)))
+    (is (= [2 3] (shape-vec res2)))
+    (is (= [3. 1. 5. 0. 4. 2.]
(->vec res3)))
+    (is (= [6] (shape-vec res3)))))
+
+(deftest test-batch-take
+  (let [x (ndarray/array (range 6) [3 2])
+        i (ndarray/as-type (ndarray/array [0 1 0] [3]) dtype/INT32)
+        res1 (ndarray-api/batch-take x i)]
+    (is (= [0. 3. 4.] (->vec res1)))))
+
+(deftest test-broadcast-add
+  (let [x (ndarray/ones [2 3])
+        y (ndarray/array (range 2) [2 1])
+        res1 (ndarray-api/broadcast-add x y)]
+    (is (= [1. 1. 1. 2. 2. 2.] (->vec res1)))
+    (is (= [2 3] (shape-vec res1)))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj
new file mode 100644
index 000000000000..b642ad75d1d0
--- /dev/null
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj
@@ -0,0 +1,61 @@
+;;
+;; Licensed to the Apache Software Foundation (ASF) under one or more
+;; contributor license agreements. See the NOTICE file distributed with
+;; this work for additional information regarding copyright ownership.
+;; The ASF licenses this file to You under the Apache License, Version 2.0
+;; (the "License"); you may not use this file except in compliance with
+;; the License. You may obtain a copy of the License at
+;;
+;;    http://www.apache.org/licenses/LICENSE-2.0
+;;
+;; Unless required by applicable law or agreed to in writing, software
+;; distributed under the License is distributed on an "AS IS" BASIS,
+;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+;; See the License for the specific language governing permissions and
+;; limitations under the License.
+;;
+
+(ns org.apache.clojure-mxnet.symbol-api-test
+  (:require [org.apache.clojure-mxnet.dtype :as dtype]
+            [org.apache.clojure-mxnet.executor :as executor]
+            [org.apache.clojure-mxnet.ndarray :as ndarray]
+            [org.apache.clojure-mxnet.symbol :as sym]
+            [org.apache.clojure-mxnet.symbol-api :as sym-api]
+            [org.apache.clojure-mxnet.util :as util]
+            [clojure.test :refer :all]
+            [org.apache.clojure-mxnet.context :as context]))
+
+(deftest test-compose
+  (let [data (sym/variable "data")
+        net1 (sym-api/fully-connected {:data data :num-hidden 10 :name "fc1"})
+        net1 (sym-api/fully-connected {:data net1 :num-hidden 100 :name "fc2"})
+
+        net2 (sym-api/fully-connected {:num-hidden 10 :name "fc3"})
+        net2 (sym-api/activation {:data net2 :act-type "relu"})
+        net2 (sym-api/fully-connected {:data net2 :num-hidden 20 :name "fc4"})
+
+        composed (sym/apply net2 "composed" {"fc3_data" net1})
+
+        multi-out (sym/group [composed net1])]
+
+    (is (= ["data" "fc1_weight" "fc1_bias" "fc2_weight" "fc2_bias"] (sym/list-arguments net1)))
+    (is (= 2 (count (sym/list-outputs multi-out))))))
+
+(deftest test-symbol-internal
+  (let [data (sym/variable "data")
+        oldfc (sym-api/fully-connected {:data data :num-hidden 10 :name "fc1"})
+        net1 (sym-api/fully-connected {:data oldfc :num-hidden 100 :name "fc2"})]
+    (is (= ["data" "fc1_weight" "fc1_bias" "fc2_weight" "fc2_bias"] (sym/list-arguments net1)))
+    (is (= (sym/list-arguments oldfc) (-> (sym/get-internals net1)
+                                          (sym/get "fc1_output")
+                                          (sym/list-arguments))))))
+
+(deftest test-infer-type
+  (let [data (sym/variable "data")
+        f32data (sym-api/cast {:data data :dtype "float32"})
+        fc1 (sym-api/fully-connected {:data f32data :num-hidden 128 :name "fc1"})
+        mlp (sym-api/softmax-output {:data fc1 :name "softmax"})
+        [arg out aux] (sym/infer-type mlp {:data dtype/FLOAT64})]
+    (is (= [dtype/FLOAT64 dtype/FLOAT32 dtype/FLOAT32 dtype/FLOAT32] (util/buffer->vec arg)))
+    (is (= [dtype/FLOAT32]
(util/buffer->vec out))) + (is (= [] (util/buffer->vec aux))))) diff --git a/contrib/clojure-package/test/test-ndarray-random-api.clj b/contrib/clojure-package/test/test-ndarray-random-api.clj new file mode 100644 index 000000000000..230e1033c008 --- /dev/null +++ b/contrib/clojure-package/test/test-ndarray-random-api.clj @@ -0,0 +1,95 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.ndarray-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat flatten identity load max + min repeat reverse set sort take to-array empty shuffle + ref]) + (:require [org.apache.clojure-mxnet.shape :as mx-shape] + [org.apache.clojure-mxnet.util :as util]) + (:import (org.apache.mxnet NDArrayAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). + + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. (optional)" + ([] (exponential {})) + ([{:keys [lam shape ctx dtype out], + :or {lam nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). + + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `out`: Output array. 
(optional)" + ([] (gamma {})) + ([{:keys [alpha beta shape ctx dtype out], + :or {alpha nil, beta nil, shape nil, ctx nil, dtype nil, out nil}, + :as opts}] + (util/coerce-return + (NDArrayAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + (util/->option out))))) + diff --git a/contrib/clojure-package/test/test-symbol-random-api.clj b/contrib/clojure-package/test/test-symbol-random-api.clj new file mode 100644 index 000000000000..7202d2e27d12 --- /dev/null +++ b/contrib/clojure-package/test/test-symbol-random-api.clj @@ -0,0 +1,118 @@ +(ns + ^{:doc "Experimental"} + org.apache.clojure-mxnet.symbol-random-api + (:refer-clojure :exclude [* - + > >= < <= / cast concat identity flatten load max + min repeat reverse set sort take to-array empty sin + get apply shuffle ref]) + (:require [org.apache.clojure-mxnet.util :as util] + [org.apache.clojure-mxnet.shape :as mx-shape]) + (:import (org.apache.mxnet SymbolAPI))) + +;; Do not edit - this is auto-generated + +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + + + + +(defn + exponential + "Draw random samples from an exponential distribution. + + Samples are distributed according to an exponential distribution parametrized by *lambda* (rate). + + Example:: + + exponential(lam=4, shape=(2,2)) = [[ 0.0097189 , 0.08999364], + [ 0.04146638, 0.31715935]] + + + Defined in src/operator/random/sample_op.cc:L137 + + `lam`: Lambda parameter (rate) of the exponential distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [lam shape ctx dtype name attr], + :or {lam nil, shape nil, ctx nil, dtype nil, name nil, attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_exponential + (util/->option lam) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + +(defn + gamma + "Draw random samples from a gamma distribution. + + Samples are distributed according to a gamma distribution parametrized by *alpha* (shape) and *beta* (scale). 
+ + Example:: + + gamma(alpha=9, beta=0.5, shape=(2,2)) = [[ 7.10486984, 3.37695289], + [ 3.91697288, 3.65933681]] + + + Defined in src/operator/random/sample_op.cc:L125 + + `alpha`: Alpha parameter (shape) of the gamma distribution. (optional) + `beta`: Beta parameter (scale) of the gamma distribution. (optional) + `shape`: Shape of the output. (optional) + `ctx`: Context of output, in format [cpu|gpu|cpu_pinned](n). Only used for imperative calls. (optional) + `dtype`: DType of the output in case this can't be inferred. Defaults to float32 if not defined (dtype=None). (optional) + `name`: Name of the symbol (optional) + `attr`: Attributes of the symbol (optional)" + [{:keys [alpha beta shape ctx dtype name attr], + :or + {alpha nil, + beta nil, + shape nil, + ctx nil, + dtype nil, + name nil, + attr nil}, + :as opts}] + (util/coerce-return + (SymbolAPI/random_gamma + (util/->option alpha) + (util/->option beta) + (util/->option (clojure.core/when shape (mx-shape/->shape shape))) + (util/->option ctx) + (util/->option dtype) + name + (clojure.core/when + attr + (clojure.core/->> + attr + (clojure.core/mapv + (clojure.core/fn [[k v]] [k (clojure.core/str v)])) + (clojure.core/into {}) + util/convert-map))))) + diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp index 78487e6ee0cd..fa5600190f95 100644 --- a/cpp-package/example/inference/inception_inference.cpp +++ b/cpp-package/example/inference/inception_inference.cpp @@ -301,7 +301,7 @@ void Predictor::PredictImage(const std::string& image_file) { executor->Forward(false); // The output is available in executor->outputs. - auto array = executor->outputs[0].Copy(global_ctx); + auto array = executor->outputs[0].Copy(Context::cpu()); /* * Find out the maximum accuracy and the index associated with that accuracy. 
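The `Copy(Context::cpu())` call above schedules an asynchronous device-to-host copy, so the copied array must be synchronized before its buffer is read. Below is a minimal sketch of the consuming pattern, using only existing mxnet-cpp `NDArray` calls (`Copy`, `WaitToRead`, `GetData`, `Size`); the `ArgMaxOnHost` helper and its name are illustrative, not code from this example.

```cpp
#include <algorithm>
#include "mxnet-cpp/MxNetCpp.h"

using namespace mxnet::cpp;

// Sketch: copy the first executor output to the CPU, wait for the
// asynchronous copy to finish, then compute the arg-max of the
// prediction vector on the host. ArgMaxOnHost is a hypothetical
// helper shown only for illustration.
size_t ArgMaxOnHost(Executor *executor) {
  NDArray array = executor->outputs[0].Copy(Context::cpu());
  array.WaitToRead();  // GetData() is only safe after synchronization
  const mx_float *data = array.GetData();
  return std::max_element(data, data + array.Size()) - data;
}
```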
diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp
index b667542bffb5..d0438305a62e 100644
--- a/cpp-package/include/mxnet-cpp/ndarray.hpp
+++ b/cpp-package/include/mxnet-cpp/ndarray.hpp
@@ -397,11 +397,11 @@ inline size_t NDArray::Size() const {
 }

 inline std::vector<mx_uint> NDArray::GetShape() const {
-  const mx_uint *out_pdata;
-  mx_uint out_dim;
-  MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata);
+  const int *out_pdata;
+  int out_dim;
+  MXNDArrayGetShapeEx(blob_ptr_->handle_, &out_dim, &out_pdata);
   std::vector<mx_uint> ret;
-  for (mx_uint i = 0; i < out_dim; ++i) {
+  for (int i = 0; i < out_dim; ++i) {
     ret.push_back(out_pdata[i]);
   }
   return ret;
diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp
index aed963949060..2e3fb7a2d5de 100644
--- a/cpp-package/include/mxnet-cpp/symbol.hpp
+++ b/cpp-package/include/mxnet-cpp/symbol.hpp
@@ -188,7 +188,7 @@ inline void Symbol::InferShape(

   std::vector<const char *> keys;
   std::vector<mx_uint> arg_ind_ptr;
-  std::vector<mx_uint> arg_shape_data;
+  std::vector<int> arg_shape_data;

   for (const auto &arg : arg_shapes) {
     keys.push_back(arg.first.c_str());
@@ -200,40 +200,40 @@
   arg_ind_ptr.push_back(arg_shape_data.size());

   mx_uint in_shape_size;
-  const mx_uint *in_shape_ndim;
-  const mx_uint **in_shape_data;
+  const int *in_shape_ndim;
+  const int **in_shape_data;
   mx_uint out_shape_size;
-  const mx_uint *out_shape_ndim;
-  const mx_uint **out_shape_data;
+  const int *out_shape_ndim;
+  const int **out_shape_data;
   mx_uint aux_shape_size;
-  const mx_uint *aux_shape_ndim;
-  const mx_uint **aux_shape_data;
+  const int *aux_shape_ndim;
+  const int **aux_shape_data;
   int complete;

-  CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(),
-                              arg_ind_ptr.data(), arg_shape_data.data(),
-                              &in_shape_size, &in_shape_ndim, &in_shape_data,
-                              &out_shape_size, &out_shape_ndim, &out_shape_data,
-                              &aux_shape_size, &aux_shape_ndim, &aux_shape_data,
-                              &complete),
+  CHECK_EQ(MXSymbolInferShapeEx(GetHandle(), keys.size(), keys.data(),
+                                arg_ind_ptr.data(), arg_shape_data.data(),
+                                &in_shape_size, &in_shape_ndim, &in_shape_data,
+                                &out_shape_size, &out_shape_ndim, &out_shape_data,
+                                &aux_shape_size, &aux_shape_ndim, &aux_shape_data,
+                                &complete),
            0);

   if (complete) {
     for (mx_uint i = 0; i < in_shape_size; ++i) {
       in_shape->push_back(std::vector<mx_uint>());
-      for (mx_uint j = 0; j < in_shape_ndim[i]; ++j) {
+      for (int j = 0; j < in_shape_ndim[i]; ++j) {
         (*in_shape)[i].push_back(in_shape_data[i][j]);
       }
     }
     for (mx_uint i = 0; i < aux_shape_size; ++i) {
       aux_shape->push_back(std::vector<mx_uint>());
-      for (mx_uint j = 0; j < aux_shape_ndim[i]; ++j) {
+      for (int j = 0; j < aux_shape_ndim[i]; ++j) {
         (*aux_shape)[i].push_back(aux_shape_data[i][j]);
       }
     }
     for (mx_uint i = 0; i < out_shape_size; ++i) {
       out_shape->push_back(std::vector<mx_uint>());
-      for (mx_uint j = 0; j < out_shape_ndim[i]; ++j) {
+      for (int j = 0; j < out_shape_ndim[i]; ++j) {
         (*out_shape)[i].push_back(out_shape_data[i][j]);
       }
     }
diff --git a/dev_menu.py b/dev_menu.py
index cea0f96793a9..d439d8194f2a 100755
--- a/dev_menu.py
+++ b/dev_menu.py
@@ -123,7 +123,7 @@ def create_virtualenv_default():
        ('[Docker] sanity_check. 
Check for linting and code formatting and licenses.',
         [
             "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh sanity_check",
-            "ci/build.py --platform ubuntu_cpu /work/runtime_functions.sh nightly_test_rat_check",
+            "ci/build.py --platform ubuntu_rat /work/runtime_functions.sh nightly_test_rat_check",
         ]),
        ('[Docker] Python3 CPU unittests',
         [
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index 2768f644c066..095c214e66b3 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -60,9 +60,6 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_MP_OPENCV_NUM_THREADS
   - Values: Int ```(default=0)```
   - The number of OpenCV execution threads given to multiprocess workers. OpenCV multithreading is disabled if `MXNET_MP_OPENCV_NUM_THREADS` < 1 (default). Enlarge this number may boost the performance of individual workers when executing underlying OpenCV functions but please consider reducing the overall `num_workers` to avoid thread contention (not available on Windows).
-* MXNET_CUSTOM_OP_NUM_THREADS
-  - Values: Int ```(default=16)```
-  - The maximum number of threads given to custom operators.

 ## Memory Options
diff --git a/docs/install/index.md b/docs/install/index.md
index f1e959eaf34c..10db8d95b44a 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -1125,7 +1125,8 @@ You can [build MXNet-R from source](windows_setup.html#install-mxnet-package-for
 options(repos = cran)
 install.packages("mxnet")
 ```
-Change cu92 to cu80, cu90 or cu91 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
+Change cu92 to cu90, cu91 or cuda100 based on your CUDA toolkit version. Currently, MXNet supports these versions of CUDA.
+Note: You also need to have cuDNN installed on Windows. Check out this [guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installwindows) on the steps for installation.

@@ -1232,9 +1233,7 @@ You can do a dockerized cross compilation build on your local machine or a nativ
 The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free memory.

 ## Quick installation
-You can use this [pre-built Python wheel](wget https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl) on a Raspberry Pi 3B with Stretch. You will likely need to install several dependencies to get MXNet to work. Refer to the following **Build** section for details.
-
-**Cross compilation build (Experimental)**
+You can use this [pre-built Python wheel](https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl) on a Raspberry Pi 3B with Stretch. You will likely need to install several dependencies to get MXNet to work. Refer to the following **Build** section for details.

 ## Docker installation
 **Step 1** Install Docker on your machine by following the [docker installation instructions](https://docs.docker.com/engine/installation/linux/ubuntu/#install-using-the-repository).
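To make the quick installation above concrete before moving on to the Docker route, here is a minimal sketch (assumptions: `pip` is already present on the Pi, and the wheel URL is the one referenced in the Quick installation section):

```bash
# Sketch: install the pre-built Raspbian wheel into a fresh virtualenv.
sudo pip install virtualenv
virtualenv -p `which python` mxnet_py27
source mxnet_py27/bin/activate
pip install https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl
python -c "import mxnet; print(mxnet.__version__)"
```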
@@ -1247,18 +1246,22 @@ Follow the four steps in this [docker documentation](https://docs.docker.com/eng ## Build -**Please use a Native build with gcc 4 as explained below, higher compiler versions currently cause test -failures on ARM** +**This cross compilation build is experimental.** + +**Please use a Native build with gcc 4 as explained below, higher compiler versions currently cause test failures on ARM.** -The following command will build a container with dependencies and tools and then compile MXNet for -ARMv7. The resulting artifact will be located in `build/mxnet-x.x.x-py2.py3-none-any.whl`, copy this -file to your Raspberry Pi. +The following command will build a container with dependencies and tools, +and then compile MXNet for ARMv7. +You will want to run this on a fast cloud instance or locally on a fast PC to save time. +The resulting artifact will be located in `build/mxnet-x.x.x-py2.py3-none-any.whl`. +Copy this file to your Raspberry Pi. +The previously mentioned pre-built wheel was created using this method. ``` ci/build.py -p armv7 ``` -## Install +## Install using a pip wheel Your Pi will need several dependencies. @@ -1281,6 +1284,7 @@ sudo apt-get install -y \ libzmq3-dev \ ninja-build \ python-dev \ + python-pip \ software-properties-common \ sudo \ unzip \ @@ -1297,18 +1301,24 @@ virtualenv -p `which python` mxnet_py27 ``` You may use Python 3, however the [wine bottle detection example](https://mxnet.incubator.apache.org/versions/master/tutorials/embedded/wine_detector.html) for the Pi with camera requires Python 2.7. -Create a virtualenv and install the wheel we created previously, or the wheel that you downloaded. +Activate the environment, then install the wheel we created previously, or install this [prebuilt wheel](https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl). ``` -virtualenv -p `which python3` mxnet_py27 source mxnet_py27/bin/activate pip install mxnet-x.x.x-py2.py3-none-any.whl ``` +Test MXNet with the Python interpreter: +``` +$ python + +>>> import mxnet +``` +If there are no errors then you're ready to start using MXNet on your Pi! -**Native Build** +## Native Build -Installing MXNet is a two-step process: +Installing MXNet from source is a two-step process: 1. Build the shared library from the MXNet C++ source code. 2. Install the supported language-specific packages for MXNet. diff --git a/docs/install/ubuntu_setup.md b/docs/install/ubuntu_setup.md index f225023d18d5..01b11cdc11ab 100644 --- a/docs/install/ubuntu_setup.md +++ b/docs/install/ubuntu_setup.md @@ -137,12 +137,33 @@ It is recommended that you review the general [build from source](build_from_sou On Ubuntu versions 16.04 or later, you need the following dependencies: -**Step 1:** Install build tools and git. +**Step 1:** Install prerequisite packages. ```bash sudo apt-get update - sudo apt-get install -y build-essential git + sudo apt-get install -y build-essential git ninja-build ccache ``` +**For Ubuntu 18.04 and CUDA builds you need to update CMake** + +```bash +#!/usr/bin/env bash +set -exuo pipefail +sudo apt remove --purge --auto-remove cmake + +# Update CMake for correct CUDA autodetection: /~https://github.com/clab/dynet/issues/1457 +version=3.14 +build=0 +mkdir -p ~/tmp +cd ~/tmp +wget https://cmake.org/files/v$version/cmake-$version.$build.tar.gz +tar -xzvf cmake-$version.$build.tar.gz +cd cmake-$version.$build/ +./bootstrap +make -j$(nproc) +sudo make install +``` + + **Step 2:** Install a Math Library.
Details on the different math libraries are found in the build from source guide's [Math Library Selection](build_from_source.html#math-library-selection) section. @@ -167,49 +188,73 @@ For other libraries, visit the [Math Library Selection](build_from_source.html#m If building on CPU and using OpenBLAS: +Clone the repository: + ```bash git clone --recursive /~https://github.com/apache/incubator-mxnet.git cd incubator-mxnet - echo "USE_OPENCV = 1" >> ./config.mk - echo "USE_BLAS = openblas" >> ./config.mk - make -j $(nproc) ``` -If building on CPU and using MKL and MKL-DNN (make sure MKL is installed according to [Math Library Selection](build_from_source.html#math-library-selection) and [MKL-DNN README](/~https://github.com/apache/incubator-mxnet/blob/master/docs/tutorials/mkldnn/MKLDNN_README.md)): +Build with CMake and ninja, without GPU and without MKL. ```bash - git clone --recursive /~https://github.com/apache/incubator-mxnet.git - cd incubator-mxnet - echo "USE_OPENCV = 1" >> ./config.mk - echo "USE_BLAS = openblas" >> ./config.mk - echo "USE_CUDA = 0" >> ./config.mk - echo "USE_MKLDNN = 1" >> ./config.mk - make -j $(nproc) + rm -rf build + mkdir -p build && cd build + cmake -GNinja \ + -DUSE_CUDA=OFF \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + .. + ninja ``` -If building on GPU and you want OpenCV and OpenBLAS (make sure you have installed the [CUDA dependencies first](#cuda-dependencies)): +If building on CPU and using MKL and MKL-DNN (make sure MKL is installed according to [Math Library Selection](build_from_source.html#math-library-selection) and [MKL-DNN README](/~https://github.com/apache/incubator-mxnet/blob/master/docs/tutorials/mkldnn/MKLDNN_README.md)): ```bash - git clone --recursive /~https://github.com/apache/incubator-mxnet.git - cd incubator-mxnet - echo "USE_OPENCV = 1" >> ./config.mk - echo "USE_BLAS = openblas" >> ./config.mk - echo "USE_CUDA = 1" >> ./config.mk - echo "USE_CUDA_PATH = /usr/local/cuda" >> ./config.mk - echo "USE_CUDNN = 1" >> ./config.mk - make -j $(nproc) + rm -rf build + mkdir -p build && cd build + cmake -GNinja \ + -DUSE_CUDA=OFF \ + -DUSE_MKL_IF_AVAILABLE=ON \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + .. + ninja ``` -*Note* - USE_OPENCV and USE_BLAS are make file flags to set compilation options to use OpenCV and BLAS library. You can explore and use more compilation options in `make/config.mk` and also review common [usage examples](build_from_source.html#usage-examples). +If building on GPU (make sure you have installed the [CUDA dependencies first](#cuda-dependencies)): +CUDA 10.1 on Ubuntu 18.04 builds fine but is not currently tested in CI. -Building from source creates a library called ```libmxnet.so``` in the `lib` folder in your MXNet project root. +```bash + rm -rf build + mkdir -p build && cd build + cmake -GNinja \ + -DUSE_CUDA=ON \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + .. + ninja ``` -You may also want to add the MXNet shared library to your `LD_LIBRARY_PATH`: +*Note* - You can explore and use more compilation options as they are declared at the top of `CMakeLists.txt` and also review common [usage examples](build_from_source.html#usage-examples).
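To confirm which options a given build actually enabled, the runtime feature-detection API can be queried from Python once the bindings are installed (a sketch, assuming this tree ships the `mxnet.runtime` module; the feature names mirror the flags in `CMakeLists.txt` and `libinfo.h`):

```python
import mxnet as mx

features = mx.runtime.Features()
print(features)                                  # full table of compile-time feature flags
print(features.is_enabled('CUDA'))               # True only for the GPU build above
print(features.is_enabled('MKLDNN'))             # True for the MKL-DNN build
print(features.is_enabled('INT64_TENSOR_SIZE'))  # the large-tensor switch added in this change
```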
+Optionally, you can also use a higher level, scripted version of the above with an editable CMake options file by doing the +following: ```bash -export LD_LIBRARY_PATH=$PWD/lib +cp cmake/cmake_options.yml . +# Edit cmake_options.yml in the MXNet root to your taste +$EDITOR cmake_options.yml +# Launch a local CMake build +./dev_menu.py build ``` +Building from source creates a library called ```libmxnet.so``` in the `build` folder in your MXNet project root. + After building the MXNet library, you may install language bindings.
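For the Python binding specifically, a quick way to verify that the package picked up the freshly built library (a sketch; the exact path depends on how you installed the binding) is:

```python
import mxnet as mx
from mxnet import libinfo

# find_lib_path() reports which shared library the Python package resolved.
print(libinfo.find_lib_path())  # expected to point into your MXNet build/ or lib/ folder
print(mx.__version__)
```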
diff --git a/example/gluon/super_resolution/data.py b/example/gluon/data.py similarity index 100% rename from example/gluon/super_resolution/data.py rename to example/gluon/data.py diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 2f9d74dc5ba0..d3c679455a57 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -182,7 +182,7 @@ typedef int (*CustomOpFBFunc)(int /*size*/, void** /*ptrs*/, int* /*tags*/, typedef int (*CustomOpDelFunc)(void* /*state*/); typedef int (*CustomOpListFunc)(char*** /*args*/, void* /*state*/); typedef int (*CustomOpInferShapeFunc)(int /*num_input*/, int* /*ndims*/, - unsigned** /*shapes*/, void* /*state*/); + int** /*shapes*/, void* /*state*/); typedef int (*CustomOpInferStorageTypeFunc)(int /*num_input*/, int* /*stypes*/, void* /*state*/); typedef int (*CustomOpBackwardInferStorageTypeFunc)(int /*num_input*/, int * /*stypes*/, @@ -768,7 +768,8 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, bool reverse, NDArrayHandle *out); /*! - * \brief get the shape of the array + * \brief DEPRECATED. Use MXNDArrayGetShapeEx instead. + * get the shape of the array * \param handle the handle to the narray * \param out_dim the output dimension * \param out_pdata pointer holder to get data pointer of the shape @@ -777,6 +778,16 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle, mx_uint *out_dim, const mx_uint **out_pdata); +/*! + * \brief get the shape of the array + * \param handle the handle to the narray + * \param out_dim the output dimension + * \param out_pdata pointer holder to get data pointer of the shape + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXNDArrayGetShapeEx(NDArrayHandle handle, + int *out_dim, + const int **out_pdata); /*! * \brief get the content of the data in NDArray * \param handle the handle to the ndarray @@ -812,6 +823,7 @@ MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayFromDLPack(DLManagedTensorHandle dlpack, NDArrayHandle *out_handle); + /*! * \brief Delete a dlpack tensor * \param dlpack the pointer of the input DLManagedTensor @@ -1048,6 +1060,19 @@ MXNET_DLL int MXAutogradIsRecording(bool* curr); * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXAutogradIsTraining(bool* curr); +/*! + * \brief get whether numpy compatibility is on + * \param curr returns the current status + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXIsNumpyCompatible(bool* curr); +/*! + * \brief set numpy compatibility switch + * \param is_np_comp 1 when numpy compatibility is on, 0 when off + * \param prev returns the previous status before this set + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSetIsNumpyCompatible(int is_np_comp, int* prev); /*! * \brief mark NDArrays as variables to compute gradient for autograd * \param num_var number of variable NDArrays @@ -1145,14 +1170,7 @@ MXNET_DLL int MXInvokeCachedOpEx(CachedOpHandle handle, int *num_outputs, NDArrayHandle **outputs, const int** out_stypes); -/*! - * \brief invoke cached operator - */ -MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle, - int num_inputs, - NDArrayHandle *inputs, - int *num_outputs, - NDArrayHandle **outputs); + //-------------------------------------------- // Part 3: symbolic configuration generation //-------------------------------------------- @@ -1468,7 +1486,8 @@ MXNET_DLL int MXSymbolGrad(SymbolHandle sym, const char** wrt, SymbolHandle* out); /*! 
- * \brief infer shape of unknown input shapes given the known one. + * \brief DEPRECATED. Use MXSymbolInferShapeEx instead. + * infer shape of unknown input shapes given the known one. * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. * @@ -1504,8 +1523,47 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym, const mx_uint **aux_shape_ndim, const mx_uint ***aux_shape_data, int *complete); + /*! - * \brief partially infer shape of unknown input shapes given the known one. + * \brief infer shape of unknown input shapes given the known one. + * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data + * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. + * + * \param sym symbol handle + * \param num_args number of input arguments. + * \param keys the key of keyword args (optional) + * \param arg_ind_ptr the head pointer of the rows in CSR + * \param arg_shape_data the content of the CSR + * \param in_shape_size size of the returning array of in_shapes + * \param in_shape_ndim returning array of shape dimensions of each input shape. + * \param in_shape_data returning array of pointers to head of the input shape. + * \param out_shape_size size of the returning array of out_shapes + * \param out_shape_ndim returning array of shape dimensions of each output shape. + * \param out_shape_data returning array of pointers to head of the output shape. + * \param aux_shape_size size of the returning array of aux_shapes + * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. + * \param aux_shape_data returning array of pointers to head of the auxiliary shape. + * \param complete whether infer shape completes or more information is needed. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSymbolInferShapeEx(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const int *arg_shape_data, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *complete); +/*! + * \brief DEPRECATED. Use MXSymbolInferShapePartialEx instead. + * partially infer shape of unknown input shapes given the known one. * * Return partially inferred results if not all shapes could be inferred. * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data @@ -1544,6 +1602,47 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym, const mx_uint ***aux_shape_data, int *complete); + +/*! + * \brief partially infer shape of unknown input shapes given the known one. + * + * Return partially inferred results if not all shapes could be inferred. + * The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data + * The call will be treated as a kwargs call if key != nullptr or num_args==0, otherwise it is positional. + * + * \param sym symbol handle + * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional) + * \param arg_ind_ptr the head pointer of the rows in CSR + * \param arg_shape_data the content of the CSR + * \param in_shape_size size of the returning array of in_shapes + * \param in_shape_ndim returning array of shape dimensions of each input shape. + * \param in_shape_data returning array of pointers to head of the input shape. + * \param out_shape_size size of the returning array of out_shapes + * \param out_shape_ndim returning array of shape dimensions of each output shape. + * \param out_shape_data returning array of pointers to head of the output shape. + * \param aux_shape_size size of the returning array of aux_shapes + * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape. + * \param aux_shape_data returning array of pointers to head of the auxiliary shape. + * \param complete whether infer shape completes or more information is needed. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXSymbolInferShapePartialEx(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const int *arg_shape_data, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *complete); + /*! * \brief infer type of unknown input types given the known one. * The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data @@ -1807,7 +1906,8 @@ MXNET_DLL int MXExecutorBindEX(SymbolHandle symbol_handle, NDArrayHandle *aux_states, ExecutorHandle shared_exec, ExecutorHandle *out); - +/*! \brief DEPRECATED. Use MXExecutorSimpleBindEx instead. + */ MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle, int dev_type, int dev_id, @@ -1843,8 +1943,44 @@ MXNET_DLL int MXExecutorSimpleBind(SymbolHandle symbol_handle, ExecutorHandle shared_exec_handle, ExecutorHandle* out); -/*! - * \brief Return a new executor with the same symbol and shared memory, + +MXNET_DLL int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const mx_uint num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const mx_uint provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int* provided_arg_shape_data, + const mx_uint* provided_arg_shape_idx, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const mx_uint num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out); +/*! + * \brief DEPRECATED. Use MXExecutorReshapeEx instead. + * Return a new executor with the same symbol and shared memory, * but different input/output shapes.
* * \param partial_shaping Whether to allow changing the shape of unspecified arguments. @@ -1883,6 +2019,46 @@ MXNET_DLL int MXExecutorReshape(int partial_shaping, NDArrayHandle** aux_states, ExecutorHandle shared_exec, ExecutorHandle *out); +/*! + * \brief Return a new executor with the same symbol and shared memory, + * but different input/output shapes. + * + * \param partial_shaping Whether to allow changing the shape of unspecified arguments. + * \param allow_up_sizing Whether to allow allocating new ndarrays that's larger than the original. + * \param dev_type device type of default context + * \param dev_id device id of default context + * \param num_map_keys size of group2ctx map + * \param map_keys keys of group2ctx map + * \param map_dev_types device type of group2ctx map + * \param map_dev_ids device id of group2ctx map + * \param num_in_args length of in_args + * \param in_args in args array + * \param arg_grads arg grads handle array + * \param num_aux_states length of auxiliary states + * \param aux_states auxiliary states array + * \param shared_exec input executor handle for memory sharing + * \param out output executor handle + * \return a new executor + */ +MXNET_DLL int MXExecutorReshapeEx(int partial_shaping, + int allow_up_sizing, + int dev_type, + int dev_id, + mx_uint num_map_keys, + const char** map_keys, + const int* map_dev_types, + const int* map_dev_ids, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int* provided_arg_shape_data, + const mx_uint* provided_arg_shape_idx, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec, + ExecutorHandle *out); /*! * \brief get optimized graph from graph executor @@ -2542,7 +2718,8 @@ MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** ar MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, int* shared_id); /*! - * \brief Reconstruct NDArray from shared memory handle + * \brief DEPRECATED. Use MXNDArrayCreateFromSharedMemEx instead. + * Reconstruct NDArray from shared memory handle * \param shared_pid shared PID * \param shared_id shared memory id * \param shape pointer to NDArray dimensions @@ -2553,6 +2730,19 @@ MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *shape, mx_uint ndim, int dtype, NDArrayHandle *out); + +/*! + * \brief Reconstruct NDArray from shared memory handle + * \param shared_pid shared PID + * \param shared_id shared memory id + * \param shape pointer to NDArray dimensions + * \param ndim number of NDArray dimensions + * \param dtype data type of NDArray + * \param out constructed NDArray + */ +MXNET_DLL int MXNDArrayCreateFromSharedMemEx(int shared_pid, int shared_id, const int *shape, + int ndim, int dtype, NDArrayHandle *out); + /*! * \brief Push an asynchronous operation to the engine. 
* \param async_func Execution function which takes a parameter on_complete @@ -2574,8 +2764,9 @@ MXNET_DLL int MXEnginePushAsync(EngineAsyncFunc async_func, void* func_param, EngineFuncParamDeleter deleter, ContextHandle ctx_handle, EngineVarHandle const_vars_handle, int num_const_vars, EngineVarHandle mutable_vars_handle, int num_mutable_vars, - EngineFnPropertyHandle prop_handle = NULL, int priority = 0, - const char* opr_name = NULL, bool wait = false); + EngineFnPropertyHandle prop_handle DEFAULT(NULL), + int priority DEFAULT(0), const char* opr_name DEFAULT(NULL), + bool wait DEFAULT(false)); /*! * \brief Push a synchronous operation to the engine. @@ -2596,8 +2787,8 @@ MXNET_DLL int MXEnginePushSync(EngineSyncFunc sync_func, void* func_param, EngineFuncParamDeleter deleter, ContextHandle ctx_handle, EngineVarHandle const_vars_handle, int num_const_vars, EngineVarHandle mutable_vars_handle, int num_mutable_vars, - EngineFnPropertyHandle prop_handle = NULL, int priority = 0, - const char* opr_name = NULL); + EngineFnPropertyHandle prop_handle DEFAULT(NULL), + int priority DEFAULT(0), const char* opr_name DEFAULT(NULL)); #ifdef __cplusplus } diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index 408a70a5feed..9d6367509f79 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -106,7 +106,9 @@ enum class FnProperty { /*! \brief Delete variable call */ kDeleteVar, /*! \brief Prioritized sync operation on GPU */ - kGPUPrioritized + kGPUPrioritized, + /*! \brief Operation not to be skipped even with associated exception */ + kNoSkip }; // enum class FnProperty /*! @@ -230,6 +232,8 @@ class MXNET_API Engine { * \brief Wait until all the activity of engine finishes. */ virtual void WaitForAll() = 0; + /*!\brief Throw if there is an associated exception with var */ + virtual void Throw(VarHandle var) = 0; /*!\brief virtual destructor */ virtual ~Engine() noexcept(false) {} /*! diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h index 52cedb2fadd9..ad209913ac53 100644 --- a/include/mxnet/imperative.h +++ b/include/mxnet/imperative.h @@ -97,6 +97,16 @@ class Imperative { is_recording_ = is_recording; return old; } + /*! \brief whether numpy compatibility is on. */ + bool is_np_comp() const { + return is_np_comp_; + } + /*! \brief turn on or turn off numpy compatibility switch. */ + bool set_is_np_comp(bool is_np_comp) { + bool old = is_np_comp_; + is_np_comp_ = is_np_comp; + return old; + } /*! \brief to record operator, return corresponding node. */ void RecordOp(nnvm::NodeAttrs&& attrs, const std::vector<NDArray*>& inputs, @@ -165,9 +175,15 @@ #if DMLC_CXX11_THREAD_LOCAL static thread_local bool is_train_; static thread_local bool is_recording_; + // TODO(junwu): Added numpy compatibility switch for backward compatibility. + // Delete it in the next major release. + static thread_local bool is_np_comp_; #else static MX_THREAD_LOCAL bool is_train_; static MX_THREAD_LOCAL bool is_recording_; + // TODO(junwu): Added numpy compatibility switch for backward compatibility. + // Delete it in the next major release. + static MX_THREAD_LOCAL bool is_np_comp_; #endif /*!
\brief node count used for naming */ std::atomic node_count_{0}; diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h index f35d41a9aa8a..8b58a398c673 100644 --- a/include/mxnet/libinfo.h +++ b/include/mxnet/libinfo.h @@ -123,7 +123,9 @@ #define MXNET_USE_SIGNAL_HANDLER 0 #endif - +#ifndef MXNET_USE_INT64_TENSOR_SIZE +#define MXNET_USE_INT64_TENSOR_SIZE MSHADOW_INT64_TENSOR_SIZE +#endif namespace mxnet { namespace features { @@ -177,6 +179,8 @@ enum : unsigned { PROFILER, DIST_KVSTORE, CXX14, + INT64_TENSOR_SIZE, + // Signal handler to print stack traces on exceptions SIGNAL_HANDLER, DEBUG, diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index d00cb479b92e..05d3fa45683e 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -850,26 +850,35 @@ class NDArray { mxnet::ShapeVector aux_shapes; /*! \brief Reference to the storage to ensure proper destruct order */ std::shared_ptr<Storage> storage_ref_; + /*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */ + std::weak_ptr<Engine> engine_ref_; - /*! \brief default cosntructor */ + + /*! \brief default constructor */ Chunk() : static_data(true), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) {} + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) {} /*! \brief construct a new chunk */ Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype) : static_data(false), delay_alloc(true), ctx(ctx_), - storage_ref_(Storage::_GetSharedRef()) { - auto size = shape.Size(); + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { storage_shape = shape; + if (shape_is_known(storage_shape)) { + shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); + } var = Engine::Get()->NewVariable(); - shandle.size = size * mshadow::mshadow_sizeof(dtype); shandle.ctx = ctx_; - if (!delay_alloc_) this->CheckAndAlloc(); + if (!delay_alloc_) { + this->CheckAndAlloc(); + } } Chunk(const TBlob &data, int dev_id) : static_data(true), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { CHECK(storage_type == kDefaultStorage); var = Engine::Get()->NewVariable(); if (data.dev_mask() == cpu::kDevMask) { @@ -887,7 +896,8 @@ class NDArray { Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype) : static_data(false), delay_alloc(false), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); @@ -903,7 +913,8 @@ const mxnet::ShapeVector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), - aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) { + aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()), + engine_ref_(Engine::_GetSharedRef()) { shandle.ctx = ctx; var = Engine::Get()->NewVariable(); // aux_handles always reflect the correct number of aux data @@ -921,7 +932,7 @@ Chunk(const NDArrayStorageType storage_type_, const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id) : static_data(true), delay_alloc(false), storage_type(storage_type_), - storage_ref_(Storage::_GetSharedRef()) { + storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) { using
namespace mshadow; CHECK_NE(storage_type, kDefaultStorage); // init var @@ -953,7 +964,7 @@ class NDArray { /*! \brief set the shape for ith aux data, and update storage shape if necessary */ inline void set_aux_shape(const size_t i, const mxnet::TShape& shape) { aux_shapes[i] = shape; - if (storage_shape.ndim() > 0) { + if (storage_shape.ndim() >= 0) { if (storage_type == kRowSparseStorage && i == rowsparse::kIdx) { storage_shape[0] = shape[0]; } else if (storage_type == kCSRStorage && i == csr::kIdx) { diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 7d059025b03e..a08dab1b1b74 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -198,7 +198,6 @@ class TBlob { << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType<DType>::kFlag; return mshadow::Tensor<Device, 2, DType>(static_cast<DType*>(dptr_), shape_.FlatTo2D(), - shape_[shape_.ndim() - 1], stream); } /*! @@ -219,15 +218,16 @@ return shape_.ndim(); } /*! - * \brief return size of i-th dimension, start counting from highest dimension + * \brief return size of i-th dimension, start counting from highest dimension. + * return type needs to be a signed integer. * \param idx the dimension count from the highest dimension - * \return the size + * \return the size. -1 means unknown size to support zero-size tensor. */ inline index_t size(index_t idx) const { return shape_[idx]; } /*! \brief total number of elements in the tensor */ - inline index_t Size(void) const { + inline size_t Size(void) const { return shape_.Size(); } /*! \brief get pointer in dtype */ @@ -419,6 +419,8 @@ class TBlob { namespace dmlc { // Add a few patches to support mxnet::TShape in dmlc/parameter. DMLC_DECLARE_TYPE_NAME(mxnet::TShape, "Shape(tuple)"); +DMLC_DECLARE_TYPE_NAME(mxnet::Tuple<int>, "Shape(tuple)"); +DMLC_DECLARE_TYPE_NAME(mxnet::Tuple<dmlc::optional<int>>, "Shape(tuple)"); DMLC_DECLARE_TYPE_NAME(nnvm::Tuple<int>, "Shape(tuple)"); DMLC_DECLARE_TYPE_NAME(nnvm::Tuple<dmlc::optional<int>>, "Shape(tuple)"); @@ -442,7 +444,7 @@ class FieldEntry<mxnet::TShape> throw dmlc::ParamError(os.str()); } if (enforce_nonzero_) { - for (mxnet::index_t i = 0; i < v.ndim(); ++i) { + for (int i = 0; i < v.ndim(); ++i) { if (v[i] == 0U) { std::ostringstream os; os << "value " << v << " for Parameter " << this->key_ @@ -456,7 +458,7 @@ this->enforce_nonzero_ = true; return this->self(); } - inline FieldEntry<mxnet::TShape> &set_expect_ndim(mxnet::index_t ndim) { + inline FieldEntry<mxnet::TShape> &set_expect_ndim(int ndim) { expect_ndim_ = ndim; return this->self(); } @@ -465,7 +467,7 @@ // whether all the entries need to be nonzero bool enforce_nonzero_; // expected number of dimension, default = 0 means no restriction. - mxnet::index_t expect_ndim_; + int expect_ndim_; }; } // namespace parameter diff --git a/include/mxnet/tuple.h b/include/mxnet/tuple.h index 7c1367333630..bc630f153744 100644 --- a/include/mxnet/tuple.h +++ b/include/mxnet/tuple.h @@ -17,7 +17,7 @@ * under the License. */ /*! - * Copyright (c) 2016 by Contributors + * Copyright (c) 2019 by Contributors * \file mxnet/tuple.h * \brief Data structure Tuple and TShape to store dynamic sized shapes. */ @@ -39,11 +39,14 @@ namespace mxnet { /*! * \brief A dynamic sized array data structure that is optimized for storing - * small number of elements with same type. + * small number of elements with same type. * * Data will be stored in stack when number of elements is small. * It is suitable to hold shape of Tensor. * + * The ndim of a valid tuple is an integer in range [0, inf). + * ndim = 0 means the tuple is empty.
+ * * \tparam ValueType The type of data stored inside tuple. * \sa TShape */ template<typename ValueType> class Tuple { public: /*! * \brief constructor from another tuple * \param s the source tuple */ inline Tuple(const Tuple<ValueType>& s) { - this->assign(s.begin(), s.end()); + if (s.ndim() == -1) { + this->SetDim(-1); + } else { + this->assign(s.begin(), s.end()); + } } /*! * \brief constructor from initializer list @@ -106,6 +113,7 @@ class Tuple { template<typename RandomAccessIterator> inline void assign(RandomAccessIterator begin, RandomAccessIterator end) { this->SetDim(end - begin); + CHECK_GE(ndim(), 0); std::copy(begin, end, this->begin()); } /*! @@ -124,7 +132,11 @@ * \return reference of self */ inline Tuple<ValueType>& operator=(const Tuple<ValueType>& src) { - this->assign(src.begin(), src.end()); + if (src.ndim() == -1) { + this->SetDim(-1); + } else { + this->assign(src.begin(), src.end()); + } return *this; } /*! @@ -151,6 +163,7 @@ */ inline bool operator==(const Tuple<ValueType> &s) const { if (ndim_ != s.ndim_) return false; + if (ndim() == -1) return true; return std::equal(begin(), end(), s.begin()); } /*! @@ -177,7 +190,7 @@ return ndim_ <= kStackCache ? (data_stack_ + ndim_): (data_heap_ + ndim_); } /*! \return number of dimension of the tuple */ - inline uint32_t ndim() const { + inline int ndim() const { return ndim_; } /*! @@ -185,7 +198,8 @@ * \param i dimension index * \return the corresponding dimension size */ - inline ValueType& operator[](size_t i) { + inline ValueType& operator[](int i) { + CHECK(i >= 0 && i < ndim()) << "index = " << i << " must be in range [0, " << ndim() << ")"; return begin()[i]; } /*! @@ -193,7 +207,8 @@ * \param i dimension index * \return the corresponding dimension size */ - inline const ValueType& operator[](size_t i) const { + inline const ValueType& operator[](int i) const { + CHECK(i >= 0 && i < ndim()) << "index = " << i << " must be in range [0, " << ndim() << ")"; return begin()[i]; } /*! @@ -220,6 +235,13 @@ * \return the ostream */ friend std::ostream &operator<<(std::ostream &os, const Tuple<ValueType> &t) { + if (t.ndim() == -1) { + // If t is an unknown shape, return string "None". + // This is consistent with returning unknown shape in Python and generating + // C++ operator APIs by OpWrapperGenerator.py (defaultString) in cpp-package. + os << "None"; + return os; + } os << '['; const ValueType* begin = t.begin(); const ValueType* end = t.end(); @@ -252,14 +274,16 @@ if (!isspace(ch)) { is.setstate(std::ios::failbit); return is; + } } - } - // Handle empty tuple + // Handle empty tuple. A tensor whose shape is an empty tuple + // represents a scalar with ndim = 0. while (isspace(is.peek())) { is.get(); } if (is.peek() == ')' || is.peek() == ']') { is.get(); + t.SetDim(0); return is; } // Handle non-empty tuple @@ -316,48 +340,85 @@ protected: // stack cache size - static const uint32_t kStackCache = 4; + static const int kStackCache = 4; /*! \brief number of dimension of the tuple */ - uint32_t ndim_{0}; + int ndim_{0}; /*! \brief number of cells allocated in data_heap_ */ - uint32_t num_heap_allocated_{0}; + int num_heap_allocated_{0}; /*! \brief in stack space used to store shape when it is small */ ValueType data_stack_[kStackCache]; /*!
\brief space to store shape when dimension is big*/ ValueType* data_heap_{nullptr}; // internal function to change the dimension - inline void SetDim(uint32_t ndim) { + inline void SetDim(int ndim) { + CHECK_GE(ndim, -1) << "ndim cannot be less than -1, received " << ndim; if (ndim > kStackCache && ndim > num_heap_allocated_) { delete [] data_heap_; data_heap_ = new ValueType[ndim]; num_heap_allocated_ = ndim; + } else if (ndim <= 0 && data_heap_ != nullptr) { + delete [] data_heap_; + data_heap_ = nullptr; + num_heap_allocated_ = 0; } ndim_ = ndim; } }; + +/*! \brief check if a shape's ndim is known. */ +inline bool ndim_is_known(const int ndim) { + CHECK_GE(ndim, -1) << "shape ndim must be >= -1, while received " << ndim; + return ndim != -1; +} + +/*! \brief check if a shape's dim size is known. */ +inline bool dim_size_is_known(const dim_t dim_size) { + CHECK_GE(dim_size, -1) << "shape dim size must be >= -1, while received " << dim_size; + return dim_size != -1; +} + /*! * \brief A Shape class that is used to represent shape of each tensor. + * + * The ndim of a valid shape is an integer in range [-1, inf). + * ndim = -1 means the shape information is unknown and need to be inferred. + * ndim = 0 means the tensor with the shape is a scalar. + * + * The dimension size of a valid shape is an integer in range [-1, inf). + * dim_size = -1 means the size of that dimension is unknown and need to be inferred. + * dim_size = 0 means that dimension is empty. + * + * The definition of ndim = 0 and dim_size = 0 is consistent with NumPy. */ class TShape : public Tuple<dim_t> { public: /*! \brief default constructor */ - TShape() = default; + TShape() { + this->SetDim(-1); + } /*! - * constructor to construct a shape with all 1. + * constructor to construct a shape with all `value`. * \param ndim the number of dimension + * \param value the dimension size for all dims */ - inline TShape(uint32_t ndim) { // NOLINT(*) + inline TShape(const int ndim, const dim_t value) { // NOLINT(*) this->SetDim(ndim); - std::fill_n(begin(), ndim, 1); + if (ndim > 0) { + std::fill_n(begin(), ndim, value); + } } /*! * \brief copy constructor of TShape * \param s source shape. */ inline TShape(const Tuple<dim_t>& s) { // NOLINT(*) - this->assign(s.begin(), s.end()); + if (s.ndim() == -1) { + this->SetDim(-1); + } else { + this->assign(s.begin(), s.end()); + } } /*! * \brief constructor from initializer list @@ -374,12 +435,17 @@ this->swap(s); } /*! - * \brief construct the Tuple from content of iterator + * \brief construct the Tuple from content of iterator. + * This function is enforced with template arguments of random access iterator types. + * This is necessary to distinguish from another constructor: TShape(const int, const dim_t). * \param begin the beginning of iterator * \param end end the end of the iterator * \tparam RandomAccessIterator iterator type */ - template<typename RandomAccessIterator> + template<typename RandomAccessIterator, + typename std::enable_if<std::is_same<typename std::iterator_traits<RandomAccessIterator>::iterator_category, + std::random_access_iterator_tag>::value, int>::type = 0> inline TShape(RandomAccessIterator begin, RandomAccessIterator end) { this->assign(begin, end); @@ -390,7 +456,11 @@ * \return self. */ inline TShape& operator=(const Tuple<dim_t>& src) { - this->assign(src.begin(), src.end()); + if (src.ndim() == -1) { + this->SetDim(-1); + } else { + this->assign(src.begin(), src.end()); + } return *this; } /*! @@ -404,9 +474,11 @@ } /*!
\return total number of elements in the shape */ inline size_t Size() const { + CHECK(ndim_is_known(this->ndim())) << "Shape is unknown."; dim_t size = 1; const dim_t* start = begin(), *fin = end(); for (const dim_t* it = start; it != fin; ++it) { + CHECK(dim_size_is_known(*it)) << "Shape dim size cannot be a negative value " << *it; size *= *it; } return size; } /*! * \brief get the product of shape in [dimstart,dimend) * \param dimstart start dimension * \param dimend end dimension */ inline size_t ProdShape(int dimstart, int dimend) const { + CHECK(ndim_is_known(this->ndim())) << "Shape is unknown."; + CHECK_GE(dimstart, 0) << "dimstart must be >= 0, while received " << dimstart; + CHECK_LE(dimend, this->ndim()) << "dimend must be <= " << this->ndim() + << ", while received " << dimend; dim_t num = 1; const dim_t *d = this->data(); for (int i = dimstart; i < dimend; ++i) { + CHECK(dim_size_is_known(d[i])) << "Shape dim size must be known, while received " << d[i]; num *= d[i]; } return num; } @@ -460,7 +537,7 @@ */ template<int dim> inline mshadow::Shape<dim> get() const { - CHECK_EQ(dim, static_cast<int>(ndim())) + CHECK_EQ(dim, ndim()) << "dimension do not match target dimension " << dim << " vs " << ndim(); const dim_t *d = this->data(); mshadow::Shape<dim> s; @@ -475,11 +552,12 @@ */ inline mshadow::Shape<2> FlatTo2D(void) const { mshadow::Shape<2> s; - if (ndim() == 0) return mshadow::Shape2(0, 0); + CHECK(ndim_is_known(ndim())) << "shape must have a valid ndim"; + if (ndim() == 0) return mshadow::Shape2(1, 1); const dim_t *d = this->data(); s.shape_[1] = d[ndim() - 1]; dim_t ymax = 1; - for (size_t i = 1; i < ndim(); ++i) { + for (int i = 1; i < ndim(); ++i) { ymax *= d[i - 1]; } s.shape_[0] = ymax; @@ -491,22 +569,23 @@ * \param axis_end The ending axis specified. * \return the flat 3d shape */ - inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const { + inline mshadow::Shape<3> FlatTo3D(int axis_begin, int axis_end) const { CHECK(axis_end >= axis_begin); mshadow::Shape<3> s; - if (ndim() == 0) return mshadow::Shape3(0, 0, 0); + CHECK(ndim_is_known(ndim())) << "shape must have a valid ndim"; + if (ndim() == 0) return mshadow::Shape3(1, 1, 1); const dim_t *d = this->data(); s.shape_[0] = 1; s.shape_[1] = 1; s.shape_[2] = 1; - for (size_t i = 0; i < axis_begin; ++i) { + for (int i = 0; i < axis_begin; ++i) { s.shape_[0] *= d[i]; } - for (size_t i = axis_begin; i <= axis_end; ++i) { + for (int i = axis_begin; i <= axis_end; ++i) { s.shape_[1] *= d[i]; } - for (size_t i = axis_end + 1; i < ndim(); ++i) { + for (int i = axis_end + 1; i < ndim(); ++i) { s.shape_[2] *= d[i]; } return s; @@ -516,7 +595,7 @@ * \param axis The axis specified. * \return the flat 3d shape */ - inline mshadow::Shape<3> FlatTo3D(size_t axis) const { + inline mshadow::Shape<3> FlatTo3D(int axis) const { return FlatTo3D(axis, axis); } inline bool operator==(const TShape &s) const { @@ -552,6 +631,28 @@ #endif }; +/*! \brief check if shape is known using the NumPy compatible definition.
+ * zero-dim and zero-size tensors are valid. -1 means unknown.*/ +inline bool shape_is_known(const TShape& x) { + if (!ndim_is_known(x)) return false; + for (int i = 0; i < x.ndim(); ++i) { + if (!dim_size_is_known(x, i)) return false; + } + return true; +} + /*! \brief helper function to cast type of container elements */ template<typename SrcIter, typename DstIter> inline DstIter ShapeTypeCast(const SrcIter begin, @@ -567,7 +668,7 @@ inline DstIter ShapeTypeCast(const SrcIter begin, template<typename SrcIter> inline TShape ShapeTypeCast(const SrcIter begin, const SrcIter end) { size_t ndim = std::distance(begin, end); - TShape res(ndim); + TShape res(ndim, -1); ShapeTypeCast(begin, end, res.begin()); return res; } @@ -611,9 +712,9 @@ template<typename T> struct hash<mxnet::Tuple<T> > { /*! \brief hash a Tuple into unsigned int */ size_t operator()(const mxnet::Tuple<T>& val) const { - std::hash<uint32_t> hash_uint; - size_t res = hash_uint(val.ndim()); - for (uint32_t i = 0; i < val.ndim(); ++i) { + std::hash<int> hash_int; + size_t res = hash_int(val.ndim()); + for (int i = 0; i < val.ndim(); ++i) { res = dmlc::HashCombine(res, val[i]); } return res; @@ -625,9 +726,9 @@ template<> struct hash<mxnet::TShape> { /*! \brief hash a TShape into unsigned int */ size_t operator()(const mxnet::TShape& val) const { - std::hash<uint32_t> hash_uint; - size_t res = hash_uint(val.ndim()); - for (uint32_t i = 0; i < val.ndim(); ++i) { + std::hash<int> hash_int; + size_t res = hash_int(val.ndim()); + for (int i = 0; i < val.ndim(); ++i) { res = dmlc::HashCombine(res, val[i]); } return res; @@ -638,6 +739,7 @@ struct hash<mxnet::TShape> { namespace dmlc { /*! \brief description for optional TShape */ DMLC_DECLARE_TYPE_NAME(optional<mxnet::TShape>, "Shape or None"); +DMLC_DECLARE_TYPE_NAME(optional<mxnet::Tuple<int>>, "Shape or None"); // avoid low version of MSVC #if !defined(_MSC_VER) template diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index 26695e8c4769..638963f1b8aa 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -886,18 +886,20 @@ function test_saveload() end function test_clamp() - dims = rand_dims() - @info("NDArray::clamp::dims = $dims") - - j_array, nd_array = rand_tensors(dims) - clip_up = maximum(abs.(j_array)) / 2 - clip_down = 0 - clipped = clamp(nd_array, clip_down, clip_up) + @info("NDArray::clamp::dims") + + A = [1 2 3; + 4 5 6; + 7 8 9.] + B = [3 3 3; + 4 5 6; + 7 8 8.] + x = NDArray(A) + y = clamp(x, 3., 8.) # make sure the original array is not modified - @test copy(nd_array) ≈ j_array - - @test all(clip_down .<= copy(clipped) .<= clip_up) + @test copy(x) ≈ A + @test copy(y) ≈ B @info("NDArray::clamp!") let diff --git a/make/config.mk b/make/config.mk index f9ac4cf10048..20834675ecbd 100644 --- a/make/config.mk +++ b/make/config.mk @@ -183,7 +183,8 @@ USE_S3 = 0 USE_OPERATOR_TUNING = 1 # Use gperftools if found -USE_GPERFTOOLS = 1 +# Disable because of #8968 +USE_GPERFTOOLS = 0 # path to gperftools (tcmalloc) library in case of a non-standard installation USE_GPERFTOOLS_PATH = @@ -214,6 +215,12 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type to represent the total number of elements in a tensor +# This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e.
2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/make/crosscompile.jetson.mk b/make/crosscompile.jetson.mk index 3db2b98847f3..880e2cf5b466 100644 --- a/make/crosscompile.jetson.mk +++ b/make/crosscompile.jetson.mk @@ -166,7 +166,8 @@ USE_S3 = 0 USE_OPERATOR_TUNING = 1 # Use gperftools if found -USE_GPERFTOOLS = 1 +# Disable because of #8968 +USE_GPERFTOOLS = 0 # path to gperftools (tcmalloc) library in case of a non-standard installation USE_GPERFTOOLS_PATH = @@ -191,6 +192,12 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type to represent the total number of elements in the tensor +# This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/make/osx.mk b/make/osx.mk index 7e32d81a5d71..0b5842e59524 100644 --- a/make/osx.mk +++ b/make/osx.mk @@ -135,6 +135,12 @@ EXTRA_OPERATORS = # Create C++ interface package USE_CPP_PACKAGE = 0 +# Use int64_t type to represent the total number of elements in a tensor +# This will cause performance degradation reported in issue #14496 +# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647 +# Note: the size of each dimension is still bounded by INT32_MAX +USE_INT64_TENSOR_SIZE = 0 + #---------------------------- # plugins #---------------------------- diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm index 573abbf588f2..5844302fce16 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Executor.pm @@ -471,7 +471,7 @@ method reshape(HashRef[Shape] $kwargs, Int :$partial_shaping=0, Int :$allow_up_s my $shared_handle = $self->handle; my ($in_args_and_grad_handles, $aux_state_handles, $handle) = check_call( - AI::MXNetCAPI::ExecutorReshape( + AI::MXNetCAPI::ExecutorReshapeEx( $partial_shaping, $allow_up_sizing, $self->_ctx->device_type_id, diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm index 72f6cc772178..f466aaa11a3d 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm @@ -535,7 +535,7 @@ method wait_to_read() method shape() { - return scalar(check_call(AI::MXNetCAPI::NDArrayGetShape($self->handle))); + return scalar(check_call(AI::MXNetCAPI::NDArrayGetShapeEx($self->handle))); } =head2 size @@ -1460,7 +1460,7 @@ func _new_alloc_handle($shape, $ctx, $delay_alloc, $dtype) method _new_from_shared_mem($shared_pid, $shared_id, $shape, $dtype) { my $hdl = check_call( - AI::MXNetCAPI::NDArrayCreateFromSharedMem( + AI::MXNetCAPI::NDArrayCreateFromSharedMemEx( $shared_pid, $shared_id, $shape, diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm index 04dd1cbfc441..e4953f17031a 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm @@ -662,7 +662,7 @@ method _infer_shape_impl(Maybe[Str|Shape] @args) push @{ $indptr }, scalar(@{ $sdata }); } } - my $infer_func = $partial ? 
\&AI::MXNetCAPI::SymbolInferShapePartial : \&AI::MXNetCAPI::SymbolInferShape; + my $infer_func = $partial ? \&AI::MXNetCAPI::SymbolInferShapePartialEx : \&AI::MXNetCAPI::SymbolInferShapeEx; my ($arg_shapes, $out_shapes, $aux_shapes, $complete) = check_call( $infer_func->( $self->handle, @@ -937,7 +937,7 @@ method simple_bind( ($updated_shared_data, $in_arg_handles, $arg_grad_handles, $aux_state_handles, $exe_handle) = check_call( - AI::MXNetCAPI::ExecutorSimpleBind( + AI::MXNetCAPI::ExecutorSimpleBindEx( $self->handle, $ctx->device_type_id, $ctx->device_id, diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index 0e6a05ea9695..e38402c56100 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -640,9 +640,9 @@ int MXNDArrayReshape64(NDArrayHandle handle, * \param out_pdata pointer holder to get data pointer of the shape * \return 0 when success, -1 when failure happens */ -int MXNDArrayGetShape(NDArrayHandle handle, - mx_uint *out_dim, - const mx_uint **out_pdata); +int MXNDArrayGetShapeEx(NDArrayHandle handle, + int *out_dim, + const int **out_pdata); /*! * \brief get the content of the data in NDArray * \param handle the handle to the ndarray @@ -1289,21 +1289,21 @@ int MXSymbolGrad(SymbolHandle sym, * \param complete whether infer shape completes or more information is needed. * \return 0 when success, -1 when failure happens */ -int MXSymbolInferShape(SymbolHandle sym, - mx_uint num_args, - const char** in, - const mx_uint *in, - const mx_uint *in, - mx_uint *in_shape_size, - const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, - mx_uint *out_shape_size, - const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, - mx_uint *aux_shape_size, - const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, - int *out); +int MXSymbolInferShapeEx(SymbolHandle sym, + mx_uint num_args, + const char** in, + const mx_uint *in, + const int *in, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *out); /*! * \brief partially infer shape of unknown input shapes given the known one. * @@ -1328,21 +1328,21 @@ int MXSymbolInferShape(SymbolHandle sym, * \param complete whether infer shape completes or more information is needed. * \return 0 when success, -1 when failure happens */ -int MXSymbolInferShapePartial(SymbolHandle sym, - mx_uint num_args, - const char** in, - const mx_uint *in, - const mx_uint *in, - mx_uint *in_shape_size, - const mx_uint **in_shape_ndim, - const mx_uint ***in_shape_data, - mx_uint *out_shape_size, - const mx_uint **out_shape_ndim, - const mx_uint ***out_shape_data, - mx_uint *aux_shape_size, - const mx_uint **aux_shape_ndim, - const mx_uint ***aux_shape_data, - int *out); +int MXSymbolInferShapePartialEx(SymbolHandle sym, + mx_uint num_args, + const char** in, + const mx_uint *in, + const int *in, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *out); /*! * \brief infer type of unknown input types given the known one. 
@@ -1535,40 +1535,40 @@ int MXExecutorBindEX(SymbolHandle symbol_handle, ExecutorHandle shared_exec, ExecutorHandle *out); -int MXExecutorSimpleBind(SymbolHandle symbol_handle, - int dev_type, - int dev_id, - const mx_uint num_g2c_keys, - const char** in, // g2c_keys, - const int* in, // g2c_dev_types, - const int* in, // g2c_dev_ids, - const mx_uint provided_grad_req_list_len, - const char** in, // provided_grad_req_names, - const char** in, // provided_grad_req_types, - const mx_uint num_provided_arg_shapes, - const char** in, // provided_arg_shape_names, - const mx_uint* in, // provided_arg_shape_data, - const mx_uint* in, // provided_arg_shape_idx, - const mx_uint num_provided_arg_dtypes, - const char** in, // provided_arg_dtype_names, - const int* in, // provided_arg_dtypes, - const mx_uint num_provided_arg_stypes, - const char** in, // provided_arg_stype_names, - const int* in, // provided_arg_stypes, - const mx_uint num_shared_arg_names, - const char** in, // shared_arg_name_list, - int* shared_buffer_len, - const char** shared_buffer_name_list, - NDArrayHandle* shared_buffer_handle_list, - const char*** updated_shared_buffer_name_list, - NDArrayHandle** updated_shared_buffer_handle_list, - mx_uint* num_in_args, - NDArrayHandle** in_args, - NDArrayHandle** arg_grads, - mx_uint* num_aux_states, - NDArrayHandle** aux_states, - ExecutorHandle shared_exec_handle, - ExecutorHandle* out +int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const mx_uint num_g2c_keys, + const char** in, // g2c_keys, + const int* in, // g2c_dev_types, + const int* in, // g2c_dev_ids, + const mx_uint provided_grad_req_list_len, + const char** in, // provided_grad_req_names, + const char** in, // provided_grad_req_types, + const mx_uint num_provided_arg_shapes, + const char** in, // provided_arg_shape_names, + const int* in, // provided_arg_shape_data, + const mx_uint* in, // provided_arg_shape_idx, + const mx_uint num_provided_arg_dtypes, + const char** in, // provided_arg_dtype_names, + const int* in, // provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** in, // provided_arg_stype_names, + const int* in, // provided_arg_stypes, + const mx_uint num_shared_arg_names, + const char** in, // shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + ExecutorHandle* out ); /*! 
@@ -1592,25 +1592,25 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, * \param out output executor handle * \return a new executor */ -int MXExecutorReshape(int partial_shaping, - int allow_up_sizing, - int dev_type, - int dev_id, - mx_uint num_map_keys, - const char** in, - const int* in, - const int* in, - const mx_uint num_provided_arg_shapes, - const char** in, - const mx_uint* in, - const mx_uint* in, - mx_uint* couple_out_size, - NDArrayHandle** out_first_array, - NDArrayHandle** out_second_array, - mx_uint* out_size, - NDArrayHandle** out_array, - ExecutorHandle shared_exec, - ExecutorHandle *out); +int MXExecutorReshapeEx(int partial_shaping, + int allow_up_sizing, + int dev_type, + int dev_id, + mx_uint num_map_keys, + const char** in, + const int* in, + const int* in, + const mx_uint num_provided_arg_shapes, + const char** in, + const int* in, + const mx_uint* in, + mx_uint* couple_out_size, + NDArrayHandle** out_first_array, + NDArrayHandle** out_second_array, + mx_uint* out_size, + NDArrayHandle** out_array, + ExecutorHandle shared_exec, + ExecutorHandle *out); /*! * \brief set a call back to notify the completion of operation diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i index 50296c2aaba5..3ec9f95ea9c3 100644 --- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i +++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i @@ -524,13 +524,13 @@ } } -%typemap(in,numinputs=0) (mx_uint *out_dim, const mx_uint **out_pdata) (mx_uint temp_dim, mx_uint *temp_pdata) +%typemap(in,numinputs=0) (int *out_dim, const int **out_pdata) (int temp_dim, int *temp_pdata) { $1 = &temp_dim; $2 = &temp_pdata; } -%typemap(argout) (mx_uint *out_dim, const mx_uint **out_pdata) +%typemap(argout) (int *out_dim, const int **out_pdata) { if(!result) { @@ -956,12 +956,12 @@ } } -%typemap(in,numinputs=0) (mx_uint *in_shape_size, const mx_uint **in_shape_ndim, const mx_uint ***in_shape_data) - (mx_uint temp1, mx_uint *temp2, mx_uint **temp3), - (mx_uint *out_shape_size, const mx_uint **out_shape_ndim, const mx_uint ***out_shape_data) - (mx_uint temp1, mx_uint *temp2, mx_uint **temp3), - (mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, const mx_uint ***aux_shape_data) - (mx_uint temp1, mx_uint *temp2, mx_uint **temp3) +%typemap(in,numinputs=0) (mx_uint *in_shape_size, const int **in_shape_ndim, const int ***in_shape_data) + (mx_uint temp1, int *temp2, int **temp3), + (mx_uint *out_shape_size, const int **out_shape_ndim, const int ***out_shape_data) + (mx_uint temp1, int *temp2, int **temp3), + (mx_uint *aux_shape_size, const int **aux_shape_ndim, const int ***aux_shape_data) + (mx_uint temp1, int *temp2, int **temp3) { $1 = &temp1; $2 = &temp2; @@ -969,9 +969,9 @@ *$1 = 0; } -%typemap(argout) (mx_uint *in_shape_size, const mx_uint **in_shape_ndim, const mx_uint ***in_shape_data), - (mx_uint *out_shape_size, const mx_uint **out_shape_ndim, const mx_uint ***out_shape_data), - (mx_uint *aux_shape_size, const mx_uint **aux_shape_ndim, const mx_uint ***aux_shape_data) +%typemap(argout) (mx_uint *in_shape_size, const int **in_shape_ndim, const int ***in_shape_data), + (mx_uint *out_shape_size, const int **out_shape_ndim, const int ***out_shape_data), + (mx_uint *aux_shape_size, const int **aux_shape_ndim, const int ***aux_shape_data) { if(!result && *arg15) { diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 374a3b50bbb5..79eb1f10f427 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -23,7 +23,7 @@ from .context import 
Context, current_context, cpu, gpu, cpu_pinned from . import engine -from .base import MXNetError +from .base import MXNetError, is_np_compat, set_np_compat, np_compat, use_np_compat from . import base from . import contrib from . import ndarray diff --git a/python/mxnet/base.py b/python/mxnet/base.py index feb4d70b6533..58f222dc1e85 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -20,17 +20,18 @@ """ctypes library of mxnet and helper functions.""" from __future__ import absolute_import +from functools import wraps import atexit import ctypes import os import sys import inspect import platform -import numpy as np +import numpy as _np from . import libinfo -__all__ = ['MXNetError'] +__all__ = ['MXNetError', 'is_np_compat', 'set_np_compat', 'np_compat', 'use_np_compat'] #---------------------------- # library loading #---------------------------- @@ -44,8 +45,8 @@ long = int # pylint: enable=pointless-statement -integer_types = (int, long, np.int32, np.int64) -numeric_types = (float, int, long, np.generic) +integer_types = (int, long, _np.int32, _np.int64) +numeric_types = (float, int, long, _np.generic) string_types = basestring, if sys.version_info[0] > 2: @@ -213,10 +214,11 @@ def _load_lib(): _LIB = _load_lib() # type definitions +mx_int = ctypes.c_int mx_uint = ctypes.c_uint mx_float = ctypes.c_float mx_float_p = ctypes.POINTER(mx_float) -mx_real_t = np.float32 +mx_real_t = _np.float32 NDArrayHandle = ctypes.c_void_p FunctionHandle = ctypes.c_void_p OpHandle = ctypes.c_void_p @@ -455,7 +457,7 @@ def ctypes2numpy_shared(cptr, shape): for s in shape: size *= s dbuffer = (mx_float * size).from_address(ctypes.addressof(cptr.contents)) - return np.frombuffer(dbuffer, dtype=np.float32).reshape(shape) + return _np.frombuffer(dbuffer, dtype=_np.float32).reshape(shape) def build_param_doc(arg_names, arg_types, arg_descs, remove_dup=True): @@ -733,3 +735,140 @@ def write_all_str(module_file, module_all_list): ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + + +def set_np_compat(active): + """ + Turns on/off NumPy compatibility. NumPy-compatibility is turned off by default in backend. + + Parameters + ---------- + active : bool + Indicates whether to turn on/off NumPy compatibility. + + Returns + ------- + A bool value indicating the previous state of NumPy compatibility. + """ + prev = ctypes.c_int() + check_call(_LIB.MXSetIsNumpyCompatible(ctypes.c_int(active), ctypes.byref(prev))) + return bool(prev.value) + + +def is_np_compat(): + """ + Checks whether the NumPy compatibility is currently turned on. + NumPy-compatibility is turned off by default in backend. + + Returns + ------- + A bool value indicating whether the NumPy compatibility is currently on. + """ + curr = ctypes.c_bool() + check_call(_LIB.MXIsNumpyCompatible(ctypes.byref(curr))) + return curr.value + + +class _NumpyCompatibilityStateScope(object): + """Scope for managing numpy compatibility state. + Do not use this class directly. Use `np_compat(active)` instead. 
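+
+    The scope is a thin wrapper around `set_np_compat`: it records the previous
+    state of the backend flag on entry and restores that state on exit. A minimal
+    sketch of the equivalent manual save/restore, using only the public functions
+    defined above::
+
+        prev = set_np_compat(True)
+        try:
+            pass  # code that relies on NumPy-compatible shape semantics
+        finally:
+            set_np_compat(prev)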
+ + Example:: + + with _NumpyCompatibilityStateScope(True): + y = model(x) + backward([y]) + + """ + def __init__(self, is_np_compat): #pylint: disable=redefined-outer-name + self._enter_is_np_compat = is_np_compat + self._prev_is_np_compat = None + + def __enter__(self): + if self._enter_is_np_compat is not None: + self._prev_is_np_compat = set_np_compat(self._enter_is_np_compat) + + def __exit__(self, ptype, value, trace): + if self._enter_is_np_compat is not None and self._prev_is_np_compat != self._enter_is_np_compat: + set_np_compat(self._prev_is_np_compat) + + +def np_compat(active=True): + """Returns an activated/deactivated NumPy compatibility state scope to be used in 'with' statement + and captures code that needs the compatibility. + + Example:: + + with mx.np_compat(active=True): + # A scalar tensor's shape is `()`, whose `ndim` is `0`. + scalar = mx.nd.ones(shape=()) + assert scalar.shape == () + + # In NumPy compatible mode, 0 in a shape means that dimension contains zero elements. + data = mx.sym.var("data", shape=(0, 2, 3)) + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape() + assert arg_shapes[0] == (0, 2, 3) + assert out_shapes[0] == (0, 2, 3) + + # -1 means unknown shape dimension size in the new NumPy-compatible shape definition + data = mx.sym.var("data", shape=(-1, 2, 3)) + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] == (-1, 2, 3) + assert out_shapes[0] == (-1, 2, 3) + + # When a shape is completely unknown in NumPy-compatible mode, it is + # represented as `None` in Python. + data = mx.sym.var("data") + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] is None + assert out_shapes[0] is None + + with mx.np_compat(active=False): + # 0 means unknown shape dimension size in the legacy shape definition. + data = mx.sym.var("data", shape=(0, 2, 3)) + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] == (0, 2, 3) + assert out_shapes[0] == (0, 2, 3) + + # When a shape is completely unknown in the legacy mode (default), its ndim is + # equal to 0 and it is represented as `()` in Python. + data = mx.sym.var("data") + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] == () + assert out_shapes[0] == () + """ + return _NumpyCompatibilityStateScope(active) + + +def use_np_compat(func): + """Wraps a function with an activated NumPy-compatibility scope. This ensures + that the execution of the function is guaranteed with NumPy compatible semantics, + such as zero-dim and zero size tensors. + + Example:: + import mxnet as mx + @mx.use_np_compat + def scalar_one(): + return mx.nd.ones(()) + print(scalar_one()) + + Parameters + ---------- + func : a user-provided callable function to be scoped by the NumPy compatibility state. + + Returns + ------- + Function + A function for wrapping the user functions in the NumPy compatibility scope. 
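+
+    Example::
+
+        # a minimal sketch: the compatibility flag is only active while the
+        # wrapped function runs (assuming it was off beforehand, the default)
+        import mxnet as mx
+
+        @mx.use_np_compat
+        def check():
+            assert mx.is_np_compat()
+
+        check()
+        assert not mx.is_np_compat()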
+ """ + @wraps(func) + def _with_np_compat(*args, **kwargs): + with np_compat(active=True): + return func(*args, **kwargs) + + return _with_np_compat diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 7bf867579d6b..9dfe63682f86 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -25,7 +25,7 @@ import copy import numpy as np from .base import _LIB -from .base import mx_uint, NDArrayHandle, ExecutorHandle, py_str +from .base import mx_uint, NDArrayHandle, ExecutorHandle, py_str, mx_int from .base import check_call, c_handle_array, c_array_buf, c_str_array from .ndarray import NDArray from .ndarray import _ndarray_cls @@ -433,29 +433,29 @@ def reshape(self, partial_shaping=False, allow_up_sizing=False, **kwargs): num_aux_states = ctypes.c_uint() aux_state_handles = ctypes.POINTER(NDArrayHandle)() - check_call(_LIB.MXExecutorReshape(ctypes.c_int(int(partial_shaping)), - ctypes.c_int(int(allow_up_sizing)), - ctypes.c_int(self._ctx.device_typeid), - ctypes.c_int(self._ctx.device_id), - mx_uint(len(ctx_map_keys)), - c_str_array(ctx_map_keys), - c_array_buf(ctypes.c_int, - py_array('i', ctx_map_dev_types)), - c_array_buf(ctypes.c_int, - py_array('i', ctx_map_dev_ids)), - mx_uint(len(provided_arg_shape_names)), - c_str_array(provided_arg_shape_names), - c_array_buf(mx_uint, - py_array('I', provided_arg_shape_data)), - c_array_buf(mx_uint, - py_array('I', provided_arg_shape_idx)), - ctypes.byref(num_in_args), - ctypes.byref(in_arg_handles), - ctypes.byref(arg_grad_handles), - ctypes.byref(num_aux_states), - ctypes.byref(aux_state_handles), - shared_handle, - ctypes.byref(handle))) + check_call(_LIB.MXExecutorReshapeEx(ctypes.c_int(int(partial_shaping)), + ctypes.c_int(int(allow_up_sizing)), + ctypes.c_int(self._ctx.device_typeid), + ctypes.c_int(self._ctx.device_id), + mx_uint(len(ctx_map_keys)), + c_str_array(ctx_map_keys), + c_array_buf(ctypes.c_int, + py_array('i', ctx_map_dev_types)), + c_array_buf(ctypes.c_int, + py_array('i', ctx_map_dev_ids)), + mx_uint(len(provided_arg_shape_names)), + c_str_array(provided_arg_shape_names), + c_array_buf(mx_int, + py_array('i', provided_arg_shape_data)), + c_array_buf(mx_uint, + py_array('I', provided_arg_shape_idx)), + ctypes.byref(num_in_args), + ctypes.byref(in_arg_handles), + ctypes.byref(arg_grad_handles), + ctypes.byref(num_aux_states), + ctypes.byref(aux_state_handles), + shared_handle, + ctypes.byref(handle))) arg_arrays = [_ndarray_cls(NDArrayHandle(in_arg_handles[i])) for i in range(num_in_args.value)] diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py index c7dc83176e14..8c51b0a52592 100644 --- a/python/mxnet/gluon/nn/activations.py +++ b/python/mxnet/gluon/nn/activations.py @@ -153,12 +153,13 @@ class ELU(HybridBlock): Outputs: - **out**: output tensor with the same shape as `data`. 
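+
+    A quick sketch of the equivalence between the fused backend operator used in
+    ``hybrid_forward`` below and the elementwise definition it replaces (``mx`` is
+    assumed to be imported as usual)::
+
+        x = mx.nd.array([-2.0, 0.0, 3.0])
+        fused = mx.nd.LeakyReLU(x, act_type='elu', slope=1.0)
+        ref = mx.nd.where(x > 0, x, 1.0 * (mx.nd.exp(x) - 1.0))
+        # fused and ref agree elementwise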
""" + def __init__(self, alpha=1.0, **kwargs): super(ELU, self).__init__(**kwargs) self._alpha = alpha def hybrid_forward(self, F, x): - return F.where(x > 0, x, self._alpha * (F.exp(x) - 1.0)) + return F.LeakyReLU(x, act_type='elu', slope=self._alpha) class SELU(HybridBlock): diff --git a/python/mxnet/model.py b/python/mxnet/model.py index efb51096c368..f44ff041e35d 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -884,6 +884,8 @@ def fit(self, X, y=None, eval_data=None, eval_metric='acc', rescale_grad=(1.0/batch_size), **(self.kwargs)) elif isinstance(self.optimizer, opt.Optimizer): + if not optimizer.idx2name: + optimizer.idx2name = param_idx2name.copy() optimizer = self.optimizer # do training diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index a7d3336e8439..e83751d42974 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -505,14 +505,14 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', batch_size *= kvstore.num_workers rescale_grad = 1.0/batch_size + idx2name = {} + if update_on_kvstore: + idx2name.update(enumerate(self._exec_group.param_names)) + else: + for k in range(len(self._context)): + idx2name.update({i*len(self._context)+k: n + for i, n in enumerate(self._exec_group.param_names)}) if isinstance(optimizer, str): - idx2name = {} - if update_on_kvstore: - idx2name.update(enumerate(self._exec_group.param_names)) - else: - for k in range(len(self._context)): - idx2name.update({i*len(self._context)+k: n - for i, n in enumerate(self._exec_group.param_names)}) optimizer_params = dict(optimizer_params) if 'rescale_grad' not in optimizer_params: optimizer_params['rescale_grad'] = rescale_grad @@ -528,6 +528,8 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "%( optimizer.rescale_grad, rescale_grad) + "Is this intended?", stacklevel=2) + if not optimizer.idx2name: + optimizer.idx2name = idx2name.copy() self._optimizer = optimizer self._kvstore = kvstore diff --git a/python/mxnet/ndarray/_internal.py b/python/mxnet/ndarray/_internal.py index 5f3ce976dbc5..8045d9bd2b14 100644 --- a/python/mxnet/ndarray/_internal.py +++ b/python/mxnet/ndarray/_internal.py @@ -20,8 +20,6 @@ import os as _os import sys as _sys -import numpy as np - try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: from .._ctypes.ndarray import NDArrayBase, CachedOp diff --git a/python/mxnet/ndarray/contrib.py b/python/mxnet/ndarray/contrib.py index 74c355dc1288..1718a2c68d13 100644 --- a/python/mxnet/ndarray/contrib.py +++ b/python/mxnet/ndarray/contrib.py @@ -18,6 +18,7 @@ # coding: utf-8 # pylint: disable=wildcard-import, unused-wildcard-import,redefined-outer-name """Contrib NDArray API of MXNet.""" +from __future__ import absolute_import import math import numpy as np from ..context import current_context diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 87f2712d8a40..d912d38930a5 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -35,7 +35,7 @@ import numpy as np from ..base import _LIB, numeric_types, integer_types from ..base import c_str, c_array, c_array_buf, c_handle_array, mx_real_t -from ..base import mx_uint, NDArrayHandle, check_call, DLPackHandle +from ..base import mx_uint, NDArrayHandle, check_call, DLPackHandle, mx_int from ..base import ctypes2buffer from ..context import Context, current_context from . 
import _internal @@ -47,7 +47,7 @@ "imdecode", "lesser", "lesser_equal", "logical_and", "logical_or", "logical_xor", "maximum", "minimum", "moveaxis", "modulo", "multiply", "not_equal", "onehot_encode", "power", "subtract", "true_divide", "waitall", "_new_empty_handle", "histogram", - "split_v2", "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack"] + "split_v2", "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack", "from_numpy"] _STORAGE_TYPE_UNDEFINED = -1 _STORAGE_TYPE_DEFAULT = 0 @@ -143,11 +143,11 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t): def _new_from_shared_mem(shared_pid, shared_id, shape, dtype): hdl = NDArrayHandle() - check_call(_LIB.MXNDArrayCreateFromSharedMem( + check_call(_LIB.MXNDArrayCreateFromSharedMemEx( ctypes.c_int(shared_pid), ctypes.c_int(shared_id), - c_array(mx_uint, shape), - mx_uint(len(shape)), + c_array(mx_int, shape), + mx_int(len(shape)), ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), ctypes.byref(hdl))) return hdl @@ -1845,11 +1845,14 @@ def shape(self): >>> y.shape (2L, 3L, 4L) """ - ndim = mx_uint() - pdata = ctypes.POINTER(mx_uint)() - check_call(_LIB.MXNDArrayGetShape( + ndim = mx_int() + pdata = ctypes.POINTER(mx_int)() + check_call(_LIB.MXNDArrayGetShapeEx( self.handle, ctypes.byref(ndim), ctypes.byref(pdata))) - return tuple(pdata[:ndim.value]) # pylint: disable=invalid-slice-index + if ndim.value == -1: + return None + else: + return tuple(pdata[:ndim.value]) # pylint: disable=invalid-slice-index @property @@ -4112,3 +4115,108 @@ def from_dlpack(dlpack): # delete the deleter of the old dlpack ctypes.pythonapi.PyCapsule_SetDestructor(dlpack, None) return NDArray(handle=handle) + +class DLContext(ctypes.Structure): + _fields_ = [("device_type", ctypes.c_int), + ("device_id", ctypes.c_int)] + + +class DLDataType(ctypes.Structure): + _fields_ = [("type_code", ctypes.c_uint8), + ("bits", ctypes.c_uint8), + ("lanes", ctypes.c_uint16)] + TYPE_MAP = { + "int32": (0, 32, 1), + "int64": (0, 64, 1), + "bool": (1, 1, 1), + "uint32": (1, 32, 1), + "uint64": (1, 64, 1), + "float32": (2, 32, 1), + "float64": (2, 64, 1), + } + + +class DLTensor(ctypes.Structure): + _fields_ = [("data", ctypes.c_void_p), + ("ctx", DLContext), + ("ndim", ctypes.c_int), + ("dtype", DLDataType), + ("shape", ctypes.POINTER(ctypes.c_int64)), + ("strides", ctypes.POINTER(ctypes.c_int64)), + ("byte_offset", ctypes.c_uint64)] + +class DLManagedTensor(ctypes.Structure): + pass + + +DeleterFunc = ctypes.CFUNCTYPE(None, ctypes.POINTER(DLManagedTensor)) + + +DLManagedTensor._fields_ = [("dl_tensor", DLTensor), # pylint: disable=protected-access + ("manager_ctx", ctypes.c_void_p), + ("deleter", DeleterFunc)] + + +@DeleterFunc +def dl_managed_tensor_deleter(dl_managed_tensor_handle): + void_p = dl_managed_tensor_handle.contents.manager_ctx + pyobj = ctypes.cast(void_p, ctypes.py_object) + ctypes.pythonapi.Py_DecRef(pyobj) + + +def from_numpy(ndarray, zero_copy=True): + """Returns an MXNet's NDArray backed by Numpy's ndarray. + + Parameters + ---------- + ndarray: numpy.ndarray + input data + + zero_copy: bool + Whether we use DLPack's zero-copy conversion to convert to MXNet's NDArray. + This is only available for c-contiguous arrays, i.e. array.flags[C_CONTIGUOUS] == True. 
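+
+    Example (a minimal sketch; float32 is one of the dtypes listed in
+    DLDataType.TYPE_MAP)::
+
+        import numpy as np
+        import mxnet as mx
+
+        a = np.ones((2, 3), dtype='float32')
+        b = mx.nd.from_numpy(a)
+        a[0, 0] = 5.0   # zero-copy: the update is visible through `b`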
+ + Returns + ------- + NDArray + a NDArray backed by a dlpack tensor + + """ + + def _make_manager_ctx(obj): + pyobj = ctypes.py_object(obj) + void_p = ctypes.c_void_p.from_buffer(pyobj) + ctypes.pythonapi.Py_IncRef(pyobj) + return void_p + + def _make_dl_tensor(array): + if str(array.dtype) not in DLDataType.TYPE_MAP: + raise ValueError(str(array.dtype) + " is not supported.") + dl_tensor = DLTensor() + dl_tensor.data = array.ctypes.data_as(ctypes.c_void_p) + dl_tensor.ctx = DLContext(1, 0) + dl_tensor.ndim = array.ndim + dl_tensor.dtype = DLDataType.TYPE_MAP[str(array.dtype)] + dl_tensor.shape = array.ctypes.shape_as(ctypes.c_int64) + dl_tensor.strides = None + dl_tensor.byte_offset = 0 + return dl_tensor + + def _make_dl_managed_tensor(array): + c_obj = DLManagedTensor() + c_obj.dl_tensor = _make_dl_tensor(array) + c_obj.manager_ctx = _make_manager_ctx(array) + c_obj.deleter = dl_managed_tensor_deleter + return c_obj + + if not zero_copy: + return array(ndarray, dtype=ndarray.dtype) + + if not ndarray.flags['C_CONTIGUOUS']: + raise ValueError("Only c-contiguous arrays are supported for zero-copy") + c_obj = _make_dl_managed_tensor(ndarray) + address = ctypes.addressof(c_obj) + address = ctypes.cast(address, ctypes.c_void_p) + handle = NDArrayHandle() + check_call(_LIB.MXNDArrayFromDLPack(address, ctypes.byref(handle))) + return NDArray(handle=handle) diff --git a/python/mxnet/ndarray/register.py b/python/mxnet/ndarray/register.py index 05d7f17a8fc1..1ccf228698ba 100644 --- a/python/mxnet/ndarray/register.py +++ b/python/mxnet/ndarray/register.py @@ -16,9 +16,10 @@ # under the License. """Register backend ops in mxnet.ndarray namespace""" +from __future__ import absolute_import import os as _os import ctypes -import numpy as np # pylint: disable=unused-import +import numpy as _np # pylint: disable=unused-import from ._internal import NDArrayBase, _imperative_invoke # pylint: disable=unused-import from ..ndarray_doc import _build_doc @@ -103,7 +104,7 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) if dtype_name is not None: code.append(""" if '%s' in kwargs: - kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( + kwargs['%s'] = _np.dtype(kwargs['%s']).name"""%( dtype_name, dtype_name, dtype_name)) code.append(""" _ = kwargs.pop('name', None) @@ -136,7 +137,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) code.append(""" if %s is not _Null: keys.append('%s') - vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) if not signature_only: code.append(""" diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py index e8fa571d44db..2c69b9b46521 100644 --- a/python/mxnet/operator.py +++ b/python/mxnet/operator.py @@ -28,7 +28,7 @@ from ctypes import CFUNCTYPE, POINTER, Structure, pointer from ctypes import c_void_p, c_int, c_char, c_char_p, cast, c_bool -from .base import _LIB, check_call, MXCallbackList, c_array, c_array_buf +from .base import _LIB, check_call, MXCallbackList, c_array, c_array_buf, mx_int from .base import c_str, mx_uint, mx_float, ctypes2numpy_shared, NDArrayHandle, py_str from . 
import symbol, context from .ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP @@ -164,7 +164,7 @@ def get_symbol(self, *args, **kwargs): fb_functype = CFUNCTYPE(None, c_int, POINTER(POINTER(mx_float)), POINTER(c_int), POINTER(POINTER(mx_uint)), POINTER(c_int), c_void_p) infer_functype = CFUNCTYPE(None, c_int, POINTER(c_int), - POINTER(POINTER(mx_uint)), c_void_p) + POINTER(POINTER(mx_int)), c_void_p) list_functype = CFUNCTYPE(None, POINTER(POINTER(POINTER(c_char))), c_void_p) class NumpyOpInfo(Structure): """Structure that holds Callback information. Passed to NumpyOpProp""" @@ -214,9 +214,9 @@ def infer_shape_entry(num_tensor, tensor_dims, assert len(ishape) == n_in rshape = list(ishape) + list(oshape) for i in range(n_in+n_out): - tensor_shapes[i] = cast(c_array_buf(mx_uint, - array('I', rshape[i])), - POINTER(mx_uint)) + tensor_shapes[i] = cast(c_array_buf(mx_int, + array('i', rshape[i])), + POINTER(mx_int)) tensor_dims[i] = len(rshape[i]) def list_outputs_entry(out, _): @@ -266,7 +266,7 @@ def __init__(self, need_top_grad=True): def get_symbol(self, *args, **kwargs): fb_functype = CFUNCTYPE(c_bool, c_int, POINTER(c_void_p), POINTER(c_int), c_void_p) infer_functype = CFUNCTYPE(c_bool, c_int, POINTER(c_int), - POINTER(POINTER(mx_uint)), c_void_p) + POINTER(POINTER(mx_int)), c_void_p) list_functype = CFUNCTYPE(c_bool, POINTER(POINTER(POINTER(c_char))), c_void_p) deps_functype = CFUNCTYPE(c_bool, c_int_p, c_int_p, c_int_p, c_int_p, POINTER(c_int_p), c_void_p) @@ -335,9 +335,9 @@ def infer_shape_entry(num_tensor, tensor_dims, assert len(ishape) == n_in rshape = list(ishape) + list(oshape) for i in range(n_in+n_out): - tensor_shapes[i] = cast(c_array_buf(mx_uint, - array('I', rshape[i])), - POINTER(mx_uint)) + tensor_shapes[i] = cast(c_array_buf(mx_int, + array('i', rshape[i])), + POINTER(mx_int)) tensor_dims[i] = len(rshape[i]) except Exception: print('Error in NDArrayOp.infer_shape: %s' % traceback.format_exc()) @@ -698,7 +698,7 @@ def do_register(prop_cls): del_functype = CFUNCTYPE(c_int, c_void_p) infershape_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), - POINTER(POINTER(mx_uint)), c_void_p) + POINTER(POINTER(mx_int)), c_void_p) infertype_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), c_void_p) inferstorage_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), c_void_p) inferstorage_backward_functype = CFUNCTYPE(c_int, c_int, POINTER(c_int), \ @@ -747,9 +747,9 @@ def infer_shape_entry(num_tensor, tensor_dims, "shapes, got %d."%(n_aux, len(ashape)) rshape = list(ishape) + list(oshape) + list(ashape) for i in range(n_in+n_out+n_aux): - tensor_shapes[i] = cast(c_array_buf(mx_uint, - array('I', rshape[i])), - POINTER(mx_uint)) + tensor_shapes[i] = cast(c_array_buf(mx_int, + array('i', rshape[i])), + POINTER(mx_int)) tensor_dims[i] = len(rshape[i]) infer_shape_entry._ref_holder = [tensor_shapes] diff --git a/python/mxnet/symbol/_internal.py b/python/mxnet/symbol/_internal.py index 53fc684008cf..7e9787e32b1c 100644 --- a/python/mxnet/symbol/_internal.py +++ b/python/mxnet/symbol/_internal.py @@ -22,8 +22,6 @@ import sys as _sys import os as _os -import numpy as np - try: if int(_os.environ.get("MXNET_ENABLE_CYTHON", True)) == 0: from .._ctypes.symbol import SymbolBase, _set_symbol_class diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py index 15c8e5e1fa68..ac59f8b97f15 100644 --- a/python/mxnet/symbol/register.py +++ b/python/mxnet/symbol/register.py @@ -17,9 +17,10 @@ # pylint: disable=unused-import """Register backend ops in mxnet.symbol 
namespace.""" +from __future__ import absolute_import import os as _os import ctypes -import numpy as np +import numpy as _np from . import _internal from ._internal import SymbolBase, _symbol_creator @@ -109,7 +110,7 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) if dtype_name is not None: code.append(""" if '%s' in kwargs: - kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( + kwargs['%s'] = _np.dtype(kwargs['%s']).name"""%( dtype_name, dtype_name, dtype_name)) code.append(""" attr = kwargs.pop('attr', None) @@ -175,7 +176,7 @@ def %s(%s):"""%(func_name, ', '.join(signature))) code.append(""" if %s is not _Null: _keys.append('%s') - _vals.append(np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) + _vals.append(_np.dtype(%s).name)"""%(dtype_name, dtype_name, dtype_name)) code.append(""" if not hasattr(NameManager._current, "value"): diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py index 91d4ca16df07..4bf60a6a1fcd 100644 --- a/python/mxnet/symbol/symbol.py +++ b/python/mxnet/symbol/symbol.py @@ -34,7 +34,7 @@ from ..attribute import AttrScope from ..base import _LIB, numeric_types, c_array, c_array_buf, c_str, c_str_array, c_handle_array -from ..base import mx_uint, py_str, string_types, integer_types +from ..base import mx_uint, py_str, string_types, integer_types, mx_int, is_np_compat from ..base import NDArrayHandle, ExecutorHandle, SymbolHandle from ..base import check_call, MXNetError, NotImplementedForSymbol from ..context import Context, current_context @@ -1078,7 +1078,11 @@ def infer_shape(self, *args, **kwargs): arg_names = self.list_arguments() unknowns = [] for name, shape in zip(arg_names, arg_shapes): - if not shape or not _numpy.prod(shape): + if is_np_compat(): + shape_is_none = not shape or -1 in shape + else: + shape_is_none = not shape or 0 in shape + if shape_is_none: if len(unknowns) >= 10: unknowns.append('...') break @@ -1174,25 +1178,25 @@ def _infer_shape_impl(self, partial, *args, **kwargs): indptr.append(len(sdata)) keys = c_str_array(str_keys) arg_shape_size = mx_uint() - arg_shape_ndim = ctypes.POINTER(mx_uint)() - arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + arg_shape_ndim = ctypes.POINTER(mx_int)() + arg_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int))() out_shape_size = mx_uint() - out_shape_ndim = ctypes.POINTER(mx_uint)() - out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + out_shape_ndim = ctypes.POINTER(mx_int)() + out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int))() aux_shape_size = mx_uint() - aux_shape_ndim = ctypes.POINTER(mx_uint)() - aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_uint))() + aux_shape_ndim = ctypes.POINTER(mx_int)() + aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int))() complete = ctypes.c_int() if partial: - infer_func = _LIB.MXSymbolInferShapePartial + infer_func = _LIB.MXSymbolInferShapePartialEx else: - infer_func = _LIB.MXSymbolInferShape + infer_func = _LIB.MXSymbolInferShapeEx check_call(infer_func( self.handle, mx_uint(len(indptr) - 1), keys, c_array_buf(mx_uint, array('I', indptr)), - c_array_buf(mx_uint, array('I', sdata)), + c_array_buf(mx_int, array('i', sdata)), ctypes.byref(arg_shape_size), ctypes.byref(arg_shape_ndim), ctypes.byref(arg_shape_data), @@ -1204,12 +1208,15 @@ def _infer_shape_impl(self, partial, *args, **kwargs): ctypes.byref(aux_shape_data), ctypes.byref(complete))) if complete.value != 0: - arg_shapes = [ - tuple(arg_shape_data[i][:arg_shape_ndim[i]]) for i in range(arg_shape_size.value)] - out_shapes = [ - 
tuple(out_shape_data[i][:out_shape_ndim[i]]) for i in range(out_shape_size.value)]
-            aux_shapes = [
-                tuple(aux_shape_data[i][:aux_shape_ndim[i]]) for i in range(aux_shape_size.value)]
+            arg_shapes = [tuple(arg_shape_data[i][:arg_shape_ndim[i]])
+                          if arg_shape_ndim[i] >= 0 else None
+                          for i in range(arg_shape_size.value)]
+            out_shapes = [tuple(out_shape_data[i][:out_shape_ndim[i]])
+                          if out_shape_ndim[i] >= 0 else None
+                          for i in range(out_shape_size.value)]
+            aux_shapes = [tuple(aux_shape_data[i][:aux_shape_ndim[i]])
+                          if aux_shape_ndim[i] >= 0 else None
+                          for i in range(aux_shape_size.value)]
            return (arg_shapes, out_shapes, aux_shapes)
        else:
            return (None, None, None)
@@ -1564,42 +1571,42 @@ def simple_bind(self, ctx, grad_req='write', type_dict=None, stype_dict=None,
        aux_state_handles = ctypes.POINTER(NDArrayHandle)()

        try:
-            check_call(_LIB.MXExecutorSimpleBind(self.handle,
-                                                 ctypes.c_int(ctx.device_typeid),
-                                                 ctypes.c_int(ctx.device_id),
-                                                 num_ctx_map_keys,
-                                                 ctx_map_keys,
-                                                 ctx_map_dev_types,
-                                                 ctx_map_dev_ids,
-                                                 mx_uint(provided_req_type_list_len),
-                                                 provided_grad_req_names,
-                                                 provided_grad_req_types,
-                                                 mx_uint(len(provided_arg_shape_names)),
-                                                 c_str_array(provided_arg_shape_names),
-                                                 c_array_buf(mx_uint,
-                                                             array('I', provided_arg_shape_data)),
-                                                 c_array_buf(mx_uint,
-                                                             array('I', provided_arg_shape_idx)),
-                                                 num_provided_arg_types,
-                                                 provided_arg_type_names,
-                                                 provided_arg_type_data,
-                                                 num_provided_arg_stypes,
-                                                 provided_arg_stype_names,
-                                                 provided_arg_stype_data,
-                                                 mx_uint(len(shared_arg_name_list)),
-                                                 c_str_array(shared_arg_name_list),
-                                                 ctypes.byref(shared_buffer_len),
-                                                 shared_buffer_names,
-                                                 shared_buffer_handles,
-                                                 ctypes.byref(updated_shared_buffer_names),
-                                                 ctypes.byref(updated_shared_buffer_handles),
-                                                 ctypes.byref(num_in_args),
-                                                 ctypes.byref(in_arg_handles),
-                                                 ctypes.byref(arg_grad_handles),
-                                                 ctypes.byref(num_aux_states),
-                                                 ctypes.byref(aux_state_handles),
-                                                 shared_exec_handle,
-                                                 ctypes.byref(exe_handle)))
+            check_call(_LIB.MXExecutorSimpleBindEx(self.handle,
+                                                   ctypes.c_int(ctx.device_typeid),
+                                                   ctypes.c_int(ctx.device_id),
+                                                   num_ctx_map_keys,
+                                                   ctx_map_keys,
+                                                   ctx_map_dev_types,
+                                                   ctx_map_dev_ids,
+                                                   mx_uint(provided_req_type_list_len),
+                                                   provided_grad_req_names,
+                                                   provided_grad_req_types,
+                                                   mx_uint(len(provided_arg_shape_names)),
+                                                   c_str_array(provided_arg_shape_names),
+                                                   c_array_buf(mx_int,
+                                                               array('i', provided_arg_shape_data)),
+                                                   c_array_buf(mx_uint,
+                                                               array('I', provided_arg_shape_idx)),
+                                                   num_provided_arg_types,
+                                                   provided_arg_type_names,
+                                                   provided_arg_type_data,
+                                                   num_provided_arg_stypes,
+                                                   provided_arg_stype_names,
+                                                   provided_arg_stype_data,
+                                                   mx_uint(len(shared_arg_name_list)),
+                                                   c_str_array(shared_arg_name_list),
+                                                   ctypes.byref(shared_buffer_len),
+                                                   shared_buffer_names,
+                                                   shared_buffer_handles,
+                                                   ctypes.byref(updated_shared_buffer_names),
+                                                   ctypes.byref(updated_shared_buffer_handles),
+                                                   ctypes.byref(num_in_args),
+                                                   ctypes.byref(in_arg_handles),
+                                                   ctypes.byref(arg_grad_handles),
+                                                   ctypes.byref(num_aux_states),
+                                                   ctypes.byref(aux_state_handles),
+                                                   shared_exec_handle,
+                                                   ctypes.byref(exe_handle)))
        except MXNetError as e:
            error_msg = "simple_bind error. 
Arguments:\n" for k, v in kwargs.items(): diff --git a/scala-package/assembly/src/main/assembly/assembly.xml b/scala-package/assembly/src/main/assembly/assembly.xml index 5a931360645c..b463ac12a6d6 100644 --- a/scala-package/assembly/src/main/assembly/assembly.xml +++ b/scala-package/assembly/src/main/assembly/assembly.xml @@ -66,10 +66,11 @@ lib/native - ${MXNET_DIR}/3rdparty + ${MXNET_DIR}/licenses - cub/LICENSE.TXT - mkldnn/external/mklml_mac_2019.0.1.20180928/license.txt + LICENSE.binary.dependencies + NOTICE + LICENSE . diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Executor.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Executor.scala index aec44023a5d3..b0fae0f9d58d 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/Executor.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/Executor.scala @@ -61,9 +61,7 @@ class Executor private[mxnet](private[mxnet] val handle: ExecutorHandle, protected var monitorCallback: MXMonitorCallback = null private val logger: Logger = LoggerFactory.getLogger(classOf[Executor]) - private[mxnet] var ownsArgArrays = false - private[mxnet] var ownsGradArrays = false - private[mxnet] var ownsAuxArrays = false + private var reshaped = false override def nativeAddress: CPtrAddress = handle override def nativeDeAllocator: (CPtrAddress => Int) = _LIB.mxExecutorFree @@ -75,17 +73,12 @@ class Executor private[mxnet](private[mxnet] val handle: ExecutorHandle, if (!super.isDisposed) { super.dispose() outputs.foreach(o => o.dispose()) - // Symbol.bind clones symbol when creating the executor so we need to dispose of the clone - symbol.dispose() - if (ownsArgArrays && argArrays != null) {argArrays.foreach(a => a.dispose())} - if (ownsGradArrays && gradArrays != null) {gradArrays.foreach( + if (reshaped && argArrays != null) {argArrays.foreach(a => a.dispose())} + if (reshaped && gradArrays != null) {gradArrays.foreach( // Symbol will sometimes fill this with nulls so we've got to check the elements too a => if (a != null) {a.dispose()}) } - if (ownsAuxArrays && auxArrays != null) {auxArrays.foreach(a => a.dispose())} - if (_argDict != null) {_argDict.foreach(a => a._2.dispose())} - if (_gradDict != null) {_gradDict.foreach(a => a._2.dispose())} - if (_auxDict != null) {_auxDict.foreach(a => a._2.dispose())} + if (reshaped && auxArrays != null) {auxArrays.foreach(a => a.dispose())} } } @@ -104,95 +97,59 @@ class Executor private[mxnet](private[mxnet] val handle: ExecutorHandle, */ def reshape(partialShaping: Boolean = false, allowUpSizing: Boolean = false, kwargs: Map[String, Shape]): Executor = { - var setArgOwner = false - var setAuxOwner = false - var setGradOwner = false - val (argShapes, _, auxShapes) = this.symbol.inferShape(kwargs) - // TODO: more precise error message should be provided by backend - require(argShapes != null, "Shape inference failed." 
+ - s"Known shapes are $kwargs for symbol arguments ${symbol.listArguments()} " + - s"and aux states ${symbol.listAuxiliaryStates()}") - var newArgDict = Map[String, NDArray]() - var newGradDict = Map[String, NDArray]() + val providedArgShapeNames = kwargs.keys + val providedArgShapeData = kwargs.values.flatMap(_.toVector) + val providedArgShapeIdx = kwargs.values.scanLeft(0)((sum, shape) => sum + shape.size) - this.symbol.listArguments().zipWithIndex.foreach { case (name, i) => - val newShape = argShapes(i) - val arr = this.argArrays(i) - val dArr = if (this.gradArrays == null) null else this.gradArrays(i) - if (partialShaping || kwargs.contains(name) || newShape.equals(arr.shape)) { - if (newShape.product > arr.shape.product) { - require(allowUpSizing, s"New shape of arg:$name larger than original. " + - "First making a big executor and then down sizing it " + - "is more efficient than the reverse." + - "If you really want to up size, set allowUpSizing = true " + - "to enable allocation of new arrays.") - newArgDict = newArgDict + (name -> NDArray.empty(newShape, arr.context, arr.dtype)) - setArgOwner = true - if (dArr != null) { - newGradDict = newGradDict + (name -> NDArray.empty(newShape, dArr.context, dArr.dtype)) - setGradOwner = true - } - } else { - newArgDict = newArgDict + (name -> arr.reshape(newShape.toArray)) - if (dArr != null) { - newGradDict = newGradDict + (name -> dArr.reshape(newShape.toArray)) - } - } - } else { - throw new AssertionError(s"Shape of unspecified array arg:$name changed." + - "This can cause the new executor to not share parameters " + - "with the old one. Please check for error in network." + - "If this is intended, set partialShaping = true to suppress this warning.") - } - } - - var newAuxDict = Map[String, NDArray]() - val zip3 = (this.symbol.listAuxiliaryStates(), auxShapes, this.auxArrays).zipped - zip3.foreach { case (name, newShape, arr) => - if (partialShaping || newShape.equals(arr.shape)) { - if (newShape.product > arr.shape.product) { - require(allowUpSizing, s"New shape of aux:$name larger than original. " + - "First making a big executor and then down sizing it " + - "is more efficient than the reverse." + - "If you really want to up size, set allowUpSizing = true " + - "to enable allocation of new arrays.") - newAuxDict = newAuxDict + (name -> NDArray.empty(newShape, arr.context)) - setAuxOwner = true - } else { - newAuxDict = newAuxDict + (name -> arr.reshape(newShape.toArray)) - } - } else { - throw new AssertionError(s"Shape of unspecified array aux:$name changed." + - "This can cause the new executor to not share parameters " + - "with the old one. Please check for error in network." 
+ - "If this is intended, set partialShaping = true to suppress this warning.") - } + val ctxMapKeys = if (_group2ctx != null) _group2ctx.keys.toArray else Array.empty[String] + val ctxMapDevTypes = if (_group2ctx != null) { + _group2ctx.values.map(_.deviceTypeid).toArray + } else { + Array.empty[Int] } - val reshapedExecutor = if (this._gradsReq.isInstanceOf[Seq[_]]) { - this.symbol.bind(this._ctx, - newArgDict, - newGradDict, - this._gradsReq.asInstanceOf[Seq[String]], - newAuxDict, - this._group2ctx, - this) + val ctxMapDevIds = if (_group2ctx != null) { + _group2ctx.values.map(_.deviceId).toArray } else { - this.symbol.bind(this._ctx, - newArgDict, - newGradDict, - this._gradsReq.asInstanceOf[Map[String, String]], - newAuxDict, - this._group2ctx, - this) + Array.empty[Int] } - // This method has created new NDArrays that will need to be managed by the new Executor - if (setArgOwner) reshapedExecutor.ownsArgArrays = true - if (setGradOwner) reshapedExecutor.ownsGradArrays = true - if (setAuxOwner) reshapedExecutor.ownsAuxArrays = true + val inArgs = ArrayBuffer.empty[NDArrayHandle] + val argGrads = ArrayBuffer.empty[NDArrayHandle] + val auxStates = ArrayBuffer.empty[NDArrayHandle] + val outHandle = new ExecutorHandleRef() + + checkCall(_LIB.mxExecutorReshape( + if (partialShaping) 1 else 0, + if (allowUpSizing) 1 else 0, + _ctx.deviceTypeid, + _ctx.deviceId, + ctxMapKeys.toArray, + ctxMapDevTypes.toArray, + ctxMapDevIds.toArray, + providedArgShapeNames.toArray, + providedArgShapeData.toArray, + providedArgShapeIdx.toArray, + inArgs, + argGrads, + auxStates, + this.handle, + outHandle)) + + val argArrays = inArgs.map(new NDArray(_)).toArray + val gradArrays = argGrads.map(handle => + if (handle == 0) null else new NDArray(handle)).toArray + val auxArrays = auxStates.map(new NDArray(_)).toArray - reshapedExecutor + val executor = new Executor(outHandle.value, this.symbol) + executor._ctx = this._ctx + executor._gradsReq = this._gradsReq + executor._group2ctx = this._group2ctx + executor.argArrays = argArrays + executor.gradArrays = gradArrays + executor.auxArrays = auxArrays + executor.reshaped = true + executor } /** diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/ExecutorManager.scala b/scala-package/core/src/main/scala/org/apache/mxnet/ExecutorManager.scala index b13741bdd3b0..d94b8fb01ed6 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/ExecutorManager.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/ExecutorManager.scala @@ -395,8 +395,8 @@ private[mxnet] object ExecutorManager { * @param paramNames Names of all trainable parameters. * @param ctx List of devices for training (data parallel) * @param slices Describes how the data parallel splits data into different devices. - * @param providedData training data shapes - * @param providedLabel training label shapes + * @param providedDataDesc training data descriptions + * @param providedLabelDesc training label descriptions * @param sharedGroup: DataParallelExecutorGroup * An existing executor group, if to share parameters with it. 
* @@ -404,8 +404,8 @@ private[mxnet] object ExecutorManager { private class DataParallelExecutorGroup private(sym: Symbol, argNames: IndexedSeq[String], paramNames: Set[String], ctx: Array[Context], private val slices: Array[(Int, Int)], - providedData: Map[String, Shape], - providedLabel: Map[String, Shape], + providedDataDesc: IndexedSeq[DataDesc], + providedLabelDesc: IndexedSeq[DataDesc], sharedGroup: DataParallelExecutorGroup) { // make sure the architecture is valid ExecutorManager.checkArguments(sym) @@ -417,8 +417,8 @@ private class DataParallelExecutorGroup private(sym: Symbol, sharedGroup.sharedDataArrays } - private[mxnet] val dataNames = providedData.map { case (k, _) => k }.toList - private[mxnet] val labelNames = providedLabel.map { case (k, _) => k }.toList + private[mxnet] val dataNames = providedDataDesc.map(_.name).toList + private[mxnet] val labelNames = providedLabelDesc.map(_.name).toList private[mxnet] val auxNames = sym.listAuxiliaryStates() private[mxnet] val paramIdx = argNames.zipWithIndex .filter { case (name, i) => paramNames.contains(name) } @@ -428,9 +428,10 @@ private class DataParallelExecutorGroup private(sym: Symbol, private[mxnet] val trainExecs: Array[Executor] = ctx.zipWithIndex.map { case (ctxi, i) => val dataShapes = - (providedData ++ providedLabel) map { case (name, shape) => - name -> (Shape(slices(i)._2 - slices(i)._1) ++ shape.slice(1, shape.length)) - } + (providedDataDesc ++ providedLabelDesc).map( desc => { + desc.name -> + (Shape(slices(i)._2 - slices(i)._1) ++ desc.shape.slice(1, desc.shape.length)) + }).toMap val sharedExec: Executor = if (sharedGroup == null) null else sharedGroup.trainExecs(i) ExecutorManager.bindExec(sym, ctxi, dataShapes, paramNamesComb, needGrad = true, baseExec = sharedExec, @@ -479,7 +480,7 @@ private class DataParallelExecutorGroup private(sym: Symbol, trainData: DataIter, sharedGroup: DataParallelExecutorGroup) { this(sym, argNames, paramNames, ctx, slices, - trainData.provideData, trainData.provideLabel, sharedGroup) + trainData.provideDataDesc, trainData.provideLabelDesc, sharedGroup) } def this(sym: Symbol, @@ -487,7 +488,7 @@ private class DataParallelExecutorGroup private(sym: Symbol, ctx: Array[Context], slices: Array[(Int, Int)], trainData: DataIter) { this(sym, argNames, paramNames, ctx, slices, - trainData.provideData, trainData.provideLabel, null) + trainData.provideDataDesc, trainData.provideLabelDesc, null) } /** @@ -509,7 +510,7 @@ private class DataParallelExecutorGroup private(sym: Symbol, trainData: DataBatch, sharedGroup: DataParallelExecutorGroup) { this(sym, argNames, paramNames, ctx, slices, - trainData.provideData, trainData.provideLabel, sharedGroup) + trainData.provideDataDesc, trainData.provideLabelDesc, sharedGroup) } def this(sym: Symbol, @@ -517,7 +518,7 @@ private class DataParallelExecutorGroup private(sym: Symbol, ctx: Array[Context], slices: Array[(Int, Int)], trainData: DataBatch) { this(sym, argNames, paramNames, ctx, slices, - trainData.provideData, trainData.provideLabel, null) + trainData.provideDataDesc, trainData.provideLabelDesc, null) } // load data and labels into arrays diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala index 2b1765531824..b8e2ba0b39c8 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/FeedForward.scala @@ -129,11 +129,11 @@ class FeedForward private( // Initialize 
weight parameters and auxiliary states
  // The NDArrays associated with the _argParms and _auxParams are not disposed instead
  // they are passed a outer scope if available.
-  private def initParams(inputShapes: Map[String, Shape], overwrite: Boolean = false)
+  private def initParams(inputShapes: IndexedSeq[DataDesc], overwrite: Boolean = false)
  : (IndexedSeq[String], IndexedSeq[String], IndexedSeq[String]) = {
    val (argShapes, _, auxShapes) = symbol.inferShape(inputShapes)
    val argNames = symbol.listArguments()
-    val inputNames = inputShapes.keys.toSet
+    val inputNames = inputShapes.map(_.name).toSet
    val paramNames = argNames.filter(!inputNames.contains(_))
    val auxNames = symbol.listAuxiliaryStates()
@@ -179,7 +179,7 @@ class FeedForward private(
  }

  // Initialize the predictor module for running prediction.
-  private def initPredictor(inputShapes: Map[String, Shape]): Unit = {
+  private def initPredictor(inputShapes: IndexedSeq[DataDesc]): Unit = {
    var shouldInit = true
    if (this.predExec != null) {
      val (argShapes, _, _) = symbol.inferShape(inputShapes)
@@ -193,7 +193,7 @@ class FeedForward private(
    }
    if(shouldInit) {
      // for now only use the first device
-      val predExec = symbol.simpleBind(ctx(0), gradReq = "null", shapeDict = inputShapes)
+      val predExec = symbol.simpleBind(ctx(0), gradReq = "null", inputShapes)
      predExec.copyParamsFrom(_argParams, _auxParams)
      ExecutorManager.checkArguments(symbol)
      this.predExec = predExec
@@ -233,8 +233,8 @@ class FeedForward private(
   */
  def predict(data: DataIter, numBatch: Int = -1): Array[NDArray] = {
    data.reset()
-    val dataShapes = data.provideData
-    val dataNames = dataShapes.map(_._1).toArray
+    val dataShapes = data.provideDataDesc
+    val dataNames = dataShapes.map(_.name).toArray
    initPredictor(dataShapes)
    val batchSize = data.batchSize
    val dataArrays = dataNames.map(predExec.argDict(_))
@@ -363,7 +363,7 @@ class FeedForward private(
      this.symbol = symGen.generate(trainData.defaultBucketKey)
      checkArguments()
    }
-    initParams(trainData.provideData ++ trainData.provideLabel)
+    initParams(trainData.provideDataDesc ++ trainData.provideLabelDesc)
  }

  private def fit(trainData: DataIter, evalData: DataIter, evalMetric: EvalMetric = new Accuracy(),
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala b/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
index b580ad10a04e..1db6d2a6e953 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/IO.scala
@@ -141,28 +141,46 @@ class DataBatch(val data: IndexedSeq[NDArray],
                val pad: Int,
                // the key for the bucket that should be used for this batch,
                // for bucketing io only
-                val bucketKey: AnyRef,
+                val bucketKey: AnyRef = null,
                // use DataDesc to indicate the order of data/label loading
                // (must match the order of input data/label)
-                private val providedDataDesc: IndexedSeq[DataDesc],
-                private val providedLabelDesc: IndexedSeq[DataDesc]) {
+                private val providedDataDesc: IndexedSeq[DataDesc] = null,
+                private val providedLabelDesc: IndexedSeq[DataDesc] = null) {
  // TODO: change the data/label type into IndexedSeq[(NDArray, DataDesc)]
  // However, since the data and label can be accessed publicly (no getter and setter)
  // the change on this will break BC
+
+  @deprecated("Use provideDataDesc and provideLabelDesc instead", "1.3.0")
+  def this(data: IndexedSeq[NDArray],
+           label: IndexedSeq[NDArray],
+           index: IndexedSeq[Long],
+           pad: Int,
+           // the key for the bucket that should be used for this batch,
+           // for bucketing io only
+           bucketKey: AnyRef,
+           // use ListMap to indicate the order of data/label loading
+           // (must match the order of input data/label)
+           providedData: ListMap[String, Shape]) {
+    this(data, label, index, pad, bucketKey,
+      DataDesc.ListMap2Descs(providedData))
+  }
+
+  @deprecated("Use provideDataDesc and provideLabelDesc instead", "1.3.0")
  def this(data: IndexedSeq[NDArray],
           label: IndexedSeq[NDArray],
           index: IndexedSeq[Long],
           pad: Int,
           // the key for the bucket that should be used for this batch,
           // for bucketing io only
-           bucketKey: AnyRef = null,
+           bucketKey: AnyRef,
           // use ListMap to indicate the order of data/label loading
           // (must match the order of input data/label)
-           providedData: ListMap[String, Shape] = null,
-           providedLabel: ListMap[String, Shape] = null) {
+           providedData: ListMap[String, Shape],
+           providedLabel: ListMap[String, Shape]) {
    this(data, label, index, pad, bucketKey,
      DataDesc.ListMap2Descs(providedData), DataDesc.ListMap2Descs(providedLabel))
  }
+
  /**
   * Dispose its data and labels
   * The object shall never be used after it is disposed.
@@ -177,6 +195,7 @@ class DataBatch(val data: IndexedSeq[NDArray],
  }

  // The name and shape of data
+  @deprecated("Use provideDataDesc instead", "1.3.0")
  def provideData: ListMap[String, Shape] = {
    var temp = ListMap[String, Shape]()
    if (providedDataDesc == null) null
@@ -187,6 +206,7 @@ class DataBatch(val data: IndexedSeq[NDArray],
  }

  // The name and shape of label
+  @deprecated("Use provideLabelDesc instead", "1.3.0")
  def provideLabel: ListMap[String, Shape] = {
    var temp = ListMap[String, Shape]()
    if (providedLabelDesc == null) null
@@ -311,8 +331,7 @@ abstract class DataIter extends Iterator[DataBatch] {
   */
  @throws(classOf[NoSuchElementException])
  def next(): DataBatch = {
-    new DataBatch(getData(), getLabel(), getIndex(), getPad(),
-      null, null, null)
+    new DataBatch(getData(), getLabel(), getIndex(), getPad())
  }

  /**
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala
index 52e26efb41f1..b54ecc05818e 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala
@@ -38,6 +38,7 @@ object Image {
   * @param flag Convert decoded image to grayscale (0) or color (1).
   * @param to_rgb Whether to convert decoded image
   *               to mxnet's default RGB format (instead of opencv's default BGR).
+   * @param out NDArray to store the output
   * @return NDArray in HWC format with DType [[DType.UInt8]]
   */
  def imDecode(buf: Array[Byte], flag: Int,
@@ -57,6 +58,10 @@ object Image {
  /**
   * Same imageDecode with InputStream
   * @param inputStream the inputStream of the image
+   * @param flag Convert decoded image to grayscale (0) or color (1).
+   * @param to_rgb Whether to convert decoded image
+   *               to mxnet's default RGB format (instead of opencv's default BGR).
+   * @param out NDArray to store the output
   * @return NDArray in HWC format with DType [[DType.UInt8]]
   */
  def imDecode(inputStream: InputStream, flag: Int = 1,
@@ -79,6 +84,7 @@ object Image {
   * @param flag Convert decoded image to grayscale (0) or color (1).
   * @param to_rgb Whether to convert decoded image to mxnet's default RGB format
   *               (instead of opencv's default BGR).
+   * @param out NDArray to store the output
   * @return org.apache.mxnet.NDArray in HWC format with DType [[DType.UInt8]]
   */
  def imRead(filename: String, flag: Option[Int] = None,
@@ -99,6 +105,7 @@ object Image {
   * @param w Width of resized image.
   * @param h Height of resized image.
* @param interp Interpolation method (default=cv2.INTER_LINEAR). + * @param out NDArray to store the output * @return org.apache.mxnet.NDArray */ def imResize(src: org.apache.mxnet.NDArray, w: Int, h: Int, @@ -124,6 +131,7 @@ object Image { * @param typeOf Filling type (default=cv2.BORDER_CONSTANT). * @param value (Deprecated! Use ``values`` instead.) Fill with single value. * @param values Fill with value(RGB[A] or gray), up to 4 channels. + * @param out NDArray to store the output * @return org.apache.mxnet.NDArray */ def copyMakeBorder(src: org.apache.mxnet.NDArray, top: Int, bot: Int, diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala index 40fc0951e885..aba618540141 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/LibInfo.scala @@ -188,6 +188,23 @@ private[mxnet] class LibInfo { grads: Array[NDArrayHandle]): Int @native def mxExecutorPrint(handle: ExecutorHandle, debugStr: RefString): Int @native def mxExecutorSetMonitorCallback(handle: ExecutorHandle, callback: MXMonitorCallback): Int + // scalastyle:off parameterNum + @native def mxExecutorReshape(partialShaping: Int, + allowUpSizing: Int, + devType: Int, + devId: Int, + mapKeys: Array[String], + mapDevTypes: Array[Int], + mapDevIds: Array[Int], + providedArgShapeNames: Array[String], + providedArgShapeData: Array[Int], + providedArgShapeIdx: Array[Int], + inArgs: ArrayBuffer[NDArrayHandle], + argGrads: ArrayBuffer[NDArrayHandle], + auxStates: ArrayBuffer[NDArrayHandle], + sharedExec: ExecutorHandle, + out: ExecutorHandleRef): Int + // scalastyle:on parameterNum // Symbols @native def mxSymbolListAtomicSymbolCreators(symbolList: ListBuffer[SymbolHandle]): Int @@ -240,11 +257,20 @@ private[mxnet] class LibInfo { numArgs: MXUint, keys: Array[String], argIndPtr: Array[MXUint], - argShapeData: Array[MXUint], + argShapeData: Array[Int], inShapeData: ListBuffer[Array[Int]], outShapeData: ListBuffer[Array[Int]], auxShapeData: ListBuffer[Array[Int]], complete: RefInt): Int + @native def mxSymbolInferShapePartial(handle: SymbolHandle, + numArgs: MXUint, + keys: Array[String], + argIndPtr: Array[MXUint], + argShapeData: Array[Int], + inShapeData: ListBuffer[Array[Int]], + outShapeData: ListBuffer[Array[Int]], + auxShapeData: ListBuffer[Array[Int]], + complete: RefInt): Int @native def mxSymbolGetOutput(handle: SymbolHandle, index: Int, out: SymbolHandleRef): Int @native def mxSymbolSaveToJSON(handle: SymbolHandle, out: RefString): Int @native def mxSymbolCreateFromJSON(json: String, handle: SymbolHandleRef): Int @@ -322,4 +348,8 @@ private[mxnet] class LibInfo { @native def mxSetProfilerConfig(keys: Array[String], vals: Array[String]): Int @native def mxSetProfilerState(state: Int): Int @native def mxDumpProfile(finished: Int): Int + + // Numpy + @native def mxIsNumpyCompatible(compatible: RefInt): Int + @native def mxSetIsNumpyCompatible(isNpComp: Int, prev: RefInt): Int } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala index 3a51222cc0b8..de7792850dc1 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala @@ -17,6 +17,8 @@ package org.apache.mxnet +import scala.language.implicitConversions + object MX_PRIMITIVES { /** diff --git 
a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala index ab42265ae102..3764f5a4a040 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala @@ -36,6 +36,11 @@ import scala.util.Try */ @AddNDArrayFunctions(false) object NDArray extends NDArrayBase { + /** + * method to convert NDArrayFunctionReturn to NDArray + * @param ret the returned NDArray list + * @return NDArray result + */ implicit def getFirstResult(ret: NDArrayFuncReturn): NDArray = ret(0) private val logger = LoggerFactory.getLogger(classOf[NDArray]) @@ -1274,11 +1279,15 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, * @return an array representing shape of current ndarray */ def shape: Shape = { - val ndim = new MXUintRef + val ndim = new RefInt val data = ArrayBuffer[Int]() checkCall(_LIB.mxNDArrayGetShape(handle, ndim, data)) - require(ndim.value == data.length, s"ndim=$ndim, while len(data)=${data.length}") - Shape(data) + if (ndim.value == -1) { + null + } else { + require(ndim.value == data.length, s"ndim=$ndim, while len(data)=${data.length}") + Shape(data) + } } // Get size of current NDArray. diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NumpyScope.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NumpyScope.scala new file mode 100644 index 000000000000..d3e76f1044a7 --- /dev/null +++ b/scala-package/core/src/main/scala/org/apache/mxnet/NumpyScope.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mxnet + +import org.apache.mxnet.Base._ + +/** + * NumpyScope object provides util functions for turning on/off NumPy compatibility + * and checking whether NumPy compatibility has been turned on/off. NumPy compatibility + * is introduced first to support zero-dim and zero-size tensors as in NumPy. 
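+ *
+ * Typical usage is `NumpyScope.enableNumpyCompatible.withScope { ... }`, which saves
+ * the previous backend flag, runs the block, and restores the flag afterwards.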
+ */
+object NumpyScope {
+  def setNumpyCompatible(isNpComp: Boolean): Boolean = {
+    val prev = new RefInt()
+    checkCall(_LIB.mxSetIsNumpyCompatible(if (isNpComp) 1 else 0, prev))
+    prev.value != 0
+  }
+
+  def isNumpyCompatible: Boolean = {
+    val curr = new RefInt
+    checkCall(_LIB.mxIsNumpyCompatible(curr))
+    curr.value != 0
+  }
+
+  def enableNumpyCompatible: NumpyScope = {
+    new NumpyScope(true)
+  }
+
+  def disableNumpyCompatible: NumpyScope = {
+    new NumpyScope(false)
+  }
+}
+
+class NumpyScope(var isCompatible: Boolean) {
+  private var prev: Boolean = false
+
+  def withScope[T](body: => T): T = {
+    prev = NumpyScope.setNumpyCompatible(isCompatible)
+    try {
+      body
+    } finally {
+      if (prev != isCompatible) {
+        NumpyScope.setNumpyCompatible(prev)
+      }
+    }
+  }
+}
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
index 821e04f08df2..68db2b1d9144 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Symbol.scala
@@ -21,6 +21,7 @@ import org.apache.mxnet.Base._
import org.apache.mxnet.DType.DType
import org.slf4j.{Logger, LoggerFactory}

+import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import scala.language.implicitConversions

@@ -209,6 +210,33 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends NativeReso
    }
  }

+  /**
+   * Infer the shapes of outputs and arguments given the known shapes of some arguments.
+   * The known shapes are passed as a list of DataDesc; descriptors with a null shape
+   * are skipped. A tuple of nulls is returned if there is not enough information,
+   * and an error is raised if the known shapes are inconsistent.
+   * @param args list of DataDesc containing the shapes to resolve
+   * @return
+   * argShapes List of shapes of arguments. The order is in the same order as list_arguments()
+   * outShapes List of shapes of outputs. The order is in the same order as list_outputs()
+   * auxShapes List of shapes of auxiliary states. The order is in the same order as list_auxiliary()
+   */
+  def inferShape(args: IndexedSeq[DataDesc]):
+  (IndexedSeq[Shape], IndexedSeq[Shape], IndexedSeq[Shape]) = {
+    val keys = ArrayBuffer.empty[String]
+    val indPtr = ArrayBuffer(0)
+    val sdata = ArrayBuffer.empty[Int]
+    args.foreach { arg =>
+      val shape = arg.shape
+      if (shape != null) {
+        keys += arg.name
+        sdata ++= shape.toVector
+        indPtr += sdata.size
+      }
+    }
+    inferShape(keys.toArray, indPtr.toArray, sdata.toArray)
+  }
+
  /**
   * Infer the shape of outputs and arguments of given known shapes of arguments.
   * User can either pass in the known shapes in positional way or keyword argument way.
@@ -260,17 +288,45 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends NativeReso
   def inferShape(keys: Array[String], indPtr: Array[Int], values: Array[Int])
     : (IndexedSeq[Shape], IndexedSeq[Shape], IndexedSeq[Shape]) = {
+    val res = inferShapeImpl(partial = false, keys, indPtr, values)
+    if (res._2 == null) {
+      val (argShapes, _, _) = inferShapeImpl(partial = true, keys, indPtr, values)
+      val argNames = listArguments()
+      val unknown = (argNames zip argShapes).map { case (name, shape) =>
+        val shapeIsNone = if (NumpyScope.isNumpyCompatible) {
+          shape == null || shape.toVector.contains(-1)
+        } else {
+          shape == null || shape.toVector.contains(0)
+        }
+        if (shapeIsNone) s"$name: $shape" else ""
+      }
+      logger.warn("Cannot decide shape for the following arguments. " +
+        "Consider providing them as input: \n\t{}",
+        unknown.filter(_ != "").mkString("\n\t"))
+    }
+    res
+  }
+
+  private def inferShapeImpl(partial: Boolean,
+                             keys: Array[String],
+                             indPtr: Array[Int],
+                             values: Array[Int])
+    : (IndexedSeq[Shape], IndexedSeq[Shape], IndexedSeq[Shape]) = {
     val argShapeData = ListBuffer.empty[Array[Int]]
     val outShapeData = ListBuffer.empty[Array[Int]]
     val auxShapeData = ListBuffer.empty[Array[Int]]
     val complete = new RefInt
-
-    checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values,
-      argShapeData, outShapeData, auxShapeData, complete))
+    if (partial) {
+      checkCall(_LIB.mxSymbolInferShapePartial(handle, indPtr.length - 1, keys, indPtr, values,
+        argShapeData, outShapeData, auxShapeData, complete))
+    } else {
+      checkCall(_LIB.mxSymbolInferShape(handle, indPtr.length - 1, keys, indPtr, values,
+        argShapeData, outShapeData, auxShapeData, complete))
+    }
     if (complete.value != 0) {
       (argShapeData.map(s => Shape(s)).toIndexedSeq,
-       outShapeData.map(s => Shape(s)).toIndexedSeq,
-       auxShapeData.map(s => Shape(s)).toIndexedSeq)
+        outShapeData.map(s => Shape(s)).toIndexedSeq,
+        auxShapeData.map(s => Shape(s)).toIndexedSeq)
     } else {
       (null, null, null)
     }
@@ -389,6 +445,29 @@ class Symbol private(private[mxnet] val handle: SymbolHandle) extends NativeReso
     checkCall(_LIB.mxSymbolCompose(handle, name, keys, args))
   }
 
+  /**
+   * Bind the current symbol to get an executor, allocating all the NDArrays needed.
+   * Allows specifying data types.
+   * The user passes in the NDArrays for the positions they want to bind; NDArrays
+   * for any arguments and auxiliary states not specified explicitly are allocated
+   * automatically.
+   *
+   * @param ctx The device context on which the generated executor runs.
+   * @param gradReq {'write', 'add', 'null'}, or list of str or dict of str to str, optional
+   *                Specifies how the gradient is updated into args_grad.
+   *                - 'write' means the gradient is written to the specified args_grad
+   *                  NDArray on every backward pass.
+   *                - 'add' means the gradient is added to the specified NDArray.
+   *                - 'null' means no action is taken; the gradient may not be calculated.
+   * @param descs List of DataDesc describing the inputs
+   * @return The generated Executor
+   */
+  def simpleBind(ctx: Context, gradReq: String,
+                 descs: IndexedSeq[DataDesc]) : Executor = {
+    val (shapes, types) = descs.map(desc =>
+      (desc.name -> desc.shape, desc.name -> desc.dtype)).unzip
+    simpleBind(ctx, gradReq, shapes.toMap, types.toMap)
+  }
+
   /**
    * Bind current symbol to get an executor, allocate all the ndarrays needed.
    * Allows specifying data types.
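A sketch of the new DataDesc-based simpleBind; the symbol, the gradient request, and the input description are assumptions for illustration:

{{{
val net = Symbol.Variable("data")
val exec = net.simpleBind(Context.cpu(), "write",
  IndexedSeq(new DataDesc("data", Shape(32, 784), DType.Float32, "NC")))
}}}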
@@ -1189,7 +1268,7 @@ object Symbol extends SymbolBase { // a more friendly interface for creating symbols // all values except symbols in kwargs will be cast to String using its toString() method - @Deprecated + @deprecated("Use Checked version", "0.1.2") def createFromNamedSymbolsNoCheck( operator: String, name: String = null, attr: Map[String, String] = null)( kwargs: Map[String, Any]): Symbol = { @@ -1208,7 +1287,7 @@ object Symbol extends SymbolBase { // a more friendly interface for creating symbols // all values except symbols in kwargs will be cast to String using its toString() method - @Deprecated + @deprecated("Use Checked version", "0.1.2") def createFromListedSymbolsNoCheck( operator: String, name: String = null, attr: Map[String, String] = null)( symbols: Array[Symbol], kwargs: Map[String, Any] = null): Symbol = { diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala index e30098c3088b..66b7d83cedc8 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/MXDataIter.scala @@ -107,8 +107,7 @@ private[mxnet] class MXDataIter(private[mxnet] val handle: DataIterHandle, checkCall(_LIB.mxDataIterNext(handle, next)) if (next.value > 0) { currentBatch = new DataBatch(data = getData(), label = getLabel(), - index = getIndex(), pad = getPad(), - null, null, null) + index = getIndex(), pad = getPad()) } else { currentBatch = null } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala index b205bbe47abb..e9513257c050 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala @@ -161,8 +161,7 @@ class NDArrayIter(data: IndexedSeq[(DataDesc, NDArray)], override def next(): DataBatch = { if (hasNext) { cursor += dataBatchSize - new DataBatch(getData(), getLabel(), getIndex(), getPad(), - null, null, null) + new DataBatch(getData(), getLabel(), getIndex(), getPad()) } else { throw new NoSuchElementException } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/PrefetchingIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/PrefetchingIter.scala index d277351b124b..9cfcd598197c 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/io/PrefetchingIter.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/PrefetchingIter.scala @@ -42,71 +42,51 @@ class PrefetchingIter( require(iters.nonEmpty, "Iters length must be greater than 0") - private val _provideData: ListMap[String, Shape] = { + @deprecated("Please use provideDataDesc instead", "1.3.0") + override def provideData: ListMap[String, Shape] = { if (dataNames == null) { - iters.map(_.provideData).foldLeft(ListMap[String, Shape]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideData).reduce(_ ++ _) } else { - iters.zipWithIndex.map(tu => (tu._1.provideData, tu._2)) - .map(m => m._1.map(t => (dataNames(m._2)(t._1), t._2))) - .foldLeft(ListMap[String, Shape]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideData).zip(dataNames).map { case (providedData, names) => + providedData.map { case (oldName, shape) => names(oldName) -> shape } + }.reduceLeft(_ ++ _) } } - private val _provideLabel: ListMap[String, Shape] = { + @deprecated("Please use provideDataDesc instead", "1.3.0") + override 
def provideLabel: ListMap[String, Shape] = { if (labelNames == null) { - iters.map(_.provideLabel).foldLeft(ListMap[String, Shape]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideLabel).reduce(_ ++ _) } else { - iters.zipWithIndex.map(tu => (tu._1.provideLabel, tu._2)) - .map(m => m._1.map(t => (labelNames(m._2)(t._1), t._2))) - .foldLeft(ListMap[String, Shape]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideLabel).zip(labelNames).map { case (providedLabel, names) => + providedLabel.map { case (oldName, shape) => names(oldName) -> shape } + }.reduceLeft(_ ++ _) } } - private val _provideDataDesc: IndexedSeq[DataDesc] = { + override def provideDataDesc: IndexedSeq[DataDesc] = { if (dataNames == null) { - iters.map(_.provideDataDesc).foldLeft(IndexedSeq[DataDesc]()) { (acc, elem) => - acc ++ elem - } + iters.flatMap(_.provideDataDesc) } else { - iters.zipWithIndex.map(tu => (tu._1.provideDataDesc, tu._2)) - .map(m => - m._1.map(t => - new DataDesc(dataNames(m._2)(t.name), t.shape, t.dtype, t.layout) - ) - ) - .foldLeft(IndexedSeq[DataDesc]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideDataDesc).zip(dataNames).flatMap { case (providedDataDesc, names) => + providedDataDesc.map(desc => + new DataDesc(names(desc.name), desc.shape, desc.dtype, desc.layout)) + } } } - private val _provideLabelDesc: IndexedSeq[DataDesc] = { + override def provideLabelDesc: IndexedSeq[DataDesc] = { if (labelNames == null) { - iters.map(_.provideLabelDesc).foldLeft(IndexedSeq[DataDesc]()) { (acc, elem) => - acc ++ elem - } + iters.flatMap(_.provideLabelDesc) } else { - iters.zipWithIndex.map(tu => (tu._1.provideLabelDesc, tu._2)) - .map(m => - m._1.map(t => - new DataDesc(labelNames(m._2)(t.name), t.shape, t.dtype, t.layout) - ) - ) - .foldLeft(IndexedSeq[DataDesc]()) { (acc, elem) => - acc ++ elem - } + iters.map(_.provideLabelDesc).zip(labelNames).flatMap { case (providedLabelDesc, names) => + providedLabelDesc.map(desc => + new DataDesc(names(desc.name), desc.shape, desc.dtype, desc.layout)) + } } } - private val _batchSize: Int = this._provideData.toList(0)._2(0) + private val _batchSize: Int = this.provideDataDesc.head.shape(0) private val dataReady: IndexedSeq[Semaphore] = (0 until iters.length).map(i => new Semaphore(0)) private val dataTaken: IndexedSeq[Semaphore] = @@ -177,20 +157,6 @@ class PrefetchingIter( */ override def getPad(): Int = this.currentBatch.pad - // The name and shape of label provided by this iterator - @deprecated("Please use provideDataDesc instead", "1.3.0") - override def provideLabel: ListMap[String, Shape] = this._provideLabel - - // The name and shape of data provided by this iterator - @deprecated("Please use provideLabelDesc instead", "1.3.0") - override def provideData: ListMap[String, Shape] = this._provideData - - // Provide type:DataDesc of the data - override def provideDataDesc: IndexedSeq[DataDesc] = _provideDataDesc - - // Provide type:DataDesc of the label - override def provideLabelDesc: IndexedSeq[DataDesc] = _provideLabelDesc - override def hasNext: Boolean = { for (e <- dataReady) e.acquire() if (nextBatch(0) == null) { @@ -209,8 +175,7 @@ class PrefetchingIter( currentBatch = new DataBatch(datas.toIndexedSeq.flatten, labels.toIndexedSeq.flatten, nextBatch(0).index, - nextBatch(0).pad, - null, null, null) + nextBatch(0).pad) for (e <- dataTaken) e.release() true } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala index 
f72223d1e4da..57a485083f20 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala
@@ -36,6 +36,12 @@ object Image {
     org.apache.mxnet.Image.imDecode(buf, flag, toRGB, None)
   }
 
+  /**
+   * Decode image with OpenCV.
+   * Note: returns the image in RGB by default, instead of OpenCV's default BGR.
+   * @param buf Buffer containing the binary encoded image
+   * @return NDArray in HWC format with DType [[DType.UInt8]]
+   */
   def imDecode(buf: Array[Byte]): NDArray = {
     imDecode(buf, 1, true)
   }
@@ -52,6 +58,12 @@ object Image {
     org.apache.mxnet.Image.imDecode(inputStream, flag, toRGB, None)
   }
 
+  /**
+   * Same as imDecode, but reads the image from an InputStream.
+   *
+   * @param inputStream the InputStream of the image
+   * @return NDArray in HWC format with DType [[DType.UInt8]]
+   */
   def imDecode(inputStream: InputStream): NDArray = {
     imDecode(inputStream, 1, true)
   }
@@ -69,6 +81,12 @@ object Image {
     org.apache.mxnet.Image.imRead(filename, Some(flag), Some(toRGB), None)
   }
 
+  /**
+   * Read and decode image with OpenCV.
+   * Note: returns the image in RGB by default, instead of OpenCV's default BGR.
+   * @param filename Name of the image file to be loaded.
+   * @return org.apache.mxnet.NDArray in HWC format with DType [[DType.UInt8]]
+   */
   def imRead(filename: String): NDArray = {
     imRead(filename, 1, true)
   }
@@ -86,6 +104,13 @@ object Image {
     org.apache.mxnet.Image.imResize(src, w, h, interpVal, None)
   }
 
+  /**
+   * Resize image with OpenCV.
+   * @param src Source image as an NDArray
+   * @param w Width of the resized image.
+   * @param h Height of the resized image.
+   * @return org.apache.mxnet.NDArray
+   */
   def imResize(src: NDArray, w: Int, h: Int): NDArray = {
     imResize(src, w, h, null)
   }
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
index b73f4ad4b112..7fbdae5b3e21 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/BaseModule.scala
@@ -247,11 +247,23 @@ abstract class BaseModule {
 
   /**
    * Run prediction and collect the outputs.
-   * @param evalData
+   * @param evalData DataIter over which to run inference
    * @param numBatch Default is -1, indicating running all the batches in the data iterator.
    * @param reset Default is `True`, indicating whether we should reset the data iter before start
    * doing prediction.
    * @return The return value will be a list `[out1, out2, out3]`.
+   *         The concatenation process works like this:
+   *         {{{
+   *         outputBatches = [
+   *           [a1, a2, a3], // batch a
+   *           [b1, b2, b3]  // batch b
+   *         ]
+   *         result = [
+   *           NDArray, // [a1, b1]
+   *           NDArray, // [a2, b2]
+   *           NDArray, // [a3, b3]
+   *         ]
+   *         }}}
    * Where each element is concatenation of the outputs for all the mini-batches.
    */
   def predict(evalData: DataIter, numBatch: Int = -1, reset: Boolean = true)
@@ -264,7 +276,8 @@ abstract class BaseModule {
         s"in mini-batches (${out.size})."
+ "Maybe bucketing is used?") ) - val concatenatedOutput = outputBatches.map(out => NDArray.concatenate(out)) + val oBT = outputBatches.transpose + val concatenatedOutput = oBT.map(out => NDArray.concatenate(out)) outputBatches.foreach(_.foreach(_.dispose())) concatenatedOutput } @@ -398,8 +411,7 @@ abstract class BaseModule { fitParams: FitParams = new FitParams): Unit = { require(fitParams != null, "Undefined fitParams") require(numEpoch > 0, s"Invalid number of epochs $numEpoch") - import org.apache.mxnet.DataDesc._ - bind(dataShapes = trainData.provideData, labelShapes = Option(trainData.provideLabel), + bind(dataShapes = trainData.provideDataDesc, labelShapes = Option(trainData.provideLabelDesc), forTraining = true, forceRebind = fitParams.forceRebind) fitParams.monitor.foreach(installMonitor) initParams(fitParams.initializer, argParams, auxParams, diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala index 41a6f69394d2..d5c8c21ea106 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/BucketingModule.scala @@ -296,7 +296,7 @@ class BucketingModule(symGen: AnyRef => (Symbol, IndexedSeq[String], IndexedSeq[ require(this.binded && this.paramsInitialized, "bind() and initParams() must be called first.") val bucketKey = dataBatch.bucketKey val originalBucketKey = this._currBucketKey - this.switchBucket(bucketKey, dataBatch.provideData, Option(dataBatch.provideLabel)) + this.switchBucket(bucketKey, dataBatch.provideDataDesc, Option(dataBatch.provideLabelDesc)) // switch back this.switchBucket(originalBucketKey, null, None) } @@ -308,8 +308,8 @@ class BucketingModule(symGen: AnyRef => (Symbol, IndexedSeq[String], IndexedSeq[ */ override def forward(dataBatch: DataBatch, isTrain: Option[Boolean] = None): Unit = { require(binded && paramsInitialized, "bind() and initParams() must be called first.") - this.switchBucket(dataBatch.bucketKey, dataBatch.provideData, - Option(dataBatch.provideLabel)) + this.switchBucket(dataBatch.bucketKey, dataBatch.provideDataDesc, + Option(dataBatch.provideLabelDesc)) this._currModule.forward(dataBatch, isTrain) } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala index 3255d9346b80..9928f66b2200 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/Module.scala @@ -435,14 +435,14 @@ class Module(symbolVar: Symbol, val newDataShapes = dataBatch.data.map(_.shape) if (currDataShapes != newDataShapes) { val newDShapes: IndexedSeq[DataDesc] = - if (dataBatch.provideData != null) dataBatch.provideData + if (dataBatch.provideDataDesc != null) dataBatch.provideDataDesc else { this.dataShapes.zip(newDataShapes).map { case (i, shape) => DataDesc(i.name, shape, i.dtype, i.layout) } } val newLShapes: Option[IndexedSeq[DataDesc]] = - if (dataBatch.provideLabel != null) Some(dataBatch.provideLabel) + if (dataBatch.provideLabelDesc != null) Some(dataBatch.provideLabelDesc) else if (dataBatch.label != null && dataBatch.label.length > 0 && this.labelShapes != null) { Some(this.labelShapes.zip(dataBatch.label).map { case (i, j) => diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala 
b/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala index 3c3eeb97f201..d80e6bc6279b 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/module/SequentialModule.scala @@ -111,6 +111,16 @@ class SequentialModule extends BaseModule { this.labelShapesVar.orNull } + /** + * Get output shapes. + * @return The output shapes of the last + * module is the output shape of a SequentialModule. + */ + def outputDesc: IndexedSeq[DataDesc] = { + require(this.binded, "bind() must be called first.") + this.modules.reverse.head.dataShapes + } + /** * Get output shapes. * @return The output shapes of the last @@ -306,12 +316,8 @@ class SequentialModule extends BaseModule { val dataNames = module.outputShapes.map(_._1) require(dataNames.length == data.data.length, s"dataNames $dataNames do not match with number of arrays in batch") - var provideData = ListMap[String, Shape]() - for ((name, x) <- dataNames.zip(out.head)) { - provideData += name -> x.shape - } data = new DataBatch(out.head, data.label, data.index, - data.pad, data.bucketKey, provideData, data.provideLabel) + data.pad, data.bucketKey, outputDesc, data.provideLabelDesc) } } } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala b/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala index 2cf453ac3d18..c780a9605b12 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/util/OptionConversion.scala @@ -17,6 +17,8 @@ package org.apache.mxnet.util +import scala.language.implicitConversions + object OptionConversion { implicit def someWrapper[A](noSome : A) : Option[A] = Option(noSome) } diff --git a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java index 86c7eb29d2ef..1b7042d49795 100644 --- a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java +++ b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/NDArrayTest.java @@ -86,7 +86,7 @@ public void testGenerated(){ NDArray$ NDArray = NDArray$.MODULE$; float[] arr = new float[]{1.0f, 2.0f, 3.0f}; NDArray nd = new NDArray(arr, new Shape(new int[]{3}), new Context("cpu", 0)); - float result = NDArray.norm(NDArray.new normParam(nd))[0].toArray()[0]; + float result = NDArray.norm(new normParam(nd))[0].toArray()[0]; float cal = 0.0f; for (float ele : arr) { cal += ele * ele; @@ -94,7 +94,7 @@ public void testGenerated(){ cal = (float) Math.sqrt(cal); assertTrue(Math.abs(result - cal) < 1e-5); NDArray dotResult = new NDArray(new float[]{0}, new Shape(new int[]{1}), new Context("cpu", 0)); - NDArray.dot(NDArray.new dotParam(nd, nd).setOut(dotResult)); + NDArray.dot(new dotParam(nd, nd).setOut(dotResult)); assertTrue(Arrays.equals(dotResult.toArray(), new float[]{14.0f})); } } diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala index 698a2b53a9fa..9839f09e4063 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala @@ -19,6 +19,7 @@ package org.apache.mxnet import org.apache.mxnet.io.{NDArrayIter, ResizeIter, PrefetchingIter} import org.scalatest.{BeforeAndAfterAll, FunSuite} +import scala.language.postfixOps import 
scala.sys.process._ @@ -53,10 +54,10 @@ class IOSuite extends FunSuite with BeforeAndAfterAll { // test DataIter val mnistIter = mnistPack.iterator // test provideData - val provideData = mnistIter.provideData - val provideLabel = mnistIter.provideLabel - assert(provideData("data") === Shape(100, 784)) - assert(provideLabel("label") === Shape(100)) + val provideData = mnistIter.provideDataDesc + val provideLabel = mnistIter.provideLabelDesc + assert(provideData.find(_.name == "data").get.shape === Shape(100, 784)) + assert(provideLabel.find(_.name == "label").get.shape === Shape(100)) // test_loop mnistIter.reset() batchCount = 0 @@ -105,10 +106,10 @@ class IOSuite extends FunSuite with BeforeAndAfterAll { val nBatch = 500 var batchCount = 0 // test provideData - val provideData = imgRecIter.provideData - val provideLabel = imgRecIter.provideLabel - assert(provideData("data").toArray === Array(100, 3, 28, 28)) - assert(provideLabel("label").toArray === Array(100)) + val provideData = imgRecIter.provideDataDesc + val provideLabel = imgRecIter.provideLabelDesc + assert(provideData.find(_.name == "data").get.shape.toArray === Array(100, 3, 28, 28)) + assert(provideLabel.find(_.name == "label").get.shape.toArray === Array(100)) imgRecIter.reset() while (imgRecIter.hasNext) { @@ -208,12 +209,12 @@ class IOSuite extends FunSuite with BeforeAndAfterAll { assert(nBatch === batchCount) // test provideData - val provideData = prefetchIter.provideData - val provideLabel = prefetchIter.provideLabel - assert(provideData("data1") === Shape(100, 784)) - assert(provideData("data2") === Shape(100, 784)) - assert(provideLabel("label1") === Shape(100)) - assert(provideLabel("label2") === Shape(100)) + val provideData = prefetchIter.provideDataDesc + val provideLabel = prefetchIter.provideLabelDesc + assert(provideData.find(_.name == "data1").get.shape === Shape(100, 784)) + assert(provideData.find(_.name == "data2").get.shape === Shape(100, 784)) + assert(provideLabel.find(_.name == "label1").get.shape === Shape(100)) + assert(provideLabel.find(_.name == "label2").get.shape === Shape(100)) // test reset prefetchIter.reset() diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala index 88e314e2a72c..5aed01bde693 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala @@ -23,6 +23,34 @@ import org.apache.mxnet.optimizer._ import org.apache.mxnet.io._ class ModuleSuite extends FunSuite with BeforeAndAfterAll { + + class myModule(symbol : Symbol) extends Module (symbol) { + override def predictEveryBatch(evalData: DataIter, + numBatch: Int = 1, reset: Boolean = true): + IndexedSeq[IndexedSeq[NDArray]] = { + val data = IndexedSeq( + NDArray.ones(Shape(1, 10, 1)), + NDArray.ones(Shape(1, 10, 1)), + NDArray.ones(Shape(1, 10, 4)) + ) + List.fill(numBatch)(data).toIndexedSeq + } + } + + test("predict") { + val sym = Symbol.Variable("data") + val mod = new myModule(sym) + val dummyIter = new NDArrayIter(IndexedSeq(NDArray.ones(1))) + var output = mod.predict(dummyIter, 1) + require(output(0).shape == Shape(1, 10, 1)) + require(output(1).shape == Shape(1, 10, 1)) + require(output(2).shape == Shape(1, 10, 4)) + output = mod.predict(dummyIter, 2) + require(output(0).shape == Shape(2, 10, 1)) + require(output(1).shape == Shape(2, 10, 1)) + require(output(2).shape == Shape(2, 10, 4)) + } + test ("model dtype") { val dType = 
DType.Float32 val dShape = Shape(3, 8, 7) @@ -253,8 +281,8 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll { // create module val mod = new Module(x, contexts = Array(Context.cpu())) - mod.bind(dataShapes = trainData.provideData, - Option(trainData.provideLabel)) + mod.bind(dataShapes = trainData.provideDataDesc, + Option(trainData.provideLabelDesc)) mod.installMonitor(mon) val argParams = Map( "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/OperatorBuildUtils.scala b/scala-package/core/src/test/scala/org/apache/mxnet/NumpyScopeSuite.scala similarity index 58% rename from scala-package/macros/src/main/scala/org/apache/mxnet/utils/OperatorBuildUtils.scala rename to scala-package/core/src/test/scala/org/apache/mxnet/NumpyScopeSuite.scala index 383c68c0fb10..bf6627ac7e91 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/OperatorBuildUtils.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/NumpyScopeSuite.scala @@ -15,18 +15,20 @@ * limitations under the License. */ -package org.apache.mxnet.utils +package org.apache.mxnet -private[mxnet] object OperatorBuildUtils { - // Convert ctypes returned doc string information into parameters docstring. - def ctypes2docstring(argNames: Seq[String], - argTypes: Seq[String], - argDescs: Seq[String]): String = { - val params = - (argNames zip argTypes zip argDescs) map { case ((argName, argType), argDesc) => - val desc = if (argDesc.isEmpty) "" else s"\n$argDesc" - s"$argName : $argType$desc" - } - s"Parameters\n----------\n${params.mkString("\n")}\n" +import org.scalatest.{BeforeAndAfterAll, FunSuite} + +class NumpyScopeSuite extends FunSuite with BeforeAndAfterAll { + test("compatible") { + NumpyScope.enableNumpyCompatible.withScope { + assert(NumpyScope.isNumpyCompatible === true) + } + } + + test("incompatible") { + NumpyScope.disableNumpyCompatible.withScope { + assert(NumpyScope.isNumpyCompatible === false) + } } } diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala index 44f57c0d1162..eb6d5b7d7175 100644 --- a/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala +++ b/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala @@ -23,6 +23,7 @@ import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory import scala.collection.mutable.ListBuffer +import scala.language.postfixOps import scala.sys.process._ class ConvSuite extends FunSuite with BeforeAndAfterAll { diff --git a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java index b40a4e94afbd..dd17b1d4a0a5 100644 --- a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java +++ b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/bert/BertQA.java @@ -68,15 +68,15 @@ private static int argmax(float[] prob) { */ static List postProcessing(NDArray result, List tokens) { NDArray[] output = NDArray.split( - NDArray.new splitParam(result, 2).setAxis(2)); + new splitParam(result, 2).setAxis(2)); // Get the formatted logits result NDArray startLogits = output[0].reshape(new int[]{0, -3}); NDArray endLogits = output[1].reshape(new int[]{0, -3}); // Get Probability distribution float[] startProb = 
NDArray.softmax( - NDArray.new softmaxParam(startLogits))[0].toArray(); + new softmaxParam(startLogits))[0].toArray(); float[] endProb = NDArray.softmax( - NDArray.new softmaxParam(endLogits))[0].toArray(); + new softmaxParam(endLogits))[0].toArray(); int startIdx = argmax(startProb); int endIdx = argmax(endProb); return tokens.subList(startIdx, endIdx + 1); diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala index 7745043b23d8..d9902e9dcc75 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala @@ -188,7 +188,7 @@ object CNNTextClassification { // decay learning rate if (iter % 50 == 0 && iter > 0) { factor *= 0.5f - opt.setLrScale(paramBlocks.map(_._1 -> factor).toMap) + opt.setLrMult(paramBlocks.map(paramBlock => (Left(paramBlock._1), factor)).toMap) logger.info(s"reset learning to ${opt.learningRate * factor}") } // end of training loop diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOp.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOp.scala index df79f5b63769..0cfcc49aee04 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOp.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOp.scala @@ -107,7 +107,7 @@ object ExampleCustomOp { val (trainIter, testIter) = Data.mnistIterator(dataPath, batchSize = 100, inputShape = Shape(784)) - val datasAndLabels = trainIter.provideData ++ trainIter.provideLabel + val datasAndLabels = trainIter.provideDataDesc ++ trainIter.provideLabelDesc val (argShapes, outputShapes, auxShapes) = mlp.inferShape(datasAndLabels) val initializer = new Xavier(factorType = "in", magnitude = 2.34f) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOpWithRtc.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOpWithRtc.scala index c3ac347353df..7b0fb349373d 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOpWithRtc.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/customop/ExampleCustomOpWithRtc.scala @@ -128,7 +128,7 @@ object ExampleCustomOpWithRtc { val (trainIter, testIter) = Data.mnistIterator(dataPath, batchSize = 100, inputShape = Shape(784)) - val datasAndLabels = trainIter.provideData ++ trainIter.provideLabel + val datasAndLabels = trainIter.provideDataDesc ++ trainIter.provideLabelDesc val (argShapes, outputShapes, auxShapes) = mlp.inferShape(datasAndLabels) val initializer = new Xavier(factorType = "in", magnitude = 2.34f) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala index e4d3b2ae7c3e..4d22b62bea81 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala @@ -54,7 
+54,7 @@ class SyntheticDataIter(numClasses: Int, val batchSize: Int, datumShape: List[In override def next(): DataBatch = { if (hasNext) { curIter += batchSize - new DataBatch(data, label, getIndex, getPad, null, null, null) + new DataBatch(data, label, getIndex, getPad) } else { throw new NoSuchElementException } diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala index 4d450c60456b..839f6ac85902 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala @@ -49,7 +49,7 @@ object MnistMlp { logger.info("Load checkpoint from epoch {}", loadModelEpoch) Module.loadCheckpoint("model/mnist_mlp", loadModelEpoch, loadOptimizerStates = true) } - mod.bind(dataShapes = train.provideData, labelShapes = Some(train.provideLabel)) + mod.bind(dataShapes = train.provideDataDesc, labelShapes = Some(train.provideLabelDesc)) mod.initParams() mod.initOptimizer(optimizer = new SGD(learningRate = 0.01f, momentum = 0.9f)) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala index ff616a57b1aa..ea2273ebd796 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala @@ -57,7 +57,7 @@ object SequentialModuleEx { cmdLine: SequentialModuleEx): Unit = { // Intermediate-level API val modSeq = getSeqModule() - modSeq.bind(dataShapes = train.provideData, labelShapes = Some(train.provideLabel)) + modSeq.bind(dataShapes = train.provideDataDesc, labelShapes = Some(train.provideLabelDesc)) if (cmdLine.loadModelPath != null) { logger.info(s"Load checkpoint from ${cmdLine.loadModelPath}") modSeq.loadParams(cmdLine.loadModelPath) diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala index bfde55831e26..5c17a3747ab6 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala @@ -31,6 +31,7 @@ import org.apache.mxnet.optimizer.RMSProp import org.apache.mxnetexamples.Util import scala.collection.immutable.ListMap +import scala.language.postfixOps import scala.sys.process.Process /** @@ -65,7 +66,7 @@ object ExampleMultiTask { new DataBatch(batch.data, IndexedSeq(label, label), batch.index, - batch.pad, null, null, null) + batch.pad) } else { throw new NoSuchElementException } @@ -100,7 +101,7 @@ object ExampleMultiTask { override def getIndex(): IndexedSeq[Long] = this.dataIter.getIndex() // The name and shape of label provided by this iterator - @deprecated + @deprecated("Use provideLabelDesc instead", "1.3.0") override def provideLabel: ListMap[String, Shape] = { val provideLabel = this.dataIter.provideLabel.toArray // Different labels should be used here for actual application @@ -126,7 +127,7 @@ object ExampleMultiTask { override def getPad(): Int = this.dataIter.getPad() // The name and shape of data provided by this iterator - @deprecated + @deprecated("Use 
provideDataDesc instead", "1.3.0") override def provideData: ListMap[String, Shape] = this.dataIter.provideData override def provideDataDesc: IndexedSeq[DataDesc] = this.dataIter.provideDataDesc @@ -229,10 +230,10 @@ object ExampleMultiTask { val trainMultiIt = new MultiMnistIterator(trainIter) val valMultiIter = new MultiMnistIterator(valIter) - val datasAndLabels = trainMultiIt.provideData ++ trainMultiIt.provideLabel + val datasAndLabels = trainMultiIt.provideDataDesc ++ trainMultiIt.provideLabelDesc val (argShapes, outputShapes, auxShapes) - = network.inferShape(trainMultiIt.provideData("data")) + = network.inferShape(trainMultiIt.provideDataDesc.filter(_.name == "data")) val initializer = new Xavier(factorType = "in", magnitude = 2.34f) val argNames = network.listArguments diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala index 1767cabcbae4..475e179f819b 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyle.scala @@ -39,7 +39,7 @@ object NeuralStyle { private val logger = LoggerFactory.getLogger(classOf[NeuralStyle]) def preprocessContentImage(path: String, longEdge: Int, ctx: Context): NDArray = { - val img = Image(new File(path)) + val img = Image.fromFile(new File(path)) logger.info(s"load the content image, size = ${(img.height, img.width)}") val factor = longEdge.toFloat / Math.max(img.height, img.width) val (newHeight, newWidth) = ((img.height * factor).toInt, (img.width * factor).toInt) @@ -60,7 +60,7 @@ object NeuralStyle { } def preprocessStyleImage(path: String, shape: Shape, ctx: Context): NDArray = { - val img = Image(new File(path)) + val img = Image.fromFile(new File(path)) val resizedImg = img.scaleTo(shape(3), shape(2)) val sample = NDArray.empty(Shape(1, 3, shape(2), shape(3)), ctx) val datas = { diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala index 80a009ea40c2..5b01d2016467 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/neuralstyle/end2end/DataProcessing.scala @@ -29,7 +29,7 @@ object DataProcessing { def preprocessContentImage(path: String, dShape: Shape = null, ctx: Context): NDArray = { - val img = Image(new File(path)) + val img = Image.fromFile(new File(path)) val resizedImg = img.scaleTo(dShape(3), dShape(2)) val sample = NDArray.empty(Shape(1, 3, resizedImg.height, resizedImg.width), ctx) val datas = { @@ -46,7 +46,7 @@ object DataProcessing { } def preprocessStyleImage(path: String, shape: Shape, ctx: Context): NDArray = { - val img = Image(new File(path)) + val img = Image.fromFile(new File(path)) val resizedImg = img.scaleTo(shape(3), shape(2)) val sample = NDArray.empty(Shape(1, 3, shape(2), shape(3)), ctx) val datas = { diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala index 350e28cf8634..2648f9e3d6bb 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala +++ 
b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala @@ -251,11 +251,11 @@ object BucketIo { override def getPad(): Int = 0 // The name and shape of label provided by this iterator - @deprecated + @deprecated("Use provideLabelDesc instead", "1.3.0") override def provideLabel: ListMap[String, Shape] = this._provideLabel // The name and shape of data provided by this iterator - @deprecated + @deprecated("Use provideDataDesc instead", "1.3.0") override def provideData: ListMap[String, Shape] = this._provideData // Provide type:DataDesc of the data diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala index c90b7637b9b1..68346afe1f47 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala @@ -75,7 +75,7 @@ object TrainCharRnn { // the network symbol val symbol = symGen(buckets(0)) - val datasAndLabels = dataTrain.provideData ++ dataTrain.provideLabel + val datasAndLabels = dataTrain.provideDataDesc ++ dataTrain.provideLabelDesc val (argShapes, outputShapes, auxShapes) = symbol.inferShape(datasAndLabels) val initializer = new Xavier(factorType = "in", magnitude = 2.34f) @@ -85,12 +85,13 @@ object TrainCharRnn { val auxNames = symbol.listAuxiliaryStates() val auxDict = auxNames.zip(auxShapes.map(NDArray.zeros(_, ctx))).toMap + val datasAndLabelsNames = datasAndLabels.map(_.name) val gradDict = argNames.zip(argShapes).filter { case (name, shape) => - !datasAndLabels.contains(name) + !datasAndLabelsNames.contains(name) }.map(x => x._1 -> NDArray.empty(x._2, ctx)).toMap argDict.foreach { case (name, ndArray) => - if (!datasAndLabels.contains(name)) { + if (!datasAndLabelsNames.contains(name)) { initializer.initWeight(name, ndArray) } } diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala index 0b7f4693c5fa..548f2e4122e0 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala @@ -22,6 +22,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process class ScalaInferenceBenchmarkSuite extends FunSuite with BeforeAndAfterAll { diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala index a7e36dfc3a11..ae0ee33002d9 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala @@ -26,6 +26,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process /** diff --git 
a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala index 6385e062a260..b65f237c8621 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/customop/CustomOpExampleSuite.scala @@ -25,6 +25,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process class CustomOpExampleSuite extends FunSuite with BeforeAndAfterAll { diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala index 59faba9a3779..709ea77632e0 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala @@ -24,6 +24,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process class GanExampleSuite extends FunSuite with BeforeAndAfterAll{ diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala index 0daba5a97d77..e6f4f6fcc908 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala @@ -24,6 +24,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process /** diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala index c5308ac37512..9c16aca420ef 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala @@ -23,6 +23,7 @@ import java.io.File import org.apache.mxnet.Context import org.apache.mxnetexamples.Util +import scala.language.postfixOps import sys.process.Process /** diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala index bd960bddebf5..918fb835f76e 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/objectdetector/ObjectDetectorExampleSuite.scala @@ -23,6 +23,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import 
scala.language.postfixOps import scala.sys.process.Process class ObjectDetectorExampleSuite extends FunSuite with BeforeAndAfterAll { diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala index 71c2b35ef444..c93a7d06a452 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/neuralstyle/NeuralStyleSuite.scala @@ -23,6 +23,7 @@ import org.apache.mxnetexamples.neuralstyle.end2end.{BoostInference, BoostTrain} import org.scalatest.{BeforeAndAfterAll, FunSuite} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process /** diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala index 2ccd38fc4f9c..ff3fbe9e05d2 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala @@ -23,6 +23,7 @@ import org.apache.mxnetexamples.Util import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} import org.slf4j.LoggerFactory +import scala.language.postfixOps import scala.sys.process.Process class ExampleRNNSuite extends FunSuite with BeforeAndAfterAll { diff --git a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala index d9ccec468791..11d418002744 100644 --- a/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala +++ b/scala-package/infer/src/test/scala/org/apache/mxnet/infer/ClassifierSuite.scala @@ -142,7 +142,7 @@ class ClassifierSuite extends FunSuite with BeforeAndAfterAll { val result: IndexedSeq[(String, Double)] = testClassifier. classify(IndexedSeq(inputData), topK = Some(10)) - assert((result(0)_2).getClass == 1d.getClass) + assert((result(0)._2).getClass == 1d.getClass) assertResult(predictResult(0).sortBy(-_)) { result.map(_._2).toArray @@ -185,7 +185,7 @@ class ClassifierSuite extends FunSuite with BeforeAndAfterAll { val result: IndexedSeq[(String, Double)] = testClassifier. 
classify(IndexedSeq(inputData))
-    assert((result(0)_2).getClass == 1d.getClass)
+    assert((result(0)._2).getClass == 1d.getClass)
     assertResult(predictResult(0)) {
       result.map(_._2).toArray
diff --git a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
index b5a6286af1b6..e3fa28fb2a59 100644
--- a/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
+++ b/scala-package/init/src/main/scala/org/apache/mxnet/init/Base.scala
@@ -29,17 +29,41 @@ object Base {
   class RefFloat(val value: Float = 0)
   class RefString(val value: String = null)
 
+  /**
+   * This C pointer address points to the
+   * actual memory allocated in the MXNet engine
+   */
   type CPtrAddress = Long
+  /**
+   * NDArrayHandle is the C pointer to
+   * the NDArray
+   */
   type NDArrayHandle = CPtrAddress
+  /**
+   * FunctionHandle is the C pointer to
+   * the id of an operator
+   */
   type FunctionHandle = CPtrAddress
+  /**
+   * KVStoreHandle is the C pointer to
+   * the KVStore
+   */
   type KVStoreHandle = CPtrAddress
+  /**
+   * ExecutorHandle is the C pointer to
+   * the Executor
+   */
   type ExecutorHandle = CPtrAddress
+  /**
+   * SymbolHandle is the C pointer to
+   * the Symbol
+   */
   type SymbolHandle = CPtrAddress
 
   @throws(classOf[UnsatisfiedLinkError])
   private def tryLoadInitLibrary(): Unit = {
-    var userDir : File = new File(System.getProperty("user.dir"))
+    val userDir : File = new File(System.getProperty("user.dir"))
     var nativeDir : File = new File(userDir, "init-native")
     if (!nativeDir.exists()) {
       nativeDir = new File(userDir.getParent, "init-native")
@@ -50,7 +74,6 @@ object Base {
     val baseDir = nativeDir.getAbsolutePath
 
     val os = System.getProperty("os.name")
-    // ref: http://lopica.sourceforge.net/os.html
     if (os.startsWith("Linux")) {
       System.load(s"$baseDir/target/libmxnet-init-scala.so")
     } else if (os.startsWith("Mac")) {
diff --git a/scala-package/init/src/main/scala/org/apache/mxnet/init/LibInfo.scala b/scala-package/init/src/main/scala/org/apache/mxnet/init/LibInfo.scala
index 7bd0c701f872..c813d449f652 100644
--- a/scala-package/init/src/main/scala/org/apache/mxnet/init/LibInfo.scala
+++ b/scala-package/init/src/main/scala/org/apache/mxnet/init/LibInfo.scala
@@ -22,7 +22,25 @@ import org.apache.mxnet.init.Base._
 import scala.collection.mutable.ListBuffer
 
 class LibInfo {
+  /**
+   * Get the list of the symbol ids
+   * @param symbolList Pass in an empty ListBuffer to obtain a list of operator IDs
+   * @return Callback result
+   */
   @native def mxSymbolListAtomicSymbolCreators(symbolList: ListBuffer[SymbolHandle]): Int
+
+  /**
+   * Get the detailed information of an operator
+   * @param handle The ID of the operator
+   * @param name Name of the operator
+   * @param desc Description of the operator
+   * @param numArgs Number of arguments
+   * @param argNames Argument names
+   * @param argTypes Argument types
+   * @param argDescs Argument descriptions
+   * @param keyVarNumArgs Keyword argument used to specify a variable number of arguments
+   * @return Callback result
+   */
   @native def mxSymbolGetAtomicSymbolInfo(handle: SymbolHandle,
                                           name: RefString,
                                           desc: RefString,
@@ -31,6 +49,18 @@ class LibInfo {
                                           argTypes: ListBuffer[String],
                                           argDescs: ListBuffer[String],
                                           keyVarNumArgs: RefString): Int
+  /**
+   * Get the name list of all operators
+   * @param names Names of all operators
+   * @return Callback result
+   */
  @native def mxListAllOpNames(names: ListBuffer[String]): Int
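For orientation, a sketch of how these native bindings are driven during operator discovery (it mirrors getBackEndFunctions in GeneratorBase further below; illustrative only):

{{{
val opNames = ListBuffer.empty[String]
_LIB.mxListAllOpNames(opNames)
opNames.foreach { opName =>
  val opHandle = new RefLong
  _LIB.nnGetOpHandle(opName, opHandle)
}
}}}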
+
+  /**
+   * Get operator ID from its name
+   * @param opName Operator name
+   * @param opHandle Operator ID
+   * @return Callback result
+   */
  @native def nnGetOpHandle(opName: String, opHandle: RefLong): Int
 }
diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
index ede16f73d2a1..e939b2ebf9e7 100644
--- a/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
+++ b/scala-package/macros/src/main/scala/org/apache/mxnet/APIDocGenerator.scala
@@ -23,12 +23,16 @@ import java.security.MessageDigest
 import scala.collection.mutable.ListBuffer
 
 /**
- * This object will generate the Scala documentation of the new Scala API
- * Two file namely: SymbolAPIBase.scala and NDArrayAPIBase.scala
+ * This object will generate the Scala documentation of the Scala/Java APIs
  * The code will be executed during Macros stage and file live in Core stage
  */
 private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers {
 
+  /**
+   * Main method used to generate code and write to files
+   * A hash check is placed at the end to verify changes
+   * @param args Input args
+   */
   def main(args: Array[String]): Unit = {
     val FILE_PATH = args(0)
     val hashCollector = ListBuffer[String]()
@@ -42,6 +46,12 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers {
     val finalHash = hashCollector.mkString("\n")
   }
 
+  /**
+   * Generate an MD5 digest from an input string,
+   * encoded in UTF-8
+   * @param input The input string
+   * @return An MD5 value for the string
+   */
   def MD5Generator(input: String): String = {
     val md = MessageDigest.getInstance("MD5")
     md.update(input.getBytes("UTF-8"))
@@ -49,6 +59,12 @@
     org.apache.commons.codec.binary.Base64.encodeBase64URLSafeString(digest)
   }
 
+  /**
+   * Type-safe class body generation for NDArray/Symbol
+   * @param FILE_PATH File path to write the file to
+   * @param isSymbol Whether to write the Symbol API (otherwise the NDArray API)
+   * @return MD5 String
+   */
   def typeSafeClassGen(FILE_PATH: String, isSymbol: Boolean): String = {
     val generated = typeSafeFunctionsToGenerate(isSymbol, isContrib = false)
       .map { func =>
@@ -65,6 +81,12 @@
       generated)
   }
 
+  /**
+   * Generate the Random classes for Symbol/NDArray
+   * @param FILE_PATH File path to write the file to
+   * @param isSymbol Whether to write the Symbol API (otherwise the NDArray API)
+   * @return MD5 String
+   */
   def typeSafeRandomClassGen(FILE_PATH: String, isSymbol: Boolean): String = {
     val generated = typeSafeRandomFunctionsToGenerate(isSymbol)
       .map { func =>
@@ -83,6 +105,16 @@
       generated)
   }
 
+  /**
+   * Non-type-safe interface of the Scala Symbol/NDArray.
+   * It includes the class definition, e.g. class SymbolBase,
+   * and the function definitions, e.g. def softmax(...)(...)(...) : NDArray.
+   * Users can use the API directly by calling NDArray.
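+ * A usage sketch (illustrative; it assumes an existing NDArray `nd`, and the exact
+ * call shapes follow the positional/Map convention described here):
+ * {{{
+ * val out: NDArray = NDArray.softmax(nd)                  // positional input
+ * val out2: NDArray = NDArray.softmax(Map("data" -> nd))  // Map input
+ * }}}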
+ * It supports both positional and Map input
+ * @param FILE_PATH File path to write the file to
+ * @param isSymbol Whether to write the Symbol API (otherwise the NDArray API)
+ * @return MD5 String
+ */
  def nonTypeSafeClassGen(FILE_PATH: String, isSymbol: Boolean): String = {
    val absFuncs = functionsToGenerate(isSymbol, isContrib = false)
      .map { func =>
@@ -112,10 +144,16 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers {
       absFuncs)
   }
 
-  def javaClassGen(filePath : String) : String = {
+  /**
+   * Type-safe interface of the Java NDArray
+   * @param FILE_PATH File path to write the file to
+   * @return MD5 String
+   */
+  def javaClassGen(FILE_PATH : String) : String = {
     val notGenerated = Set("Custom")
     val absClassFunctions = functionsToGenerate(false, false, true)
-    val absFuncs = absClassFunctions.filterNot(ele => notGenerated.contains(ele.name))
+    val (absFuncs, paramClassUncleaned) =
+      absClassFunctions.filterNot(ele => notGenerated.contains(ele.name))
       .groupBy(_.name.toLowerCase).map(ele => {
       /* Pattern matching for not generating deprecated method
        * Group all method name in lowercase
@@ -129,17 +167,24 @@
       }
     }).map(absClassFunction => {
       generateJavaAPISignature(absClassFunction)
-    }).toSeq
+    }).toSeq.unzip
+    val paramClass = paramClassUncleaned.filterNot(_.isEmpty)
     val packageName = "NDArrayBase"
     val packageDef = "package org.apache.mxnet.javaapi"
     writeFile(
-      filePath + "javaapi/",
+      FILE_PATH + "javaapi/",
       packageDef,
       packageName,
       "import org.apache.mxnet.annotation.Experimental",
-      absFuncs)
+      absFuncs, Some(paramClass))
   }
 
+  /**
+   * Generate Scala docs from the function description
+   * @param func The function case class
+   * @param withParam Whether to generate the param field
+   * @return A formatted string for the function description
+   */
   def generateAPIDocFromBackend(func: Func, withParam: Boolean = true): String = {
     def fixDesc(desc: String): String = {
       var curDesc = desc
@@ -173,6 +218,14 @@
     }
   }
 
+  /**
+   * Generate the function interface,
+   * e.g. def softmax(data: NDArray, name ...): NDArrayFuncReturn
+   * @param func The function case class
+   * @param isSymbol Whether to generate the Symbol function (otherwise NDArray)
+   * @param typeParameter Type param specifically used in the Random Module
+   * @return Formatted string for the function
+   */
   def generateAPISignature(func: Func, isSymbol: Boolean, typeParameter: String = ""): String = {
     val argDef = ListBuffer[String]()
@@ -192,7 +245,12 @@
         |def ${func.name}$typeParameter (${argDef.mkString(", ")}): $returnType""".stripMargin
   }
 
-  def generateJavaAPISignature(func : Func) : String = {
+  /**
+   * Generate the Java function interface
+   * @param func The function case class
+   * @return A formatted string for the function
+   */
+  def generateJavaAPISignature(func : Func) : (String, String) = {
     val useParamObject = func.listOfArgs.count(arg => arg.isOptional) >= 2
     var argDef = ListBuffer[String]()
     var classDef = ListBuffer[String]()
@@ -231,27 +289,38 @@
       |  }
       |  def getOut() = this.out
       | """.stripMargin
-    s"""$scalaDocNoParam
+    (s"""$scalaDocNoParam
       | $experimentalTag
       | def ${func.name}(po: ${func.name}Param) : $returnType
-      | /**
+      | """.stripMargin,
+      s"""/**
       | * This Param Object is specifically used for ${func.name}
- def generateJavaAPISignature(func : Func) : String = { + /** + * Generate the Java function interface + * @param func The function case class + * @return A tuple of the formatted function definition and its Param class (an empty string when no Param object is needed) + */ + def generateJavaAPISignature(func : Func) : (String, String) = { val useParamObject = func.listOfArgs.count(arg => arg.isOptional) >= 2 var argDef = ListBuffer[String]() var classDef = ListBuffer[String]() @@ -231,27 +289,38 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { | } | def getOut() = this.out | """.stripMargin - s"""$scalaDocNoParam + (s"""$scalaDocNoParam | $experimentalTag | def ${func.name}(po: ${func.name}Param) : $returnType - | /** + | """.stripMargin, + s"""/** | * This Param Object is specifically used for ${func.name} | ${requiredParam.mkString("\n")} | */ | class ${func.name}Param(${argDef.mkString(",")}) { | ${classDef.mkString("\n ")} - | }""".stripMargin + | }""".stripMargin) } else { argDef += "out : NDArray" - s"""$scalaDoc + (s"""$scalaDoc |$experimentalTag | def ${func.name}(${argDef.mkString(", ")}) : $returnType - | """.stripMargin + | """.stripMargin, "") } } + /** + * Write the formatted strings to a file + * @param FILE_PATH Location the file is written to + * @param packageDef Package definition + * @param className Class name + * @param imports Packages that need to be imported + * @param absFuncs All formatted functions + * @return An MD5 string + */ def writeFile(FILE_PATH: String, packageDef: String, className: String, - imports: String, absFuncs: Seq[String]): String = { + imports: String, absFuncs: Seq[String], + paramClass: Option[Seq[String]] = None): String = { val finalStr = s"""/* @@ -278,7 +347,9 @@ private[mxnet] object APIDocGenerator extends GeneratorBase with RandomHelpers { |// scalastyle:off |abstract class $className { |${absFuncs.mkString("\n")} - |}""".stripMargin + |} + |${paramClass.getOrElse(Seq()).mkString("\n")} + |""".stripMargin val pw = new PrintWriter(new File(FILE_PATH + s"$className.scala")) diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala index b2033f529c65..cc3992c2ebc3 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/GeneratorBase.scala @@ -17,16 +17,20 @@ package org.apache.mxnet -import org.apache.mxnet.init.Base.{RefInt, RefLong, RefString, _LIB} -import org.apache.mxnet.utils.{CToScalaUtils, OperatorBuildUtils} +import org.apache.mxnet.init.Base.{CPtrAddress, RefInt, RefLong, RefString, _LIB} +import org.apache.mxnet.utils.CToScalaUtils import scala.collection.mutable.ListBuffer import scala.reflect.macros.blackbox private[mxnet] abstract class GeneratorBase { - type Handle = Long case class Arg(argName: String, argType: String, argDesc: String, isOptional: Boolean) { + /** + * Rename arg names that clash with Scala keywords and so cannot be used as-is, + * such as var and type listed here. 
This is due to the diff between C and Scala + * @return argname that works in Scala + */ def safeArgName: String = argName match { case "var" => "vari" case "type" => "typeOf" @@ -36,6 +40,14 @@ private[mxnet] abstract class GeneratorBase { case class Func(name: String, desc: String, listOfArgs: List[Arg], returnType: String) + /** + * Non Type-safe function generation method + * This method will filter all "_" functions + * @param isSymbol Check if generate the Symbol method + * @param isContrib Check if generate the contrib method + * @param isJava Check if generate Corresponding Java method + * @return List of functions + */ def functionsToGenerate(isSymbol: Boolean, isContrib: Boolean, isJava: Boolean = false): List[Func] = { val l = getBackEndFunctions(isSymbol, isJava) @@ -46,7 +58,12 @@ private[mxnet] abstract class GeneratorBase { } } - // filter the operators to generate in the type-safe Symbol.api and NDArray.api + /** + * Filter the operators to generate in the type-safe Symbol.api and NDArray.api + * @param isSymbol Check if generate the Symbol method + * @param isContrib Check if generate the contrib method + * @return List of functions + */ protected def typeSafeFunctionsToGenerate(isSymbol: Boolean, isContrib: Boolean): List[Func] = { // Operators that should not be generated val notGenerated = Set("Custom") @@ -60,6 +77,12 @@ private[mxnet] abstract class GeneratorBase { res.filterNot(ele => notGenerated.contains(ele.name)) } + /** + * Extract and format the functions obtained from C API + * @param isSymbol Check if generate for Symbol + * @param isJava Check if extracting in Java format + * @return List of functions + */ protected def getBackEndFunctions(isSymbol: Boolean, isJava: Boolean = false): List[Func] = { val opNames = ListBuffer.empty[String] _LIB.mxListAllOpNames(opNames) @@ -70,7 +93,7 @@ private[mxnet] abstract class GeneratorBase { }).toList } - private def makeAtomicFunction(handle: Handle, aliasName: String, + private def makeAtomicFunction(handle: CPtrAddress, aliasName: String, isSymbol: Boolean, isJava: Boolean): Func = { val name = new RefString val desc = new RefString @@ -82,14 +105,11 @@ private[mxnet] abstract class GeneratorBase { _LIB.mxSymbolGetAtomicSymbolInfo( handle, name, desc, numArgs, argNames, argTypes, argDescs, keyVarNumArgs) - val paramStr = OperatorBuildUtils.ctypes2docstring(argNames, argTypes, argDescs) val extraDoc: String = if (keyVarNumArgs.value != null && keyVarNumArgs.value.length > 0) { s"This function support variable length of positional input (${keyVarNumArgs.value})." 
} else { "" } - val realName = if (aliasName == name.value) "" else s"(a.k.a., ${name.value})" - val docStr = s"$aliasName $realName\n${desc.value}\n\n$paramStr\n$extraDoc\n" val argList = argNames zip argTypes zip argDescs map { case ((argName, argType), argDesc) => val family = if (isJava) "org.apache.mxnet.javaapi.NDArray" @@ -109,10 +129,10 @@ private[mxnet] abstract class GeneratorBase { /** * Generate class structure for all function APIs * - * @param c + * @param c Context used for generation * @param funcDef DefDef type of function definitions - * @param annottees - * @return + * @param annottees Annottees used to define Class or Module + * @return Expr used for code generation */ protected def structGeneration(c: blackbox.Context) (funcDef: List[c.universe.DefDef], annottees: c.Expr[Any]*) @@ -141,11 +161,15 @@ private[mxnet] abstract class GeneratorBase { throw new IllegalArgumentException(s"Invalid macro input: $ex") } // wrap the result up in an Expr, and return it - val result = c.Expr(Block(modDefs, Literal(Constant()))) + val result = c.Expr(Block(modDefs, Literal(Constant(())))) result } - // build function argument definition, with optionality, and safe names + /** + * Build function argument definition, with optionality, and safe names + * @param func Functions + * @return List of string representing the functions interface + */ protected def typedFunctionCommonArgDef(func: Func): List[String] = { func.listOfArgs.map(arg => if (arg.isOptional) { @@ -167,14 +191,23 @@ private[mxnet] abstract class GeneratorBase { private[mxnet] trait RandomHelpers { self: GeneratorBase => - // a generic type spec used in Symbol.random and NDArray.random modules +/** + * A generic type spec used in Symbol.random and NDArray.random modules + * @param isSymbol Check if generate for Symbol + * @param fullPackageSpec Check if leave the full name of the classTag + * @return A formatted string for random Symbol/NDArray + */ protected def randomGenericTypeSpec(isSymbol: Boolean, fullPackageSpec: Boolean): String = { val classTag = if (fullPackageSpec) "scala.reflect.ClassTag" else "ClassTag" if (isSymbol) s"[T: SymbolOrScalar : $classTag]" else s"[T: NDArrayOrScalar : $classTag]" } - // filter the operators to generate in the type-safe Symbol.random and NDArray.random +/** + * Filter the operators to generate in the type-safe Symbol.random and NDArray.random + * @param isSymbol Check if generate Symbol functions + * @return List of functions + */ protected def typeSafeRandomFunctionsToGenerate(isSymbol: Boolean): List[Func] = { getBackEndFunctions(isSymbol) .filter(f => f.name.startsWith("_sample_") || f.name.startsWith("_random_")) @@ -206,16 +239,24 @@ private[mxnet] trait RandomHelpers { ) } - // hacks to manage the fact that random_normal and sample_normal have - // non-consistent parameter naming in the back-end - // this first one, merge loc/scale and mu/sigma + /** + * Hacks to manage the fact that random_normal and sample_normal have + * non-consistent parameter naming in the back-end + * this first one, merge loc/scale and mu/sigma + * @param arg Argument need to modify + * @return Arg case class with clean arg names + */ protected def hackNormalFunc(arg: Arg): Arg = { if (arg.argName == "loc") arg.copy(argName = "mu") else if (arg.argName == "scale") arg.copy(argName = "sigma") else arg } - // this second one reverts this merge prior to back-end call + /** + * This second one reverts this merge prior to back-end call + * @param func Function case class + * @return A string contains the 
implementation of random args + */ protected def unhackNormalFunc(func: Func): String = { if (func.name.equals("normal")) { s"""if(target.equals("random_normal")) { diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala index f5b8bce11cf5..c9c10f50c01f 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/NDArrayMacro.scala @@ -22,16 +22,30 @@ import scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddNDArrayFunctions(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = macro NDArrayMacro.addDefs +/** + * Generate non-typesafe methods for NDArray operations + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro NDArrayMacro.addDefs } private[mxnet] class AddNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = - macro TypedNDArrayAPIMacro.typeSafeAPIDefs +/** + * Generate typesafe methods for NDArray operations + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro TypedNDArrayAPIMacro.typeSafeAPIDefs } private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = +/** + * Generate typesafe methods for random NDArray operations + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro TypedNDArrayRandomAPIMacro.typeSafeAPIDefs } @@ -39,8 +53,13 @@ private[mxnet] class AddNDArrayRandomAPIs(isContrib: Boolean) extends StaticAnno * For non-typed NDArray API */ private[mxnet] object NDArrayMacro extends GeneratorBase { - - def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Method that checks the ``isContrib`` flag and calls code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddNDArrayFunctions($b)" => c.eval[Boolean](c.Expr(b)) @@ -82,8 +101,13 @@ private[mxnet] object NDArrayMacro extends GeneratorBase { * NDArray.api code generation */ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase { - - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Method that checks the ``isContrib`` flag and calls code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddNDArrayAPIs($b)" => c.eval[Boolean](c.Expr(b)) @@ -95,6 +119,12 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase { structGeneration(c)(functionDefs, annottees: _*) } + /** + * Method that constructs the code and builds the syntax tree + * @param c Context used for 
code gen + * @param function Case class that store all information of the single function + * @return Generated syntax tree + */ protected def buildTypedFunction(c: blackbox.Context) (function: Func): c.universe.DefDef = { import c.universe._ @@ -148,8 +178,13 @@ private[mxnet] object TypedNDArrayAPIMacro extends GeneratorBase { */ private[mxnet] object TypedNDArrayRandomAPIMacro extends GeneratorBase with RandomHelpers { - - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * methods that check the ``isContrib`` and call code generation + * @param c Context used for code gen + * @param annottees annottees used to define Class or Module + * @return generated code for injection + */ + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { // Note: no contrib managed in this module val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = false) @@ -158,6 +193,12 @@ private[mxnet] object TypedNDArrayRandomAPIMacro extends GeneratorBase structGeneration(c)(functionDefs, annottees: _*) } + /** + * Methods that construct the code and build the syntax tree + * @param c Context used for code gen + * @param function Case class that store all information of the single function + * @return Generated syntax tree + */ protected def buildTypedFunction(c: blackbox.Context) (function: Func): c.universe.DefDef = { import c.universe._ diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala index 06b567c3d2d4..1a0405cfd63d 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/SymbolMacro.scala @@ -23,16 +23,30 @@ import scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddSymbolFunctions(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = macro SymbolMacro.addDefs +/** + * Generate non-typesafe method for Symbol operations + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro SymbolMacro.addDefs } private[mxnet] class AddSymbolAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = - macro TypedSymbolAPIMacro.typeSafeAPIDefs +/** + * Generate typesafe method for Symbol + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro TypedSymbolAPIMacro.typeSafeAPIDefs } private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = +/** + * Generate typesafe method for Random Symbol + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro TypedSymbolRandomAPIMacro.typeSafeAPIDefs } @@ -41,7 +55,13 @@ private[mxnet] class AddSymbolRandomAPIs(isContrib: Boolean) extends StaticAnnot */ private[mxnet] object SymbolMacro extends GeneratorBase { - def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Methods that check the ``isContrib`` and call code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or 
Module + * @return Generated code for injection + */ + def addDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddSymbolFunctions($b)" => c.eval[Boolean](c.Expr(b)) @@ -77,7 +97,13 @@ private[mxnet] object SymbolMacro extends GeneratorBase { */ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Methods that check the ``isContrib`` and call code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { import c.universe._ val isContrib: Boolean = c.prefix.tree match { case q"new AddSymbolAPIs($b)" => c.eval[Boolean](c.Expr(b)) @@ -89,6 +115,12 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase { structGeneration(c)(functionDefs, annottees: _*) } + /** + * Methods that construct the code and build the syntax tree + * @param c Context used for code gen + * @param function Case class that store all information of the single function + * @return Generated syntax tree + */ protected def buildTypedFunction(c: blackbox.Context) (function: Func): c.universe.DefDef = { import c.universe._ @@ -141,13 +173,25 @@ private[mxnet] object TypedSymbolAPIMacro extends GeneratorBase { private[mxnet] object TypedSymbolRandomAPIMacro extends GeneratorBase with RandomHelpers { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Methods that check the ``isContrib`` and call code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*): c.Expr[Any] = { val functionDefs = typeSafeRandomFunctionsToGenerate(isSymbol = true) .map(f => buildTypedFunction(c)(f)) structGeneration(c)(functionDefs, annottees: _*) } + /** + * Methods that construct the code and build the syntax tree + * @param c Context used for code gen + * @param function Case class that store all information of the single function + * @return Generated syntax tree + */ protected def buildTypedFunction(c: blackbox.Context) (function: Func): c.universe.DefDef = { import c.universe._ diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala index 9bf0818c14a4..29206247296d 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala @@ -25,12 +25,23 @@ import scala.language.experimental.macros import scala.reflect.macros.blackbox private[mxnet] class AddJNDArrayAPIs(isContrib: Boolean) extends StaticAnnotation { - private[mxnet] def macroTransform(annottees: Any*): Any = macro JavaNDArrayMacro.typeSafeAPIDefs +/** + * Generate typesafe method for Java NDArray operations + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + private[mxnet] def macroTransform(annottees: Any*) = macro JavaNDArrayMacro.typeSafeAPIDefs } private[mxnet] object JavaNDArrayMacro extends GeneratorBase { - def typeSafeAPIDefs(c: blackbox.Context)(annottees: 
c.Expr[Any]*): c.Expr[Nothing] = { + /** + * Methods that call code generation + * @param c Context used for code gen + * @param annottees Annottees used to define Class or Module + * @return Generated code for injection + */ + def typeSafeAPIDefs(c: blackbox.Context)(annottees: c.Expr[Any]*) : c.Expr[Any] = { typeSafeAPIImpl(c)(annottees: _*) } diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala index 57c4cfba10b7..c984d07143ef 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/utils/CToScalaUtils.scala @@ -31,7 +31,15 @@ private[mxnet] object CToScalaUtils { "double" -> "Double", "bool" -> "Boolean") - // Convert C++ Types to Scala Types + /** + * Convert C++ Types to Scala Types + * @param in Input raw string that contains C type docs + * @param argType Arg type that used for error messaging + * @param argName Arg name used for error messaging + * @param returnType The type that NDArray/Symbol should be + * @param isJava Check if generating for Java + * @return String that contains right Scala/Java types + */ def typeConversion(in : String, argType : String = "", argName : String, returnType : String, isJava : Boolean) : String = { val header = returnType.split("\\.").dropRight(1) @@ -47,7 +55,8 @@ private[mxnet] object CToScalaUtils { case "double" | "doubleorNone" => types("double") case "string" => "String" case "boolean" | "booleanorNone" => types("bool") - case "tupleof" | "tupleof" | "tupleof<>" | "ptr" | "" => "Any" + case "tupleof" | "tupleof" | "tupleof" | "tupleof" | + "tupleof<>" | "ptr" | "" => "Any" case default => throw new IllegalArgumentException( s"Invalid type for args: $default\nString argType: $argType\nargName: $argName") } @@ -63,6 +72,8 @@ private[mxnet] object CToScalaUtils { * optional, what is it Scala type and possibly pass in a default value * @param argName The name of the argument * @param argType Raw arguement Type description + * @param returnType Return type of the function (Symbol/NDArray) + * @param isJava Check if Java args should be generated * @return (Scala_Type, isOptional) */ def argumentCleaner(argName: String, argType : String, diff --git a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java index 32e2d84dcdbf..4361c06edf32 100644 --- a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java +++ b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayCreation.java @@ -37,7 +37,7 @@ public static void main(String[] args) { // random NDArray random = NDArray.random_uniform( - NDArray.new random_uniformParam() + new random_uniformParam() .setLow(0.0f) .setHigh(2.0f) .setShape(new Shape(new int[]{10, 10})) diff --git a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java index 56a414307f46..646adf5550b1 100644 --- a/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java +++ b/scala-package/mxnet-demo/java-demo/src/main/java/mxnet/NDArrayOperation.java @@ -38,7 +38,7 @@ public static void main(String[] args) { System.out.println(eleAdd); // norm (L2 Norm) - NDArray normed = NDArray.norm(NDArray.new normParam(nd))[0]; + NDArray normed = NDArray.norm(new normParam(nd))[0]; 
System.out.println(normed); } } diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc index 33e4cca99b3a..7323d23ac556 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.cc @@ -354,9 +354,9 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayLoadFromRawBytes JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape (JNIEnv *env, jobject obj, jlong ndArrayPtr, jobject ndimRef, jobject dataBuf) { - mx_uint ndim; - const mx_uint *pdata; - int ret = MXNDArrayGetShape(reinterpret_cast(ndArrayPtr), &ndim, &pdata); + int ndim; + const int *pdata; + int ret = MXNDArrayGetShapeEx(reinterpret_cast(ndArrayPtr), &ndim, &pdata); // fill dataBuf jclass integerClass = env->FindClass("java/lang/Integer"); @@ -365,7 +365,7 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxNDArrayGetShape jclass arrayClass = env->FindClass("scala/collection/mutable/ArrayBuffer"); jmethodID arrayAppend = env->GetMethodID(arrayClass, "$plus$eq", "(Ljava/lang/Object;)Lscala/collection/mutable/ArrayBuffer;"); - for (size_t i = 0; i < ndim; ++i) { + for (int i = 0; i < ndim; ++i) { jobject data = env->NewObject(integerClass, newInteger, pdata[i]); env->CallObjectMethod(dataBuf, arrayAppend, data); env->DeleteLocalRef(data); @@ -892,6 +892,119 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorBackward return ret; } +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorReshape + (JNIEnv * env, jobject obj, + jint partialReshaping, jint allowUpSizing, jint devType, jint devId, + jobjectArray jmapKeys, jintArray jmapDevTypes, jintArray jmapDevIds, + jobjectArray jprovidedArgShapeNames, jintArray jprovidedArgShapeData, + jintArray jprovidedArgShapeIdx, jobject jrefInArgs, jobject jrefArgGrads, + jobject jrefAuxStates, jlong jsharedExec, jobject jrefOut) { + CHECK(jmapKeys != NULL); + CHECK(jprovidedArgShapeNames != NULL); + + int numMapKeys = env->GetArrayLength(jmapKeys); + jint *mapDevTypes = env->GetIntArrayElements(jmapDevTypes, NULL); + jint *mapDevIds = env->GetIntArrayElements(jmapDevIds, NULL); + const char **mapKeys = NULL; + if (numMapKeys > 0) { + mapKeys = new const char*[numMapKeys]; + for (int i = 0; i < numMapKeys; ++i) { + jstring jkey = reinterpret_cast(env->GetObjectArrayElement(jmapKeys, i)); + mapKeys[i] = env->GetStringUTFChars(jkey, 0); + env->DeleteLocalRef(jkey); + } + } + + int numProvidedArgShapes = env->GetArrayLength(jprovidedArgShapeNames); + jint *providedArgShapeData = env->GetIntArrayElements(jprovidedArgShapeData, NULL); + jint *providedArgShapeIdx = env->GetIntArrayElements(jprovidedArgShapeIdx, NULL); + const char **providedArgShapeNames = NULL; + if (numProvidedArgShapes > 0) { + providedArgShapeNames = new const char*[numProvidedArgShapes]; + for (int i = 0; i < numProvidedArgShapes; ++i) { + jstring jkey = reinterpret_cast( + env->GetObjectArrayElement(jprovidedArgShapeNames, i)); + providedArgShapeNames[i] = env->GetStringUTFChars(jkey, 0); + env->DeleteLocalRef(jkey); + } + } + + mx_uint numInArgs = 0; + NDArrayHandle *inArgs; + NDArrayHandle *argGrads; + + mx_uint numAuxStates = 0; + NDArrayHandle *auxStates; + + ExecutorHandle out; + + int ret = MXExecutorReshapeEx(partialReshaping, + allowUpSizing, + devType, + devId, + static_cast(numMapKeys), + mapKeys, + static_cast(mapDevTypes), + static_cast(mapDevIds), + 
static_cast(numProvidedArgShapes), + providedArgShapeNames, + static_cast(providedArgShapeData), + reinterpret_cast(providedArgShapeIdx), + &numInArgs, + &inArgs, + &argGrads, + &numAuxStates, + &auxStates, + reinterpret_cast(jsharedExec), + &out); + + jclass longCls = env->FindClass("java/lang/Long"); + jmethodID newLong = env->GetMethodID(longCls, "", "(J)V"); + + jclass arrayClass = env->FindClass("scala/collection/mutable/ArrayBuffer"); + jmethodID arrayAppend = env->GetMethodID(arrayClass, + "$plus$eq", "(Ljava/lang/Object;)Lscala/collection/mutable/ArrayBuffer;"); + + for (size_t i = 0; i < numInArgs; ++i) { + jobject inArg = env->NewObject(longCls, newLong, inArgs[i]); + env->CallObjectMethod(jrefInArgs, arrayAppend, inArg); + env->DeleteLocalRef(inArg); + + jobject argGrad = env->NewObject(longCls, newLong, argGrads[i]); + env->CallObjectMethod(jrefArgGrads, arrayAppend, argGrad); + env->DeleteLocalRef(argGrad); + } + + for (size_t i = 0; i < numAuxStates; ++i) { + jobject auxState = env->NewObject(longCls, newLong, auxStates[i]); + env->CallObjectMethod(jrefAuxStates, arrayAppend, auxState); + env->DeleteLocalRef(auxState); + } + + SetLongField(env, jrefOut, reinterpret_cast(out)); + + // release allocated memory + for (int i = 0; i < numMapKeys; i++) { + jstring jkey = reinterpret_cast(env->GetObjectArrayElement(jmapKeys, i)); + env->ReleaseStringUTFChars(jkey, mapKeys[i]); + env->DeleteLocalRef(jkey); + } + if (mapKeys != NULL) { + delete[] mapKeys; + } + + for (int i = 0; i < numProvidedArgShapes; i++) { + jstring jkey = reinterpret_cast(env->GetObjectArrayElement(jprovidedArgShapeNames, i)); + env->ReleaseStringUTFChars(jkey, providedArgShapeNames[i]); + env->DeleteLocalRef(jkey); + } + if (providedArgShapeNames != NULL) { + delete[] providedArgShapeNames; + } + + return ret; +} + JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorPrint (JNIEnv * env, jobject obj, jlong ptr, jobject debugStr) { const char *retDebugStr; @@ -1530,23 +1643,27 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolCreateFromFile int FillSymbolInferShape (JNIEnv *env, jmethodID listAppend, jobject joutData, - mx_uint shapeSize, const mx_uint *shapeNdim, const mx_uint **shapeData) { - for (size_t i = 0; i < shapeSize; ++i) { - jintArray jshape = env->NewIntArray(shapeNdim[i]); - if (jshape == NULL) { - // TODO(Yizhi): out of memory error thrown, return a specific error code ? - return -1; + int shapeSize, const int *shapeNdim, const int **shapeData) { + for (int i = 0; i < shapeSize; ++i) { + jintArray jshape = NULL; + if (shapeNdim[i] >= 0) { + jshape = env->NewIntArray(shapeNdim[i]); + if (jshape == NULL) { + // TODO(Yizhi): out of memory error thrown, return a specific error code ? 
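+ // NewIntArray has already raised a pending OutOfMemoryError in the JVM at this point, so -1 only signals the native caller to unwind.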
+ return -1; + } + env->SetIntArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); } - env->SetIntArrayRegion(jshape, 0, shapeNdim[i], reinterpret_cast(shapeData[i])); env->CallObjectMethod(joutData, listAppend, jshape); env->DeleteLocalRef(jshape); } return 0; } -JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape - (JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, jobjectArray jkeys, - jintArray jargIndPtr, jintArray jargShapeData, - jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, jobject jcomplete) { + +int SymbolInferShapeHelper(JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, + jobjectArray jkeys, jintArray jargIndPtr, jintArray jargShapeData, + jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, + jobject jcomplete, bool partial) { const char **keys = NULL; if (jkeys != NULL) { keys = new const char *[jnumArgs]; @@ -1559,26 +1676,28 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape } mx_uint inShapeSize; - const mx_uint *inShapeNdim; - const mx_uint **inShapeData; + const int *inShapeNdim; + const int **inShapeData; mx_uint outShapeSize; - const mx_uint *outShapeNdim; - const mx_uint **outShapeData; + const int *outShapeNdim; + const int **outShapeData; mx_uint auxShapeSize; - const mx_uint *auxShapeNdim; - const mx_uint **auxShapeData; + const int *auxShapeNdim; + const int **auxShapeData; int complete; jint *argIndPtr = env->GetIntArrayElements(jargIndPtr, NULL); jint *argShapeData = env->GetIntArrayElements(jargShapeData, NULL); - int ret = MXSymbolInferShape(reinterpret_cast(symbolPtr), + int ret; + if (!partial) { + ret = MXSymbolInferShapeEx(reinterpret_cast(symbolPtr), static_cast(jnumArgs), keys, - reinterpret_cast(argIndPtr), - reinterpret_cast(argShapeData), + reinterpret_cast(argIndPtr), + reinterpret_cast(argShapeData), &inShapeSize, &inShapeNdim, &inShapeData, @@ -1589,6 +1708,23 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape &auxShapeNdim, &auxShapeData, &complete); + } else { + ret = MXSymbolInferShapePartialEx(reinterpret_cast(symbolPtr), + static_cast(jnumArgs), + keys, + reinterpret_cast(argIndPtr), + reinterpret_cast(argShapeData), + &inShapeSize, + &inShapeNdim, + &inShapeData, + &outShapeSize, + &outShapeNdim, + &outShapeData, + &auxShapeSize, + &auxShapeNdim, + &auxShapeData, + &complete); + } env->ReleaseIntArrayElements(jargShapeData, argShapeData, 0); env->ReleaseIntArrayElements(jargIndPtr, argIndPtr, 0); @@ -1629,6 +1765,24 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape return ret; } +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape + (JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, jobjectArray jkeys, + jintArray jargIndPtr, jintArray jargShapeData, + jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, jobject jcomplete) { + + return SymbolInferShapeHelper(env, obj, symbolPtr, jnumArgs, jkeys, jargIndPtr, jargShapeData, + jinShapeData, joutShapeData, jauxShapeData, jcomplete, false); +} + +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShapePartial + (JNIEnv *env, jobject obj, jlong symbolPtr, jint jnumArgs, jobjectArray jkeys, + jintArray jargIndPtr, jintArray jargShapeData, + jobject jinShapeData, jobject joutShapeData, jobject jauxShapeData, jobject jcomplete) { + + return SymbolInferShapeHelper(env, obj, symbolPtr, jnumArgs, jkeys, jargIndPtr, jargShapeData, + jinShapeData, joutShapeData, jauxShapeData, jcomplete, 
true); +} + JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorBindX (JNIEnv *env, jobject obj, jlong symbolPtr, jint deviceTypeId, jint deviceID, jint numCtx, jobjectArray jctxMapKeys, jintArray jctxMapDevTypes, jintArray jctxMapDevIDs, jint numArgs, @@ -2551,3 +2705,20 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile (JNIEnv *env, jobject obj, jint finished) { return MXDumpProfile(finished); } + +// Numpy +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxIsNumpyCompatible + (JNIEnv *env, jobject obj, jobject compatibleRef) { + bool isCompatible; + int ret = MXIsNumpyCompatible(&isCompatible); + SetIntField(env, compatibleRef, static_cast(isCompatible)); + return ret; +} + +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSetIsNumpyCompatible + (JNIEnv *env, jobject obj, jint isNpComp, jobject prevRef) { + int prev; + int ret = MXSetIsNumpyCompatible(isNpComp, &prev); + SetIntField(env, prevRef, prev); + return ret; +} diff --git a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h index b8a9b3b9e64f..467272cea9cf 100644 --- a/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h +++ b/scala-package/native/src/main/native/org_apache_mxnet_native_c_api.h @@ -511,6 +511,14 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorPrint JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorSetMonitorCallback (JNIEnv *, jobject, jlong, jobject); +/* + * Class: org_apache_mxnet_LibInfo + * Method: mxExecutorReshape + * Signature: (IIII[Ljava/lang/String;[I[I[Ljava/lang/String;[I[ILscala/collection/mutable/ArrayBuffer;Lscala/collection/mutable/ArrayBuffer;Lscala/collection/mutable/ArrayBuffer;JLorg/apache/mxnet/Base/RefLong;)I + */ +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxExecutorReshape + (JNIEnv *, jobject, jint, jint, jint, jint, jobjectArray, jintArray, jintArray, jobjectArray, jintArray, jintArray, jobject, jobject, jobject, jlong, jobject); + /* * Class: org_apache_mxnet_LibInfo * Method: mxSymbolListAtomicSymbolCreators @@ -655,6 +663,14 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferType JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShape (JNIEnv *, jobject, jlong, jint, jobjectArray, jintArray, jintArray, jobject, jobject, jobject, jobject); +/* + * Class: org_apache_mxnet_LibInfo + * Method: mxSymbolInferShapePartial + * Signature: (JI[Ljava/lang/String;[I[ILscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lscala/collection/mutable/ListBuffer;Lorg/apache/mxnet/Base/RefInt;)I + */ +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSymbolInferShapePartial + (JNIEnv *, jobject, jlong, jint, jobjectArray, jintArray, jintArray, jobject, jobject, jobject, jobject); + /* * Class: org_apache_mxnet_LibInfo * Method: mxSymbolGetOutput @@ -855,6 +871,22 @@ JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxSetProfilerState JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxDumpProfile (JNIEnv *, jobject, jint); +/* + * Class: org_apache_mxnet_LibInfo + * Method: mxIsNumpyCompatible + * Signature: (Lorg/apache/mxnet/Base/RefInt;)I + */ +JNIEXPORT jint JNICALL Java_org_apache_mxnet_LibInfo_mxIsNumpyCompatible + (JNIEnv *, jobject, jobject); + +/* + * Class: org_apache_mxnet_LibInfo + * Method: mxSetIsNumpyCompatible + * Signature: (ILorg/apache/mxnet/Base/RefInt;)I + */ +JNIEXPORT jint JNICALL 
Java_org_apache_mxnet_LibInfo_mxSetIsNumpyCompatible + (JNIEnv *, jobject, jint, jobject); + #ifdef __cplusplus } #endif diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 44b784a56f8e..46726bc00472 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -357,6 +357,10 @@ 2.1.0 + + -feature + -deprecation + diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LabeledPointIter.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LabeledPointIter.scala index bf1b26e4b48d..44d6f3345bdf 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LabeledPointIter.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LabeledPointIter.scala @@ -115,13 +115,13 @@ class LabeledPointIter private[mxnet]( } // The name and shape of label provided by this iterator - @deprecated + @deprecated("Use provideLabelDesc instead", "1.3.0") override def provideLabel: ListMap[String, Shape] = { ListMap(labelName -> Shape(_batchSize)) } // The name and shape of data provided by this iterator - @deprecated + @deprecated("Use provideDataDesc instead", "1.3.0") override def provideData: ListMap[String, Shape] = { ListMap(dataName -> dataShape) } diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LongLivingDataBatch.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LongLivingDataBatch.scala index e3272a4066b5..abf82f6e510c 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LongLivingDataBatch.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/LongLivingDataBatch.scala @@ -28,8 +28,7 @@ class LongLivingDataBatch( override val data: IndexedSeq[NDArray], override val label: IndexedSeq[NDArray], override val index: IndexedSeq[Long], - override val pad: Int) extends DataBatch(data, label, index, pad, - null, null, null) { + override val pad: Int) extends DataBatch(data, label, index, pad) { override def dispose(): Unit = {} def disposeForce(): Unit = super.dispose() } diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/PointIter.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/PointIter.scala index a955ee74e7e2..1ca23927e123 100644 --- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/PointIter.scala +++ b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/io/PointIter.scala @@ -114,13 +114,13 @@ class PointIter private[mxnet]( } // The name and shape of label provided by this iterator - @deprecated + @deprecated("Use provideLabelDesc instead", "1.3.0") override def provideLabel: ListMap[String, Shape] = { ListMap(labelName -> Shape(_batchSize)) } // The name and shape of data provided by this iterator - @deprecated + @deprecated("Use provideDataDesc instead", "1.3.0") override def provideData: ListMap[String, Shape] = { ListMap(dataName -> dataShape) } diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala index 7a2417bdea1f..2382ca9fa358 100644 --- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala +++ b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala @@ -20,6 +20,7 @@ package org.apache.mxnet.spark import java.io.{BufferedReader, File, InputStreamReader} import java.nio.file.Files +import scala.language.postfixOps import scala.sys.process.Process import org.apache.spark.SparkContext diff --git 
a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 45197aafe019..f549ddd13994 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -44,9 +44,11 @@ #include "mxnet/rtc.h" #include "mxnet/storage.h" #include "mxnet/libinfo.h" +#include "mxnet/imperative.h" #include "./c_api_common.h" #include "../operator/custom/custom-inl.h" #include "../operator/tensor/matrix_op-inl.h" +#include "../common/utils.h" using namespace mxnet; @@ -471,7 +473,7 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle, NDArray *ptr = new NDArray(); API_BEGIN(); NDArray *arr = static_cast(handle); - nnvm::Tuple shape(dims, dims+ndim); + mxnet::Tuple shape(dims, dims+ndim); CHECK_GT(arr->shape().Size(), 0) << "Source ndarray's shape is undefined. Input shape: " << arr->shape(); mxnet::TShape new_shape = mxnet::op::InferReshapeShape(shape, arr->shape(), reverse); @@ -511,6 +513,34 @@ int MXNDArrayGetShape(NDArrayHandle handle, API_END(); } +int MXNDArrayGetShapeEx(NDArrayHandle handle, + int *out_dim, + const int **out_pdata) { + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + API_BEGIN(); + NDArray *arr = static_cast(handle); + if (!arr->is_none()) { + mxnet::TShape s = arr->shape(); + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToLegacyShape(&s); + } + *out_dim = s.ndim(); + if (s.ndim() >= 0) { + std::vector &buffer = ret->arg_shape_buffer_ex; + buffer.resize(s.ndim()); + mxnet::ShapeTypeCast(s.begin(), s.end(), buffer.data()); + *out_pdata = buffer.data(); + } + } else { + if (Imperative::Get()->is_np_comp()) { + *out_dim = -1; + } else { + *out_dim = 0; + } + } + API_END(); +} + int MXNDArrayGetData(NDArrayHandle handle, void **out_pdata) { API_BEGIN(); @@ -791,7 +821,7 @@ int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) { // temp hack to make label 1D // TODO(tianjun) make label 1D when label_width=0 mxnet::TShape shape = db.data[1].shape(); - if (shape[1] == 1) { + if (shape.ndim() > 1 && shape[1] == 1) { *pndarray = db.data[1].Reshape(mshadow::Shape1(shape[0])); } else { *pndarray = db.data[1]; @@ -1402,6 +1432,13 @@ int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const mx_uint *s API_END(); } +int MXNDArrayCreateFromSharedMemEx(int shared_pid, int shared_id, const int *shape, + int ndim, int dtype, NDArrayHandle *out) { + API_BEGIN(); + *out = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype); + API_END(); +} + typedef Engine::VarHandle VarHandle; typedef Engine::CallbackOnComplete CallbackOnComplete; diff --git a/src/c_api/c_api_common.h b/src/c_api/c_api_common.h index b5adfa37eca9..013ecab93da8 100644 --- a/src/c_api/c_api_common.h +++ b/src/c_api/c_api_common.h @@ -75,12 +75,19 @@ struct MXAPIThreadLocalEntry { std::vector arg_storage_types, out_storage_types, aux_storage_types; /*! \brief result holder for returning shape dimensions */ std::vector arg_shape_ndim, out_shape_ndim, aux_shape_ndim; + /*! \brief result holder for returning shape dimensions */ + std::vector arg_shape_ndim_ex, out_shape_ndim_ex, aux_shape_ndim_ex; /*! \brief result holder for returning shape pointer */ std::vector arg_shape_data, out_shape_data, aux_shape_data; + /*! \brief result holder for returning shape pointer */ + std::vector arg_shape_data_ex, out_shape_data_ex, aux_shape_data_ex; /*! \brief uint32_t buffer for returning shape pointer */ std::vector arg_shape_buffer, out_shape_buffer, aux_shape_buffer; + /*! 
\brief uint32_t buffer for returning shape pointer */ + std::vector arg_shape_buffer_ex, out_shape_buffer_ex, aux_shape_buffer_ex; /*! \brief bool buffer */ std::vector save_inputs, save_outputs; + // DEPRECATED. Use SetupShapeArrayReturnWithBufferEx instead. // helper function to setup return value of shape array inline static void SetupShapeArrayReturnWithBuffer( const mxnet::ShapeVector &shapes, @@ -99,6 +106,30 @@ struct MXAPIThreadLocalEntry { ptr = nnvm::ShapeTypeCast(shapes[i].begin(), shapes[i].end(), ptr); } } + // helper function to setup return value of shape array + inline static void SetupShapeArrayReturnWithBufferEx( + const mxnet::ShapeVector &shapes, + std::vector *ndim, + std::vector *data, + std::vector *buffer) { + ndim->resize(shapes.size()); + data->resize(shapes.size()); + size_t size = 0; + for (const auto& s : shapes) { + if (s.ndim() > 0) { + size += s.ndim(); + } + } + buffer->resize(size); + int *ptr = buffer->data(); + for (size_t i = 0; i < shapes.size(); ++i) { + ndim->at(i) = shapes[i].ndim(); + data->at(i) = ptr; + if (shapes[i].ndim() > 0) { + ptr = mxnet::ShapeTypeCast(shapes[i].begin(), shapes[i].end(), ptr); + } + } + } }; // define the threadlocal store. diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index a2e8bb810e6f..5352fcfe0951 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -25,8 +25,10 @@ #include #include #include +#include #include "./c_api_common.h" #include "../executor/graph_executor.h" +#include "../common/utils.h" #if MXNET_USE_TENSORRT #include "../executor/trt_graph_executor.h" #endif // MXNET_USE_TENSORRT @@ -183,7 +185,7 @@ int MXExecutorBindEX(SymbolHandle symbol_handle, } /*! - * \brief + * \brief DEPRECATED. Use MXExecutorSimpleBindEx instead. * \param symbol_handle symbol handle * \param dev_type default device type * \param dev_id default device id @@ -416,6 +418,371 @@ int MXExecutorSimpleBind(SymbolHandle symbol_handle, CHECK(p.second) << "Duplicate shapes are provided for argument " << provided_arg_shape_names[i] << " in simple_bind"; } + if (!Imperative::Get()->is_np_comp()) { + for (auto &kv : arg_shape_map) { + common::ConvertToNumpyShape(&kv.second); + } + } + + // create para name set for sharing data array memory + std::unordered_set shared_arg_name_set(num_shared_arg_names); + for (mx_uint i = 0; i < num_shared_arg_names; ++i) { + shared_arg_name_set.insert(shared_arg_name_list[i]); + } + + // create shared_buffer_map + std::unordered_map shared_buffer_map; + bool use_shared_buffer = (*shared_buffer_len >= 0); + if (*shared_buffer_len > 0) { + // create shared_buffer_map + shared_buffer_map.reserve(*shared_buffer_len); + NDArray** shared_buffer_ptrs = + reinterpret_cast(shared_buffer_handle_list); + for (int i = 0; i < *shared_buffer_len; ++i) { + shared_buffer_map[shared_buffer_name_list[i]] = *(shared_buffer_ptrs[i]); + } + } + + // create temporary place holders for the initialized NDArrays + // to be passed back to front end + std::vector in_arg_vec; + std::vector arg_grad_vec; + std::vector aux_state_vec; +#if MXNET_USE_TENSORRT + // If we've built with TensorRT support we by default return an TRTExecutor. + // Users can override this behaviour via env var, which is useful for example for A/B + // performance testing. 
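+ // Gating happens once per bind: MXNET_USE_TENSORRT defaults to false, so a TensorRT build still returns the standard executor unless the user opts in.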
+ if (dmlc::GetEnv("MXNET_USE_TENSORRT", false)) { + *out = exec::TrtGraphExecutor::TensorRTBind(*sym, ctx, ctx_map, &in_arg_ctx_vec, + &arg_grad_ctx_vec, &aux_state_ctx_vec, + &arg_shape_map, &arg_dtype_map, &arg_stype_map, + &grad_req_type_vec, shared_arg_name_set, + &in_arg_vec, &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); + } else { + // Checks to see if this env var has been set to true or false by the user. + // If the user is using a TensorRT build, but has not enabled TRT at inference time, warn + // them and describe further steps. + const int unset_indicator = std::numeric_limits::quiet_NaN(); + if (dmlc::GetEnv("MXNET_USE_TENSORRT", unset_indicator) == unset_indicator) { + LOG(INFO) << "TensorRT not enabled by default. Please set the MXNET_USE_TENSORRT " + "environment variable to 1 or call mx.contrib.tensorrt.set_use_tensorrt(True) " + "to enable."; + } +#endif // MXNET_USE_TENSORRT + *out = Executor::SimpleBind(*sym, ctx, ctx_map, in_arg_ctx_vec, arg_grad_ctx_vec, + aux_state_ctx_vec, arg_shape_map, arg_dtype_map, arg_stype_map, + grad_req_type_vec, shared_arg_name_set, &in_arg_vec, + &arg_grad_vec, &aux_state_vec, + use_shared_buffer ? &shared_buffer_map : nullptr, + reinterpret_cast(shared_exec_handle)); +#if MXNET_USE_TENSORRT + } +#endif // MXNET_USE_TENSORRT + + // copy ndarray ptrs to ret->handles so that front end + // can access them + ret->ret_handles.clear(); + ret->ret_handles.reserve(in_arg_vec.size()+arg_grad_vec.size()+aux_state_vec.size() + +shared_buffer_map.size()); + size_t nd_idx = 0; + for (const auto& nd : in_arg_vec) { + if (nd.is_none()) { + LOG(FATAL) << "Input argument NDArray cannot be un-allocated"; + } + ret->ret_handles.push_back(new NDArray(nd)); + } + if (in_arg_vec.size() > 0) { + *num_in_args = in_arg_vec.size(); + *in_args = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + + for (const auto& nd : arg_grad_vec) { + if (nd.is_none()) { + ret->ret_handles.push_back(nullptr); + } else { + ret->ret_handles.push_back(new NDArray(nd)); + } + } + if (arg_grad_vec.size() > 0) { + *arg_grads = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + + for (const auto& nd : aux_state_vec) { + if (nd.is_none()) { + LOG(FATAL) << "Auxiliary argument NDArray cannot be un-allocated"; + } + ret->ret_handles.push_back(new NDArray(nd)); + } + if (aux_state_vec.size() > 0) { + *num_aux_states = aux_state_vec.size(); + *aux_states = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + + if (use_shared_buffer) { + ret->ret_vec_str.clear(); + ret->ret_vec_str.reserve(shared_buffer_map.size()); + ret->ret_vec_charp.clear(); + ret->ret_vec_charp.reserve(shared_buffer_map.size()); + for (const auto& kv : shared_buffer_map) { + if (kv.second.is_none()) { + LOG(FATAL) << "Shared data NDArray cannot be un-allocated"; + } + ret->ret_handles.push_back(new NDArray(kv.second)); + ret->ret_vec_str.emplace_back(kv.first); + ret->ret_vec_charp.push_back(ret->ret_vec_str.back().c_str()); + } + *shared_buffer_len = shared_buffer_map.size(); + *updated_shared_buffer_handle_list = &(ret->ret_handles[nd_idx]); + *updated_shared_buffer_name_list = &(ret->ret_vec_charp[0]); + } + + API_END(); +} + +/*! 
+ * \brief + * \param symbol_handle symbol handle + * \param dev_type default device type + * \param dev_id default device id + * \param num_g2c_keys number of group2ctx keys + * \param g2c_keys key list of group2ctx + * \param g2c_dev_types device type list of group2ctx + * \param g2c_dev_ids id list of group2ctx + * \param provided_grad_req_list_len grad_req length provided by users in front-end + * \param provided_grad_req_names grad_req names provided by users in front-end + * \param provided_grad_req_types req types provided by users in front-end + * \param num_provided_arg_shapes number of user provided in_arg and aux_state shapes + * \param provided_arg_shape_names name list of provided shapes + * \param provided_arg_shape_data provided shape data + * \param provided_arg_shape_idx provided shape data index + * \param num_provided_arg_dtypes number of user provided in_arg and aux_state dtypes + * \param provided_arg_dtype_names argument name list of provided dtypes + * \param provided_arg_dtypes data of provided dtypes + * \param num_provided_arg_stypes number of user provided in_arg and aux_state storage types + * \param provided_arg_stype_names argument name list of provided storage types + * \param provided_arg_stypes data of provided storage types + * \param num_shared_arg_names number of parameter names passed from _bind_ith_exec + * \param shared_arg_name_list parameter name list passed from _bind_ith_exec + * \param shared_buffer_len number of shared data arrays passed from _bind_ith_exec + * \param shared_buffer_name_list shared data array names passed from _bind_ith_exec + * \param shared_buffer_handle_list shared data array handles passed from _bind_ith_exec + * \param updated_shared_buffer_name_list updated shared data array names after binding + * \param updated_shared_buffer_handle_list updated shared data arrays after binding + * \param num_in_args number of input arguments of this sym + * \param in_args list_arguments associated with the current executor + * \param arg_grads list of gradients of in_args associated with the current executor + * \param num_aux_states number of aux states of this sym + * \param aux_states list_auxiliary_states associated with the current executor + * \param shared_exec_handle shared executor handle passed from _bind_ith_exec + * \param out the handle of the executor to be created + */ +int MXExecutorSimpleBindEx(SymbolHandle symbol_handle, + int dev_type, + int dev_id, + const mx_uint num_g2c_keys, + const char** g2c_keys, + const int* g2c_dev_types, + const int* g2c_dev_ids, + const mx_uint provided_grad_req_list_len, + const char** provided_grad_req_names, + const char** provided_grad_req_types, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int* provided_arg_shape_data, + const mx_uint* provided_arg_shape_idx, + const mx_uint num_provided_arg_dtypes, + const char** provided_arg_dtype_names, + const int* provided_arg_dtypes, + const mx_uint num_provided_arg_stypes, + const char** provided_arg_stype_names, + const int* provided_arg_stypes, + const mx_uint num_shared_arg_names, + const char** shared_arg_name_list, + int* shared_buffer_len, + const char** shared_buffer_name_list, + NDArrayHandle* shared_buffer_handle_list, + const char*** updated_shared_buffer_name_list, + NDArrayHandle** updated_shared_buffer_handle_list, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec_handle, + 
ExecutorHandle* out) { + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + API_BEGIN(); + nnvm::Symbol *sym = static_cast(symbol_handle); + + // get in_arg names + std::vector in_arg_names = sym->ListInputNames(nnvm::Symbol::kReadOnlyArgs); + std::vector aux_state_names = sym->ListInputNames(nnvm::Symbol::kAuxiliaryStates); + + // attr_dict for setting up type_dict and arg/aux ctx + std::unordered_map> attr_dict; + if (nullptr == provided_arg_dtypes || nullptr != g2c_keys || nullptr == provided_arg_stypes) { + std::vector> attrs = + sym->ListAttrsRecursive(); + attr_dict.reserve(attrs.size()); + for (const auto& tp : attrs) { + attr_dict[std::get<0>(tp)][std::get<1>(tp)] = std::get<2>(tp); + } + } + + // setup arg_dtype_map + std::unordered_map arg_dtype_map; + if (nullptr == provided_arg_dtypes) { // use attr_dict + for (const auto& arg_name : in_arg_names) { + const auto it = attr_dict.find(arg_name); + if (it == attr_dict.end() || !it->second.count("__dtype__")) { + arg_dtype_map[arg_name] = mshadow::kFloat32; + } + } + } else { // use user input type_dict + // create dtype map for in_args and aux_states + arg_dtype_map.reserve(num_provided_arg_dtypes); + for (mx_uint i = 0; i < num_provided_arg_dtypes; ++i) { + arg_dtype_map[provided_arg_dtype_names[i]] = provided_arg_dtypes[i]; + } + } + + // setup arg_stype_map + std::unordered_map arg_stype_map; + if (nullptr == provided_arg_stypes) { // use attr_dict + for (const auto& arg_name : in_arg_names) { + const auto it = attr_dict.find(arg_name); + if (it == attr_dict.end() || !it->second.count("__storage_type__")) { + arg_stype_map[arg_name] = kDefaultStorage; + } + } + } else { // use user input type_dict + // create stype map for in_args and aux_states + arg_stype_map.reserve(num_provided_arg_stypes); + for (mx_uint i = 0; i < num_provided_arg_stypes; ++i) { + arg_stype_map[provided_arg_stype_names[i]] = provided_arg_stypes[i]; + } + } + + // create default ctx + Context ctx = Context::Create(static_cast(dev_type), dev_id); + // create ctx map + std::map ctx_map; + std::vector in_arg_ctx_vec(in_arg_names.size(), ctx); + std::vector aux_state_ctx_vec(aux_state_names.size(), ctx); + if (nullptr != g2c_keys) { // use user input group2ctx dict + for (mx_uint i = 0; i < num_g2c_keys; ++i) { + ctx_map[g2c_keys[i]] = Context::Create( + static_cast(g2c_dev_types[i]), g2c_dev_ids[i]); + } + + // initialize in_arg_ctx_vec using group2ctx if there are any + for (size_t i = 0; i < in_arg_ctx_vec.size(); ++i) { + const auto it1 = attr_dict.find(in_arg_names[i]); + if (it1 != attr_dict.end()) { + const auto it2 = it1->second.find("__ctx_group__"); + if (it2 != it1->second.end()) { + const auto it3 = ctx_map.find(it2->second); + if (it3 != ctx_map.end()) { + in_arg_ctx_vec[i] = it3->second; + } + } + } + } + + // initialize aux_state_ctx_vec using group2ctx if there are any + for (size_t i = 0; i < aux_state_ctx_vec.size(); ++i) { + const auto it1 = attr_dict.find(aux_state_names[i]); + if (it1 != attr_dict.end()) { + const auto it2 = it1->second.find("__ctx_group__"); + if (it2 != it1->second.end()) { + const auto it3 = ctx_map.find(it2->second); + if (it3 != ctx_map.end()) { + aux_state_ctx_vec[i] = it3->second; + } + } + } + } + } + + // create provided_grad_req_map + const std::map req_map = + {{"null", kNullOp}, {"write", kWriteTo}, {"add", kAddTo}}; + std::unordered_map provided_grad_req_map; + std::string grad_req_type; + if (0 == provided_grad_req_list_len + && nullptr == provided_grad_req_names + && nullptr != 
provided_grad_req_types) { // string, grad_req='write' + CHECK_EQ(req_map.count(provided_grad_req_types[0]), 1U) + << "grad_req=" << provided_grad_req_types[0] << " is not a valid input in simple_bind; " + "only \'null\', \'write\', and \'add\' are supported"; + grad_req_type = "string"; + } else if (provided_grad_req_list_len > 0 + && nullptr == provided_grad_req_names + && nullptr != provided_grad_req_types) { // list, grad_req=['null', 'write'] + grad_req_type = "list"; + CHECK_EQ(provided_grad_req_list_len, in_arg_names.size()) + << "The length of grad_req list does not match the number of input arguments in simple_bind, " + "expected " << in_arg_names.size() << ", provided " << provided_grad_req_list_len; + } else if (provided_grad_req_list_len > 0 + && nullptr != provided_grad_req_names + && nullptr != provided_grad_req_types) { // dict, grad_req=['lhs': 'null', 'rhs': 'write'] + grad_req_type = "dict"; + provided_grad_req_map.reserve(provided_grad_req_list_len); + for (mx_uint i = 0; i < provided_grad_req_list_len; ++i) { + CHECK_EQ(req_map.count(provided_grad_req_types[i]), 1U) + << "grad_req=" << provided_grad_req_types[i] << " is not a valid input in simple_bind; " + "only \'null\', \'write\', and \'add\' are supported"; + provided_grad_req_map[provided_grad_req_names[i]] = provided_grad_req_types[i]; + } + } else { // grad_req is None + grad_req_type = "none"; + } + + // initialize arg_grad_ctx_vec and grad_req_type_vec + std::vector arg_grad_ctx_vec(in_arg_names.size(), ctx); + std::vector grad_req_type_vec(in_arg_names.size(), kNullOp); + if ("none" != grad_req_type) { + for (size_t i = 0; i < in_arg_names.size(); ++i) { + OpReqType cur_req = kNullOp; + if ("string" == grad_req_type) { + cur_req = req_map.at(provided_grad_req_types[0]); + } else if ("list" == grad_req_type) { + CHECK_EQ(req_map.count(provided_grad_req_types[i]), 1U) + << "grad_req=" << provided_grad_req_types[i] << " is not a valid input in simple_bind; " + "only \'null\', \'write\', and \'add\' are supported"; + cur_req = req_map.at(provided_grad_req_types[i]); + } else if ("dict" == grad_req_type) { + const auto it = provided_grad_req_map.find(in_arg_names[i]); + if (it != provided_grad_req_map.end()) { + cur_req = req_map.at(it->second); + } + } + if (kNullOp != cur_req) { + arg_grad_ctx_vec[i] = in_arg_ctx_vec[i]; + grad_req_type_vec[i] = static_cast(cur_req); + } + } + } + + // create shape map for in_args and aux_states + std::unordered_map arg_shape_map(num_provided_arg_shapes); + for (mx_uint i = 0; i < num_provided_arg_shapes; ++i) { + auto p = arg_shape_map.emplace(provided_arg_shape_names[i], + mxnet::TShape(provided_arg_shape_data+provided_arg_shape_idx[i], + provided_arg_shape_data+provided_arg_shape_idx[i+1])); + CHECK(p.second) << "Duplicate shapes are provided for argument " + << provided_arg_shape_names[i] << " in simple_bind"; + } + if (!Imperative::Get()->is_np_comp()) { + for (auto &kv : arg_shape_map) { + common::ConvertToNumpyShape(&kv.second); + } + } // create para name set for sharing data array memory std::unordered_set shared_arg_name_set(num_shared_arg_names); @@ -628,6 +995,97 @@ int MXExecutorReshape(int partial_shaping, API_END_HANDLE_ERROR(delete new_exec); } +int MXExecutorReshapeEx(int partial_shaping, + int allow_up_sizing, + int dev_type, + int dev_id, + mx_uint num_map_keys, + const char** map_keys, + const int* map_dev_types, + const int* map_dev_ids, + const mx_uint num_provided_arg_shapes, + const char** provided_arg_shape_names, + const int* provided_arg_shape_data, + 
const mx_uint* provided_arg_shape_idx, + mx_uint* num_in_args, + NDArrayHandle** in_args, + NDArrayHandle** arg_grads, + mx_uint* num_aux_states, + NDArrayHandle** aux_states, + ExecutorHandle shared_exec, + ExecutorHandle *out) { + Executor* new_exec = nullptr; + + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + API_BEGIN(); + *out = nullptr; // ensure we can know whether to free executor on early abort + // create shape map for in_args and aux_states + std::unordered_map kwargs(num_provided_arg_shapes); + for (mx_uint i = 0; i < num_provided_arg_shapes; ++i) { + auto p = kwargs.emplace(provided_arg_shape_names[i], + mxnet::TShape(provided_arg_shape_data+provided_arg_shape_idx[i], + provided_arg_shape_data+provided_arg_shape_idx[i+1])); + CHECK(p.second) << "Duplicate shapes are provided for argument " + << provided_arg_shape_names[i] << " in reshape of executor"; + } + + Context ctx = Context::Create(static_cast(dev_type), dev_id); + std::map ctx_map; + for (mx_uint i = 0; i < num_map_keys; ++i) { + ctx_map[std::string(map_keys[i])] = Context::Create( + static_cast(map_dev_types[i]), map_dev_ids[i]); + } + std::vector in_arg_vec; + std::vector arg_grad_vec; + std::vector aux_state_vec; + + Executor* exec = static_cast(shared_exec); + new_exec = exec->Reshape(partial_shaping, allow_up_sizing, ctx, ctx_map, kwargs, + &in_arg_vec, &arg_grad_vec, &aux_state_vec); + *out = new_exec; + + ret->ret_handles.clear(); + ret->ret_handles.reserve(in_arg_vec.size()+arg_grad_vec.size()+aux_state_vec.size()); + + size_t nd_idx = 0; + for (const auto& nd : in_arg_vec) { + if (nd.is_none()) { + LOG(FATAL) << "Input argument NDArray cannot be un-allocated"; + } + ret->ret_handles.push_back(new NDArray(nd)); + } + if (in_arg_vec.size() > 0) { + *num_in_args = in_arg_vec.size(); + *in_args = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + + for (const auto& nd : arg_grad_vec) { + if (nd.is_none()) { + ret->ret_handles.push_back(nullptr); + } else { + ret->ret_handles.push_back(new NDArray(nd)); + } + } + if (arg_grad_vec.size() > 0) { + *arg_grads = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + + for (const auto& nd : aux_state_vec) { + if (nd.is_none()) { + LOG(FATAL) << "Auxiliary argument NDArray cannot be un-allocated"; + } + ret->ret_handles.push_back(new NDArray(nd)); + } + if (aux_state_vec.size() > 0) { + *num_aux_states = aux_state_vec.size(); + *aux_states = &(ret->ret_handles[nd_idx]); + nd_idx = ret->ret_handles.size(); + } + API_END_HANDLE_ERROR(delete new_exec); +} + int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, SymbolHandle *out) { auto s = new nnvm::Symbol(); diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 18f6c411e039..0e136b03ecd7 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -276,6 +276,18 @@ int MXAutogradSetIsRecording(int is_recording, int* prev) { API_END(); } +int MXIsNumpyCompatible(bool* curr) { + API_BEGIN(); + *curr = Imperative::Get()->is_np_comp(); + API_END(); +} + +int MXSetIsNumpyCompatible(int is_np_comp, int* prev) { + API_BEGIN(); + *prev = Imperative::Get()->set_is_np_comp(static_cast(is_np_comp)); + API_END(); +} + int MXAutogradMarkVariables(mx_uint num_var, NDArrayHandle *var_handles, mx_uint *reqs_array, diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 545e95f04b79..24a88520376f 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -24,6 +24,7 @@ */ #include "mxnet/base.h" #include 
"mxnet/c_api.h" +#include "mxnet/imperative.h" #include "nnvm/c_api.h" #include "nnvm/pass.h" #include "nnvm/pass_functions.h" @@ -543,8 +544,14 @@ int MXSymbolInferShape(SymbolHandle sym, throw dmlc::Error(err.msg); } + // if use legacy shape definition, need to convert numpy shape to legacy shape + mxnet::ShapeVector shapes = g.GetAttr("shape"); + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToLegacyShape(&shapes); + } + // copy back - CopyAttr(g.indexed_graph(), g.GetAttr("shape"), + CopyAttr(g.indexed_graph(), shapes, &(ret->arg_shapes), &(ret->out_shapes), &(ret->aux_shapes)); // copy data back @@ -568,6 +575,79 @@ int MXSymbolInferShape(SymbolHandle sym, API_END(); } +int MXSymbolInferShapeEx(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const int *arg_shape_data, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *complete) { + nnvm::Symbol *s = static_cast(sym); + MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); + API_BEGIN(); + nnvm::Graph g = Symbol2Graph(*s); + mxnet::ShapeVector arg_shapes(g.indexed_graph().input_nodes().size(), mxnet::TShape()); + if (keys == nullptr && num_args != 0) { + std::vector read_only_args = mxnet::ReadOnlyArgIndices(g.indexed_graph()); + CHECK_LE(num_args, read_only_args.size()); + for (mx_uint i = 0; i < num_args; ++i) { + arg_shapes[read_only_args[i]] = mxnet::ShapeTypeCast( + arg_shape_data + arg_ind_ptr[i], arg_shape_data + arg_ind_ptr[i+1]); + } + } else { + std::unordered_map kwargs; + for (mx_uint i = 0; i < num_args; ++i) { + kwargs[keys[i]] = mxnet::ShapeTypeCast( + arg_shape_data + arg_ind_ptr[i], arg_shape_data + arg_ind_ptr[i+1]); + } + mxnet::MatchArguments(g.indexed_graph(), kwargs, &arg_shapes, "InferShape"); + } + + try { + g = mxnet::exec::InferShape(std::move(g), std::move(arg_shapes), "__shape__"); + } catch (const mxnet::op::InferShapeError &err) { + throw dmlc::Error(err.msg); + } + + // if use legacy shape definition, need to convert numpy shape to legacy shape + mxnet::ShapeVector shapes = g.GetAttr("shape"); + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToLegacyShape(&shapes); + } + + // copy back + CopyAttr(g.indexed_graph(), shapes, + &(ret->arg_shapes), &(ret->out_shapes), &(ret->aux_shapes)); + + // copy data back + MXAPIThreadLocalEntry::SetupShapeArrayReturnWithBufferEx(ret->arg_shapes, + &(ret->arg_shape_ndim_ex), &(ret->arg_shape_data_ex), &(ret->arg_shape_buffer_ex)); + MXAPIThreadLocalEntry::SetupShapeArrayReturnWithBufferEx(ret->out_shapes, + &(ret->out_shape_ndim_ex), &(ret->out_shape_data_ex), &(ret->out_shape_buffer_ex)); + MXAPIThreadLocalEntry::SetupShapeArrayReturnWithBufferEx(ret->aux_shapes, + &(ret->aux_shape_ndim_ex), &(ret->aux_shape_data_ex), &(ret->aux_shape_buffer_ex)); + *in_shape_size = static_cast(ret->arg_shapes.size()); + *in_shape_ndim = dmlc::BeginPtr(ret->arg_shape_ndim_ex); + *in_shape_data = dmlc::BeginPtr(ret->arg_shape_data_ex); + *out_shape_size = static_cast(ret->out_shapes.size()); + *out_shape_ndim = dmlc::BeginPtr(ret->out_shape_ndim_ex); + *out_shape_data = dmlc::BeginPtr(ret->out_shape_data_ex); + *aux_shape_size = static_cast(ret->aux_shapes.size()); + *aux_shape_ndim = dmlc::BeginPtr(ret->aux_shape_ndim_ex); + *aux_shape_data = dmlc::BeginPtr(ret->aux_shape_data_ex); + // mark complete + 
*complete = (g.GetAttr("shape_num_unknown_nodes") == 0); + API_END(); +} + int MXSymbolInferShapePartial(SymbolHandle sym, mx_uint num_args, const char** keys, @@ -593,6 +673,31 @@ int MXSymbolInferShapePartial(SymbolHandle sym, &succ); } +int MXSymbolInferShapePartialEx(SymbolHandle sym, + mx_uint num_args, + const char** keys, + const mx_uint *arg_ind_ptr, + const int *arg_shape_data, + mx_uint *in_shape_size, + const int **in_shape_ndim, + const int ***in_shape_data, + mx_uint *out_shape_size, + const int **out_shape_ndim, + const int ***out_shape_data, + mx_uint *aux_shape_size, + const int **aux_shape_ndim, + const int ***aux_shape_data, + int *complete) { + int succ; + *complete = 1; + return MXSymbolInferShapeEx(sym, num_args, keys, + arg_ind_ptr, arg_shape_data, + in_shape_size, in_shape_ndim, in_shape_data, + out_shape_size, out_shape_ndim, out_shape_data, + aux_shape_size, aux_shape_ndim, aux_shape_data, + &succ); +} + int MXSymbolInferType(SymbolHandle sym, mx_uint num_args, const char** keys, diff --git a/src/c_api/c_predict_api.cc b/src/c_api/c_predict_api.cc index 3b9f43d86079..7de23ef935ef 100644 --- a/src/c_api/c_predict_api.cc +++ b/src/c_api/c_predict_api.cc @@ -436,6 +436,7 @@ int MXPredGetOutputShape(PredictorHandle handle, << "Index exceed number of outputs"; const mxnet::TShape& s = p->out_shapes[out_index]; + CHECK_GE(s.ndim(), 0); p->out_shapes_buffer.resize(s.ndim()); nnvm::ShapeTypeCast(s.begin(), s.end(), p->out_shapes_buffer.data()); *shape_data = p->out_shapes_buffer.data(); diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 279ecbd67f09..0551b429f17e 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -380,7 +380,7 @@ inline void HandleInferShapeError(const size_t num_forward_inputs, const uint32_t nid = idx.input_nodes().at(i); const uint32_t eid = idx.entry_id(nid, 0); const mxnet::TShape& inferred_shape = inferred_shapes[eid]; - if (inferred_shape.ndim() == 0 || inferred_shape.Size() == 0U) { + if (!shape_is_known(inferred_shape)) { const std::string& arg_name = idx[nid].source->attrs.name; oss << arg_name << ": " << inferred_shape << ", "; if (--cnt == 0) { @@ -390,7 +390,7 @@ inline void HandleInferShapeError(const size_t num_forward_inputs, } } LOG(FATAL) << "InferShape pass cannot decide shapes for the following arguments " - "(0s means unknown dimensions). Please consider providing them as inputs:\n" + "(-1 means unknown dimensions). 
Please consider providing them as inputs:\n" << oss.str(); } diff --git a/src/common/serialization.h b/src/common/serialization.h index 8192ee210a1c..c22d8bc82270 100644 --- a/src/common/serialization.h +++ b/src/common/serialization.h @@ -49,7 +49,7 @@ template inline size_t SerializedSize(const T &obj); template -inline size_t SerializedSize(const nnvm::Tuple &obj); +inline size_t SerializedSize(const mxnet::Tuple &obj); template inline size_t SerializedSize(const std::map &obj); @@ -64,7 +64,7 @@ template inline void Serialize(const T &obj, char **buffer); template -inline void Serialize(const nnvm::Tuple &obj, char **buffer); +inline void Serialize(const mxnet::Tuple &obj, char **buffer); template inline void Serialize(const std::map &obj, char **buffer); @@ -79,7 +79,7 @@ template inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos); template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos); +inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos); template inline void Deserialize(std::map *obj, const std::string &buffer, size_t *curr_pos); @@ -102,7 +102,7 @@ inline size_t SerializedSize(const T &obj) { } template -inline size_t SerializedSize(const nnvm::Tuple &obj) { +inline size_t SerializedSize(const mxnet::Tuple &obj) { if (is_container::value) { size_t sum_val = 4; for (const auto& el : obj) { @@ -180,7 +180,7 @@ inline void Serialize(const T &obj, char **buffer) { } template -inline void Serialize(const nnvm::Tuple &obj, char **buffer) { +inline void Serialize(const mxnet::Tuple &obj, char **buffer) { uint32_t size = obj.ndim(); std::memcpy(*buffer, &size, 4); *buffer += 4; @@ -244,7 +244,7 @@ inline void Deserialize(T *obj, const std::string &buffer, size_t *curr_pos) { } template -inline void Deserialize(nnvm::Tuple *obj, const std::string &buffer, size_t *curr_pos) { +inline void Deserialize(mxnet::Tuple *obj, const std::string &buffer, size_t *curr_pos) { uint32_t size = obj->ndim(); std::memcpy(&size, &buffer[*curr_pos], 4); *curr_pos += 4; diff --git a/src/common/utils.h b/src/common/utils.h index 8e6966952890..251a8fe3c190 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -654,7 +654,7 @@ FCompType GetFCompute(const nnvm::Op* op, const std::string& name, } else if (ctx.dev_mask() == gpu::kDevMask) { return fcompute_gpu.get(op, nullptr); } else { - LOG(FATAL) << "Unknown device mask"; + LOG(FATAL) << "Unknown device mask " << ctx.dev_mask(); return nullptr; } } @@ -734,6 +734,64 @@ inline void ParallelCopy(DType* dst, const DType* src, index_t size) { } } +/*! + * \brief If numpy compatibility is turned off (default), the shapes passed in + * by users follow the legacy shape definition: + * 1. 0 ndim means the shape is completely unknown. + * 2. 0 dim size means the dim size is unknown. + * We need to convert those shapes to use the numpy shape definition: + * 1. 0 ndim means it's a scalar tensor. + * 2. -1 ndim means the shape is unknown. + * 3. 0 dim size means no elements in that dimension. + * 4. -1 dim size means the dimension's size is unknown. + * so that operator's infer shape function can work in backend. + * \param shape to be converted. + * Note: It is possible that the shape to be converted is already + * numpy compatible. For example, when a subgraph operator's infer + * shape function is called from the infer shape pass of the whole + * graph, its input/output shapes have been converted to numpy + * compatible shapes. 
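 * A minimal worked example of the conversion (illustrative only; it exercises
 * exactly the rules listed above):
 *   mxnet::TShape s(2, 0);       // legacy (0, 0): both dim sizes unknown
 *   ConvertToNumpyShape(&s);     // numpy semantics: becomes (-1, -1)
 *   mxnet::TShape u(0, -1);      // legacy ndim == 0: whole shape unknown
 *   ConvertToNumpyShape(&u);     // becomes ndim == -1 (unknown shape)
 *   ConvertToLegacyShape(&u);    // round-trips back to legacy ndim == 0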
+ */ +inline void ConvertToNumpyShape(mxnet::TShape* shape) { + if (shape->ndim() == 0) { // legacy shape ndim = 0 means unknown + *shape = mxnet::TShape(); // unknown shape ndim = -1 + } else { + for (int j = 0; j < shape->ndim(); ++j) { + if ((*shape)[j] == 0) { // legacy shape dim_size = 0 means unknown + (*shape)[j] = -1; // unknown dim size = -1 + } + } + } +} + +inline void ConvertToNumpyShape(mxnet::ShapeVector* shapes) { + for (size_t i = 0; i < shapes->size(); ++i) { + ConvertToNumpyShape(&(shapes->at(i))); + } +} + +/*! + * \brief This function is used to convert shapes returned by + * the infer shape functions/pass to the legacy shape definition. + */ +inline void ConvertToLegacyShape(mxnet::TShape* shape) { + if (!mxnet::ndim_is_known(*shape)) { + *shape = mxnet::TShape(0, -1); + } else { + for (int j = 0; j < shape->ndim(); ++j) { + if (!mxnet::dim_size_is_known(*shape, j)) { + (*shape)[j] = 0; + } + } + } +} + +inline void ConvertToLegacyShape(mxnet::ShapeVector* shapes) { + for (size_t i = 0; i < shapes->size(); ++i) { + ConvertToLegacyShape(&(shapes->at(i))); + } +} + } // namespace common } // namespace mxnet #endif // MXNET_COMMON_UTILS_H_ diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index db4491981bdd..93853c459298 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -212,6 +212,9 @@ class NaiveEngine final : public Engine { void WaitForAll() override { } + void Throw(VarHandle var) override { + } + void NotifyShutdown() override { shutdown_phase_.store(true); } diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index 986b6ad29909..38311908bdcd 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -498,6 +498,11 @@ inline void ThreadedEngine::ThrowException(ThreadedVar* threaded_var) { return; } +void ThreadedEngine::Throw(VarHandle var) { + ThreadedVar *threaded_var = ThreadedVar::CastFromBase(var); + ThrowException(threaded_var); +} + void ThreadedEngine::OnCompleteStatic(Engine *engine, void *opr_block_, const dmlc::Error* error) { OprBlock *opr_block = static_cast(opr_block_); diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index 3d2119d63291..7df232b1c62a 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -306,6 +306,7 @@ class ThreadedEngine : public Engine { void DeleteVariable(SyncFn delete_fn, Context exec_ctx, VarHandle var) override; void WaitForVar(VarHandle var) override; void WaitForAll() override; + void Throw(VarHandle var) override; void NotifyShutdown() override { shutdown_phase_.store(true); } @@ -374,8 +375,8 @@ class ThreadedEngine : public Engine { LOG(INFO) << "ExecuteOprFn "; } try { - if (!(threaded_opr->opr_exception && *threaded_opr->opr_exception) || - threaded_opr->wait) { + if ((!(threaded_opr->opr_exception && *threaded_opr->opr_exception) || + threaded_opr->prop == FnProperty::kNoSkip) || threaded_opr->wait) { threaded_opr->fn(run_ctx, callback); } else { callback(); diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index b04d132ee9f6..8f47bc29db13 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -261,7 +261,7 @@ class FComputeExExecutor : public OpExecutor { ExecType exec_type_; }; -void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { +void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, size_t i) { using nnvm::DTypeVector; using mxnet::ShapeVector; using
nnvm::FMutateInputs; @@ -302,6 +302,10 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i) { OpStatePtr state = fcreate_op_state[op]( inode.source->attrs, vctx[i], ishape, itype); + if (p_state) { + CHECK_GT(p_state->size(), i); + p_state->at(i) = state; + } FStatefulComputeEx fcompute_ex = common::GetFCompute( op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx @@ -359,7 +363,7 @@ Graph AttachOpExecs(Graph g) { const auto& idx = g.indexed_graph(); OpExecVector ret(idx.num_nodes()); for (size_t i = 0; i < idx.num_nodes(); ++i) { - CreateOpExecs(g, &ret, i); + CreateOpExecs(g, &ret, nullptr, i); } g.attrs["op_execs"] = std::make_shared(ret); return g; diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index dd4132301346..7e5130f4921c 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -98,6 +98,12 @@ class OpExecutor { */ using OpExecVector = std::vector >; +/*! + * \brief per node vector of operator states. + * \note stored under attribute "op_states" + */ +using OpStateVector = std::vector; + /*! * \brief per node context vector * \node stored under "context" @@ -115,9 +121,10 @@ using DevMaskVector = std::vector; * * \param g input graph * \param p_ret OpExecVector for input and output + * \param p_state OpStateVector if it has. * \param i the id of the node */ -void CreateOpExecs(const Graph& g, OpExecVector* p_ret, size_t i); +void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, size_t i); /*! * \brief Attach OpExecutor to the graph attributes. * diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 460cec371bd4..4a4505581920 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -34,6 +34,7 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "../operator/subgraph/subgraph_property.h" +#include "../operator/operator_common.h" namespace mxnet { namespace exec { @@ -966,7 +967,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { uint32_t oid = head_grad_map_.at(idx[nid].source); uint32_t eid = idx.entry_id(idx.outputs()[oid]); NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; - CHECK_NE(vshape[eid].ndim(), 0U); + CHECK(mxnet::shape_is_known(vshape[eid])); CHECK_NE(vdtype[eid], -1); auto data_eid = idx.entry_id(nid, 0); // initialize based on storage_type diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 6a7fde62c2cf..fa7aee518486 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -24,6 +24,7 @@ #include #include +#include #include "./exec_pass.h" #include "../operator/operator_common.h" #include "../common/exec_utils.h" @@ -467,6 +468,12 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, std::vector ishape, oshape; // whether a shape is dynamic std::vector is_dynamic(rshape.size(), 0); + + // convert to numpy compatible shape to use operator's infer shape function + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToNumpyShape(&rshape); + } + // inference step function for nid auto infer_step = [&](uint32_t nid, bool last_iter) { const auto& inode = idx[nid]; @@ -483,6 +490,9 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, if (it != inode.source->attrs.dict.end()) { std::istringstream is(it->second); CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; + if (!Imperative::Get()->is_np_comp()) { + 
common::ConvertToNumpyShape(&rshape[out_ent_id]); + } } } // assign a default value to node attribute @@ -546,7 +556,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, bool is_input_dynamic_shape = false; for (uint32_t i = 0; i < ishape.size(); ++i) { ishape[i] = rshape[idx.entry_id(inode.inputs[i])]; - if (ishape[i].ndim() == 0 && is_dynamic[idx.entry_id(inode.inputs[i])]) { + if (!mxnet::ndim_is_known(ishape[i]) && is_dynamic[idx.entry_id(inode.inputs[i])]) { is_input_dynamic_shape = true; } if (fis_none(ishape[i])) forward_known = false; @@ -563,7 +573,7 @@ nnvm::Graph InferShapeAttr(nnvm::Graph &&ret, auto finfer = finfer_shape.get(inode.source->op(), fdefault); if (finfer == nullptr || is_input_dynamic_shape) { for (uint32_t i = 0; i < oshape.size(); ++i) { - if (oshape[i].ndim() == 0) { + if (!mxnet::ndim_is_known(oshape[i].ndim())) { is_dynamic[idx.entry_id(nid, i)] = 1; } } @@ -648,14 +658,14 @@ nnvm::Graph InferShape(nnvm::Graph&& graph, std::move(graph), mxnet::TShape(), "FInferShape", "shape_inputs", "shape_attr_key", "shape", "shape_num_unknown_nodes", - [](const mxnet::TShape& s) { return s.ndim() == 0 || s.Size() == 0; }, + [](const mxnet::TShape& s) { return !mxnet::shape_is_known(s); }, [](const mxnet::TShape& s) { - if (s.ndim() == 0) { // TODO(reminisce): Usage of ndim + if (!mxnet::ndim_is_known(s)) { return static_cast(1); } size_t ret = 0; for (const auto& val : s) { - if (val == 0) { + if (!mxnet::dim_size_is_known(val)) { ++ret; } } diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index c9215c5c8827..7a5ed21432d3 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -285,7 +285,7 @@ bool CachedOp::CheckDynamicShapeExists(const Context& default_ctx, CheckAndInferShape(&g, std::move(shape_inputs), true, {0, 0}, {0, 0}, &contain_dynamic_shape); - if (erase_result) { + if (contain_dynamic_shape && erase_result) { g.attrs.erase("shape"); g.attrs.erase("shape_inputs"); } @@ -603,7 +603,7 @@ void CachedOp::StaticInitExec( } } else { for (size_t i = start_nid; i < end_nid; ++i) { - exec::CreateOpExecs(g, &state.execs, i); + exec::CreateOpExecs(g, &state.execs, &state.op_states, i); } exec::AttachOpResources(g, state.execs, start_nid, end_nid); @@ -705,8 +705,10 @@ void CachedOp::StaticRunOps( arg_shapes.emplace_back(ndinput->shape()); arg_dtypes.emplace_back(ndinput->dtype()); } - state.op_states[i] = createop[node.source->op()]( - node.source->attrs, default_ctx, arg_shapes, arg_dtypes); + if (!state.op_states[i]) { + state.op_states[i] = + createop[node.source->op()](node.source->attrs, default_ctx, arg_shapes, arg_dtypes); + } Imperative::Get()->InvokeOp( default_ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, state.op_states[i]); diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h index b3192dc8281b..14b373edea57 100644 --- a/src/imperative/cached_op.h +++ b/src/imperative/cached_op.h @@ -36,8 +36,8 @@ struct CachedOpConfig : public dmlc::Parameter { bool static_alloc; bool static_shape; bool is_dynamic; - nnvm::Tuple data_indices; - nnvm::Tuple param_indices; + mxnet::Tuple data_indices; + mxnet::Tuple param_indices; std::string subgraph; DMLC_DECLARE_PARAMETER(CachedOpConfig) { DMLC_DECLARE_FIELD(static_alloc) @@ -59,10 +59,10 @@ struct CachedOpConfig : public dmlc::Parameter { .set_default(Imperative::BulkExecMaxNodeTrainBwd()) .describe("Segment size of bulk execution during backward pass."); DMLC_DECLARE_FIELD(data_indices) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) 
.describe("Position of argument variables."); DMLC_DECLARE_FIELD(param_indices) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("Position of parameters."); DMLC_DECLARE_FIELD(subgraph) .set_default(std::string("")) diff --git a/src/imperative/imperative.cc b/src/imperative/imperative.cc index 3e5b3987522c..b027de0a0f6f 100644 --- a/src/imperative/imperative.cc +++ b/src/imperative/imperative.cc @@ -25,9 +25,11 @@ namespace mxnet { #if DMLC_CXX11_THREAD_LOCAL thread_local bool Imperative::is_train_ = false; thread_local bool Imperative::is_recording_ = false; +thread_local bool Imperative::is_np_comp_ = false; #else MX_THREAD_LOCAL bool Imperative::is_train_ = false; MX_THREAD_LOCAL bool Imperative::is_recording_ = false; +MX_THREAD_LOCAL bool Imperative::is_np_comp_ = false; #endif Imperative* Imperative::Get() { @@ -109,7 +111,7 @@ OpStatePtr Imperative::Invoke( OpStatePtr ret = InvokeOp(ctx, attrs, inputs, outputs, req, dispatch_mode); // the followinng loop is used for finding out the correct shape when some shapes are dynamic for (size_t i = 0; i < outputs.size(); i++) { - if (outputs[i]->shape().ndim() == 0) { + if (!shape_is_known(outputs[i]->shape())) { // the WaitToRead overhead here does not seem to be avoidable outputs[i]->WaitToRead(); outputs[i]->SetShapeFromChunk(); diff --git a/src/imperative/imperative_utils.cc b/src/imperative/imperative_utils.cc index 6cb4a70324b5..568d39fc8043 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -19,63 +19,63 @@ #include "./imperative_utils.h" #include "./cached_op.h" +#include "../operator/operator_common.h" -namespace mxnet { -namespace imperative { +namespace { -inline std::vector NodeInputs(const nnvm::IndexedGraph& idx, - const int node_idx, - const std::vector arrays) { +std::vector NodeInputs(const nnvm::IndexedGraph& idx, + const int node_idx, + const std::vector& arrays) { const nnvm::IndexedGraph::Node& node = idx[node_idx]; const size_t num_inputs = node.inputs.size(); std::vector ndinputs; ndinputs.reserve(num_inputs); for (const auto& j : node.inputs) { - size_t eid = idx.entry_id(j); + const size_t eid = idx.entry_id(j); ndinputs.emplace_back(arrays[eid]); } return ndinputs; } -inline std::vector NodeOutputs(const nnvm::IndexedGraph& idx, - const int node_idx, - const std::vector arrays) { +std::vector NodeOutputs(const nnvm::IndexedGraph& idx, + const int node_idx, + const std::vector& arrays) { const nnvm::IndexedGraph::Node& node = idx[node_idx]; const size_t num_outputs = node.source->num_outputs(); std::vector ndoutputs; ndoutputs.reserve(num_outputs); for (size_t j = 0; j < num_outputs; ++j) { - size_t eid = idx.entry_id(node_idx, j); + const size_t eid = idx.entry_id(node_idx, j); ndoutputs.emplace_back(arrays[eid]); } return ndoutputs; } -inline std::vector NodeReq(const nnvm::IndexedGraph& idx, - const int node_idx, - const std::vector array_reqs) { +std::vector NodeReq(const nnvm::IndexedGraph& idx, + const int node_idx, + const std::vector& array_reqs) { const nnvm::IndexedGraph::Node& node = idx[node_idx]; const size_t num_outputs = node.source->num_outputs(); std::vector req; req.reserve(num_outputs); for (size_t j = 0; j < num_outputs; ++j) { - size_t eid = idx.entry_id(node_idx, j); + const size_t eid = idx.entry_id(node_idx, j); req.push_back(array_reqs[eid]); } return req; } -inline void InvokeOperator(const nnvm::IndexedGraph& idx, - const int node_idx, - const bool retain_graph, - const std::vector arrays, - Context ctx, - std::vector* p_states, 
- std::vector ndinputs, - std::vector ndoutputs, - std::vector *p_req, - std::vector *p_ref_count, - std::function invoke) { +void InvokeOperator(const nnvm::IndexedGraph& idx, + const int node_idx, + const bool retain_graph, + const std::vector& arrays, + Context ctx, + std::vector* p_states, + const std::vector& ndinputs, + const std::vector& ndoutputs, + std::vector *p_req, + std::vector *p_ref_count, + std::function invoke) { static const auto bwd_cached_op = Op::Get("_backward_CachedOp"); static auto& createop = nnvm::Op::GetAttr("FCreateOpState"); static auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); @@ -122,10 +122,15 @@ inline void InvokeOperator(const nnvm::IndexedGraph& idx, } } +} // namespace + +namespace mxnet { +namespace imperative { + void RunGraph( const bool retain_graph, const nnvm::IndexedGraph& idx, - const std::vector arrays, + const std::vector& arrays, size_t node_start, size_t node_end, std::vector&& array_reqs, std::vector&& ref_count, @@ -161,7 +166,7 @@ void NaiveRunGraph( const bool retain_graph, const Context& default_ctx, const nnvm::IndexedGraph& idx, - const std::vector arrays, + const std::vector& arrays, size_t node_start, size_t node_end, std::vector&& array_reqs, std::vector&& ref_count, @@ -186,7 +191,7 @@ void NaiveRunGraph( Imperative::Get()->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, req, dispatch_mode, state); for (size_t j = 0; j < ndoutputs.size(); ++j) { - if (ndoutputs[j]->shape().ndim() == 0) { + if (mxnet::op::shape_is_none(ndoutputs[j]->shape())) { ndoutputs[j]->WaitToRead(); ndoutputs[j]->SetShapeFromChunk(); } diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 071f4fa9dd0b..5c9706834b2d 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -31,6 +31,7 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "../operator/nn/mkldnn/mkldnn_base-inl.h" +#include "../operator/operator_common.h" #ifndef MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_ #define MXNET_IMPERATIVE_IMPERATIVE_UTILS_H_ @@ -121,7 +122,28 @@ inline void SetShapeType(const Context& ctx, if (!infershape.count(attrs.op)) { is_dynamic_shape_existing = true; } else { - CHECK(infershape[attrs.op](attrs, &in_shapes, &out_shapes)); + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToNumpyShape(&in_shapes); + common::ConvertToNumpyShape(&out_shapes); + } + const bool success = infershape[attrs.op](attrs, &in_shapes, &out_shapes); + if (!success) { + std::stringstream os; + os << "Operator " << attrs.op->name << " inferring shapes failed.\n"; + os << "input shapes:\n"; + for (const auto& s : in_shapes) { + os << s << '\n'; + } + os << "output shapes:\n"; + for (const auto& s : out_shapes) { + os << s << '\n'; + } + os << "operator attributes:\n"; + for (const auto& kv : attrs.dict) { + os << kv.first << " : " << kv.second << '\n'; + } + LOG(FATAL) << os.str(); + } CHECK_EQ(out_shapes.size(), outputs.size()); } // infer type @@ -179,7 +201,7 @@ inline void SetShapeType(const Context& ctx, for (size_t i = 0; i < outputs.size(); ++i) { NDArrayStorageType storage_type = static_cast(out_storage_types[i]); - if (outputs[i]->is_none() || outputs[i]->shape().ndim() == 0) { + if (outputs[i]->is_none() || mxnet::op::shape_is_none(outputs[i]->shape())) { if (is_dynamic_shape_existing) { // once there is dynamic shape somewhere, we could not pre-determine the shape. 
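// An illustrative condensation of the dynamic-shape protocol used by
// Imperative::Invoke and NaiveRunGraph in this patch (a sketch, not patch code):
//   if (!shape_is_known(out->shape())) {  // the op could not infer this shape statically
//     out->WaitToRead();                  // unavoidable synchronization point
//     out->SetShapeFromChunk();           // adopt the shape produced at run time
//   }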
*outputs[i] = NDArray(ctx, out_types[i]); @@ -573,23 +595,21 @@ inline bool CheckAndInferShape(nnvm::Graph* p_g, mxnet::ShapeVector&& shapes, *contain_unknown = false; } nnvm::Graph& g = *p_g; - if (use_inputs) { - if (g.attrs.count("shape_inputs") && - g.GetAttr("shape_inputs") == shapes) return true; - } else if (g.attrs.count("shape")) { + if (g.attrs.count("shape")) { const auto& prev_shapes = g.GetAttr("shape"); - CHECK_EQ(prev_shapes.size(), shapes.size()); - bool match = true; - for (size_t i = 0; i < shapes.size(); ++i) { - if (i == entry_range.first) { - i = entry_range.second; - if (i >= shapes.size()) break; + if (prev_shapes.size() == shapes.size()) { + bool match = true; + for (size_t i = 0; i < shapes.size(); ++i) { + if (i == entry_range.first) { + i = entry_range.second; + if (i >= shapes.size()) break; + } + if (shapes[i] == prev_shapes[i]) continue; + match = false; + break; } - if (shapes[i] == prev_shapes[i]) continue; - match = false; - break; + if (match) return true; } - if (match) return true; } g.attrs.erase("shape"); g.attrs.erase("shape_inputs"); @@ -999,7 +1019,7 @@ inline void CreateEngineOpSeg( void RunGraph(const bool retain_graph, const nnvm::IndexedGraph& idx, - const std::vector arrays, + const std::vector& arrays, size_t node_start, size_t node_end, std::vector&& array_reqs, std::vector&& ref_count, @@ -1011,7 +1031,7 @@ void RunGraph(const bool retain_graph, void NaiveRunGraph(const bool retain_graph, const Context& default_ctx, const nnvm::IndexedGraph& idx, - const std::vector arrays, + const std::vector& arrays, size_t node_start, size_t node_end, std::vector&& array_reqs, std::vector&& ref_count, diff --git a/src/io/image_det_aug_default.cc b/src/io/image_det_aug_default.cc index 74e51b51603b..3bd37200b8e7 100644 --- a/src/io/image_det_aug_default.cc +++ b/src/io/image_det_aug_default.cc @@ -34,7 +34,7 @@ namespace mxnet { namespace io { -using nnvm::Tuple; +using mxnet::Tuple; namespace image_det_aug_default_enum { enum ImageDetAugDefaultCropEmitMode {kCenter, kOverlap}; @@ -462,7 +462,7 @@ class DefaultImageDetAugmenter : public ImageAugmenter { /*! \brief Check number of crop samplers and given parameters */ template - void ValidateCropParameters(nnvm::Tuple *param, const int num_sampler) { + void ValidateCropParameters(mxnet::Tuple *param, const int num_sampler) { if (num_sampler == 1) { CHECK_EQ(param->ndim(), 1); } else if (num_sampler > 1) { diff --git a/src/io/image_io.cc b/src/io/image_io.cc index 2196983928bb..c0357998f31c 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc @@ -189,7 +189,7 @@ void Imdecode(const nnvm::NodeAttrs& attrs, size_t len = inputs[0].shape().Size(); CHECK(len > 0) << "Input cannot be an empty buffer"; - mxnet::TShape oshape(3); + mxnet::TShape oshape(3, 1); oshape[2] = param.flag == 0 ? 1 : 3; if (get_jpeg_size(str_img, len, &oshape[1], &oshape[0])) { } else if (get_png_size(str_img, len, &oshape[1], &oshape[0])) { @@ -229,7 +229,7 @@ void Imread(const nnvm::NodeAttrs& attrs, CHECK(file.good()) << "Failed reading image file: '" << param.filename << "' " << strerror(errno); - mxnet::TShape oshape(3); + mxnet::TShape oshape(3, 1); oshape[2] = param.flag == 0 ? 
1 : 3; if (get_jpeg_size(buff.get(), fsize, &oshape[1], &oshape[0])) { } else if (get_png_size(buff.get(), fsize, &oshape[1], &oshape[0])) { @@ -295,7 +295,7 @@ struct MakeBorderParam : public dmlc::Parameter { int top, bot, left, right; int type; double value; - nnvm::Tuple values; + mxnet::Tuple values; DMLC_DECLARE_PARAMETER(MakeBorderParam) { DMLC_DECLARE_FIELD(top) .describe("Top margin."); diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h index 69eb05f7d729..279690b594e6 100644 --- a/src/io/iter_batchloader.h +++ b/src/io/iter_batchloader.h @@ -78,7 +78,7 @@ class BatchLoader : public IIterator { // if overflow from previous round, directly return false, until before first is called if (num_overflow_ != 0) return false; - index_t top = 0; + size_t top = 0; while (base_->Next()) { const DataInst& d = base_->Value(); diff --git a/src/io/iter_sparse_batchloader.h b/src/io/iter_sparse_batchloader.h index 17c509a0f56b..c0d856df89ec 100644 --- a/src/io/iter_sparse_batchloader.h +++ b/src/io/iter_sparse_batchloader.h @@ -67,7 +67,7 @@ class SparseBatchLoader : public BatchLoader, public SparseIIterator this->head_ = 0; // if overflown from previous round, directly return false, until before first is called if (num_overflow_ != 0) return false; - index_t top = 0; + size_t top = 0; offsets_.clear(); while (sparse_base_->Next()) { const DataInst& inst = sparse_base_->Value(); diff --git a/src/kvstore/gradient_compression.cc b/src/kvstore/gradient_compression.cc index e4a06fa9a1f2..30aaec91e27f 100644 --- a/src/kvstore/gradient_compression.cc +++ b/src/kvstore/gradient_compression.cc @@ -100,9 +100,9 @@ int64_t GradientCompression::GetCompressedSize(const int64_t original_size) { void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *to, mxnet::NDArray *residual, const int priority) { - CHECK(from.shape().ndim() != 0) << "source operand has zero dimension shape"; - CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; - CHECK(residual->shape().ndim() != 0) << "residual operand has zero dimension shape"; + CHECK(shape_is_known(from.shape())) << "source operand has undefined shape"; + CHECK(shape_is_known(to->shape())) << "destination operand has undefined shape"; + CHECK(shape_is_known(residual->shape())) << "residual operand has undefined shape"; const int a = from.ctx().dev_mask(); const int b = to->ctx().dev_mask(); const float threshold = threshold_; @@ -137,8 +137,8 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray *to, const int priority) { - CHECK(from.shape().ndim() != 0) << "source operands has zero dimension shape"; - CHECK(to->shape().ndim() != 0) << "destination operand has zero dimension shape"; + CHECK(shape_is_known(from.shape())) << "source operand has undefined shape"; + CHECK(shape_is_known(to->shape())) << "destination operand has undefined shape"; const int a = from.ctx().dev_mask(); const int b = to->ctx().dev_mask(); const float threshold = threshold_; diff --git a/src/libinfo.cc b/src/libinfo.cc index 2af61eac9eca..f67b45ed1c14 100644 --- a/src/libinfo.cc +++ b/src/libinfo.cc @@ -86,7 +86,9 @@ class FeatureSet { // Misc feature_bits.set(CAFFE, MXNET_USE_CAFFE); feature_bits.set(DIST_KVSTORE, MXNET_USE_DIST_KVSTORE); + feature_bits.set(INT64_TENSOR_SIZE, MXNET_USE_INT64_TENSOR_SIZE); feature_bits.set(SIGNAL_HANDLER, MXNET_USE_SIGNAL_HANDLER); + #ifndef NDEBUG feature_bits.set(DEBUG); #endif @@ 
-154,6 +156,7 @@ const std::vector EnumNames::names = { "PROFILER", "DIST_KVSTORE", "CXX14", + "INT64_TENSOR_SIZE", "SIGNAL_HANDLER", "DEBUG", }; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 377bef072b03..0bfca8c10a1a 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -113,20 +113,22 @@ NDArray::Chunk::~Chunk() { // We want to delete mkldnn memory after deleting the variable. mem.mem = this->mkl_mem_; #endif - Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { - if (skip_free == false) { + if (auto engine = engine_ref_.lock()) { + engine->DeleteVariable([mem, skip_free](RunContext s) { + if (skip_free == false) { #if MXNET_USE_MKLDNN == 1 - if (mem.mem) { - CHECK_LE(mem.mem->GetSize(), mem.h.size); - CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr); - } + if (mem.mem) { + CHECK_LE(mem.mem->GetSize(), mem.h.size); + CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr); + } #endif - Storage::Get()->Free(mem.h); - for (const auto& aux : mem.aux_h) { - Storage::Get()->Free(aux); + Storage::Get()->Free(mem.h); + for (const auto &aux : mem.aux_h) { + Storage::Get()->Free(aux); + } } - } - }, shandle.ctx, var); + }, shandle.ctx, var); + } } void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) { @@ -337,8 +339,8 @@ NDArray NDArray::data_ndarray() const { } struct NDArrayDLManager { - NDArray handle; // ref NDArray - DLManagedTensor tensor; + NDArray handle; // ref NDArray + DLManagedTensor tensor; }; DLManagedTensor* NDArray::ToDLPack() const { @@ -354,13 +356,13 @@ DLManagedTensor* NDArray::ToDLPack() const { } NDArray NDArray::FromDLPack(const DLManagedTensor* tensor) { - const DLTensor &dl_tensor = tensor->dl_tensor; - auto deleter = [tensor](){ - if (tensor->deleter != nullptr) { - tensor->deleter(const_cast(tensor)); + DLManagedTensor tensor_copy = *tensor; + auto deleter = [tensor_copy](){ + if (tensor_copy.deleter != nullptr) { + tensor_copy.deleter(const_cast(&tensor_copy)); } }; - return NDArray(TBlob(dl_tensor), dl_tensor.ctx.device_id, deleter); + return NDArray(TBlob(tensor_copy.dl_tensor), tensor_copy.dl_tensor.ctx.device_id, deleter); } bool NDArray::fresh_out_grad() const { @@ -549,7 +551,7 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder( // If they have different shapes, we need to reshape the array first. // Since this method will only be used inside an operator, we can call // MKLDNNDataReshape to reshape an array. 
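// An illustrative reading of the NDArray::Chunk::~Chunk() change above (a sketch,
// not patch code): the chunk holds a weak reference to the engine and schedules
// the asynchronous free only while the engine is still alive,
//   if (auto engine = engine_ref_.lock()) {            // weak_ptr-style guard
//     engine->DeleteVariable(free_fn, shandle.ctx, var);
//   }                                                  // else: shutdown in progress, skip
// so NDArrays that outlive the engine no longer free storage through a dangling pointer.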
- mxnet::TShape required_shape(desc2.data.ndims); + mxnet::TShape required_shape(desc2.data.ndims, -1); for (int i = 0; i < desc2.data.ndims; i++) required_shape[i] = desc2.data.dims[i]; NDArray reshaped = MKLDNNDataReshape(required_shape); @@ -575,7 +577,7 @@ NDArray NDArray::Reorder2Default() const { // create new ndarray from mkldnn layout mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetPrimitiveDesc().desc(); - mxnet::TShape tshape(from_desc.data.ndims); + mxnet::TShape tshape(from_desc.data.ndims, -1); for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i]; NDArray ret(tshape, ctx(), false, dtype()); mkldnn::memory::primitive_desc def_pd = ptr_->mkl_mem_->GetPrimitiveDesc(format); @@ -1191,8 +1193,8 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op CHECK(from.shape() == to.shape()) << "operands shape mismatch" << "from.shape = " << from.shape() << " to.shape=" << to.shape(); - CHECK(from.shape().ndim() != 0) - << "source operands have zero dimension shape"; + CHECK(!mxnet::op::shape_is_none(from.shape())) + << "source operands have undefined shape"; // important: callback must always capture by value const Context from_ctx = from.ctx(); const int a = from_ctx.dev_mask(); @@ -1650,7 +1652,7 @@ bool LegacyTShapeLoad(dmlc::Stream *strm, mxnet::TShape *shape, const uint32_t m default: // meet legacy mxnet::TShape, magic is ndim here uint32_t ndim = magic; - *shape = mxnet::TShape(ndim); + *shape = mxnet::TShape(ndim, -1); std::vector buffer(ndim); size_t nread = ndim * sizeof(uint32_t); if (strm->Read(buffer.data(), nread) != nread) return false; @@ -1663,7 +1665,7 @@ bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) { // load shape mxnet::TShape shape; if (!LegacyTShapeLoad(strm, &shape, magic)) return false; - if (shape.ndim() == 0) { + if (mxnet::op::shape_is_none(shape)) { *this = NDArray(); return true; } // load context @@ -1711,7 +1713,10 @@ bool NDArray::Load(dmlc::Stream *strm) { // load shape mxnet::TShape shape; if (!shape.Load(strm)) return false; - if (shape.ndim() == 0) { + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToNumpyShape(&shape); + } + if (mxnet::op::shape_is_none(shape)) { *this = NDArray(); return true; } diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc index a613d5a3decc..8f72bc259afc 100644 --- a/src/ndarray/ndarray_function.cc +++ b/src/ndarray/ndarray_function.cc @@ -210,8 +210,6 @@ void ElementwiseSumContainsDnsImpl(mshadow::Stream* s, Kernel::Launch(s, out_data.Size(), out_data.dptr()); for (size_t i = 0; i < nds.size(); ++i) { const NDArray& nd = nds[i]; - const nnvm::dim_t num_rows = nd.shape()[0]; - const nnvm::dim_t num_cols = nd.shape()[1]; const TBlob& nd_data = nd.data(); if (i == 0) { @@ -234,6 +232,8 @@ void ElementwiseSumContainsDnsImpl(mshadow::Stream* s, case kCSRStorage: { const TBlob& nd_indices = nd.aux_data(csr::kIdx); const TBlob& nd_indptr = nd.aux_data(csr::kIndPtr); + const nnvm::dim_t num_rows = nd.shape()[0]; + const nnvm::dim_t num_cols = nd.shape()[1]; MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, { // indices type MSHADOW_IDX_TYPE_SWITCH(nd_indptr.type_flag_, CType, { // indptr type if (nd.storage_initialized()) { @@ -248,6 +248,8 @@ void ElementwiseSumContainsDnsImpl(mshadow::Stream* s, } case kRowSparseStorage: { const TBlob& nd_indices = nd.aux_data(rowsparse::kIdx); + const nnvm::dim_t num_rows = nd.shape()[0]; + const nnvm::dim_t num_cols = nd.shape()[1]; 
MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, { // indices type if (nd.storage_initialized()) { const nnvm::dim_t nz_rows = nd_indices.Size(); diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h index 70b626dbb9b7..505bd205a8d5 100644 --- a/src/ndarray/ndarray_function.h +++ b/src/ndarray/ndarray_function.h @@ -40,7 +40,7 @@ namespace ndarray { struct BinaryBase { inline static mxnet::TShape GetShape(const mxnet::TShape &lshape, const mxnet::TShape &rshape) { CHECK(lshape == rshape) << "operands shape mismatch"; - CHECK(lshape.ndim() != 0) << "source operand have zero dimension shape"; + CHECK(!mxnet::op::shape_is_none(lshape)) << "source operand has undefined shape"; return lshape; } }; diff --git a/src/nnvm/plan_memory.cc b/src/nnvm/plan_memory.cc index 2b18f990c845..41b8559d16c2 100644 --- a/src/nnvm/plan_memory.cc +++ b/src/nnvm/plan_memory.cc @@ -30,6 +30,7 @@ #include #include #include "graph_algorithm.h" +#include "../operator/operator_common.h" namespace nnvm { namespace pass { @@ -75,7 +76,7 @@ class GraphAllocator { // request a free storage StorageID Request(int dev_id, int dtype, mxnet::TShape shape, uint32_t node_id) { - if (shape.ndim() == 0) return kBadStorageID; + if (!mxnet::shape_is_known(shape)) return kBadStorageID; // search memory block in [size / match_range_, size * match_range_) // TODO(tqchen) add size of the dtype, assume 4 bytes for now size_t size = shape.Size() * 4; @@ -267,8 +268,7 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, // only request memory for kBadStorageID if (storage[eid] == GraphAllocator::kBadStorageID) { auto &eshape = shape_vec[eid]; - size_t esize = 0; - if (eshape.ndim() != 0) esize = eshape.Size(); + size_t esize = eshape.Size(); eids.insert(std::make_pair(esize, eid)); } } diff --git a/src/operator/batch_norm_v1-inl.h b/src/operator/batch_norm_v1-inl.h index f407a5cce61b..89412357ac67 100644 --- a/src/operator/batch_norm_v1-inl.h +++ b/src/operator/batch_norm_v1-inl.h @@ -261,7 +261,7 @@ class BatchNormV1Prop : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; in_shape->at(1) = mxnet::TShape(Shape1(dshape[1])); in_shape->at(2) = mxnet::TShape(Shape1(dshape[1])); out_shape->clear(); diff --git a/src/operator/bilinear_sampler-inl.h b/src/operator/bilinear_sampler-inl.h index 8b1ff38709b6..abb4a61dc84c 100644 --- a/src/operator/bilinear_sampler-inl.h +++ b/src/operator/bilinear_sampler-inl.h @@ -149,10 +149,10 @@ class BilinearSamplerProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 2U) << "Input:[data, grid]"; const mxnet::TShape &dshape = (*in_shape)[bs::kData]; const mxnet::TShape &lshape = (*in_shape)[bs::kGrid]; - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; CHECK_EQ(dshape.ndim(), 4U) \ << "input data should be 4D in batch-num_filter-y-x"; - if (lshape.ndim() == 0) return false; + if (!shape_is_known(lshape)) return false; CHECK_EQ(lshape.ndim(), 4U) \ << "Sampler grid should be 4D in batch-2-y-x"; CHECK_EQ(dshape[0], lshape[0]); diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h index 1afc13ad2594..43f689d2defa 100644 --- a/src/operator/channel_op_common.h +++ b/src/operator/channel_op_common.h @@ -45,6 +45,8 @@ inline void concatenate_helper(const std::vector(out, begin, end), req, input[i]); begin =
end; @@ -80,6 +82,8 @@ void split_helper(const mshadow::Tensor &input, size_t size = out.size(); index_t begin = 0; for (size_t i = 0; i < size; ++i) { + // If out[i] is a zero-size tensor, do nothing. + if (out[i].shape_.Size() == 0) continue; index_t end = begin + out[i].size(cdim); Assign(out[i], req[i], slice(input, begin, end)); begin = end; diff --git a/src/operator/contrib/adamw-inl.h b/src/operator/contrib/adamw-inl.h index 07feaefe87aa..6ae9e46b7def 100644 --- a/src/operator/contrib/adamw-inl.h +++ b/src/operator/contrib/adamw-inl.h @@ -87,8 +87,9 @@ inline bool MPUpdateInferShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), static_cast(total_in)) << " in operator " << attrs.name; CHECK_EQ(out_attrs->size(), static_cast(n_out)) << " in operator " << attrs.name; - // rescale_grad.shape = (1,) - SHAPE_ASSIGN_CHECK(*in_attrs, total_in - 1, mshadow::Shape1(1)); + // rescale_grad.shape = () + SHAPE_ASSIGN_CHECK(*in_attrs, total_in - 1, mxnet::TShape()); + // TODO(@reminisce): change "none" behavior in ElemwiseAttr return ElemwiseAttr( attrs, in_attrs, out_attrs, mxnet::TShape()); } diff --git a/src/operator/contrib/adaptive_avg_pooling-inl.h b/src/operator/contrib/adaptive_avg_pooling-inl.h index 0d66de0a5692..eedab78db0c5 100644 --- a/src/operator/contrib/adaptive_avg_pooling-inl.h +++ b/src/operator/contrib/adaptive_avg_pooling-inl.h @@ -48,9 +48,9 @@ namespace mxnet { namespace op { struct AdaptiveAvgPoolParam : public dmlc::Parameter { - mxnet::TShape output_size; + mxnet::Tuple output_size; DMLC_DECLARE_PARAMETER(AdaptiveAvgPoolParam) { - DMLC_DECLARE_FIELD(output_size).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(output_size).set_default(mxnet::Tuple()) .describe("int (output size) or a tuple of int for output (height, width)."); } }; @@ -125,7 +125,7 @@ static bool AdaptiveAvgPoolOpInferShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_shape->size(), 1U) << "Output:[data]"; const AdaptiveAvgPoolParam& param = nnvm::get(attrs.parsed); mxnet::TShape dshape(in_shape->at(0)); - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; if (param.output_size.ndim() == 0) { dshape[2] = 1; dshape[3] = 1; diff --git a/src/operator/contrib/bilinear_resize-inl.h b/src/operator/contrib/bilinear_resize-inl.h index 46c8e1aa7c0d..ce9c6c83504c 100644 --- a/src/operator/contrib/bilinear_resize-inl.h +++ b/src/operator/contrib/bilinear_resize-inl.h @@ -134,7 +134,7 @@ static bool BilinearSampleOpInferShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_shape->size(), 1U) << "Output:[data]"; const BilinearSampleParam& param = nnvm::get(attrs.parsed); mxnet::TShape dshape(in_shape->at(0)); - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; if (param.scale_height.has_value()) { dshape[2] = static_cast(param.scale_height.value() * in_shape->at(0)[2]); } else { diff --git a/src/operator/contrib/boolean_mask.cc b/src/operator/contrib/boolean_mask.cc index e22c493d5e2c..06d8439e23a0 100644 --- a/src/operator/contrib/boolean_mask.cc +++ b/src/operator/contrib/boolean_mask.cc @@ -121,7 +121,7 @@ inline void BooleanMaskForward(const nnvm::NodeAttrs& attrs, const NDArray &out = outputs[0]; CHECK_EQ(axis, 0) << "Not supported yet"; CHECK_EQ(data.shape()[axis], idx.shape()[0]); - CHECK_EQ(idx.shape().ndim(), 1U); + CHECK_EQ(idx.shape().ndim(), 1U); // idx is required to be 1-d. 
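// An illustrative host-side sketch of the counting below (not patch code; assumes
// <vector> and <numeric>):
//   std::vector<int32_t> mask = {1, 0, 1, 1};        // the 1-d index mask
//   std::vector<int32_t> prefix_sum(mask.size(), 0);
//   std::partial_sum(mask.begin(), mask.end(), prefix_sum.begin());
//   const int32_t valid_num = prefix_sum.back();     // 3 rows survive the mask
// Whenever mask[j] == 1, input row j becomes output row prefix_sum[j] - 1.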
// count the number of 1s in `idx`, so that we could know the output dimension size_t idx_size = idx.shape()[0]; std::vector prefix_sum(idx_size, 0); diff --git a/src/operator/contrib/bounding_box-inl.h b/src/operator/contrib/bounding_box-inl.h index 37c4297ff49d..686f1666a310 100644 --- a/src/operator/contrib/bounding_box-inl.h +++ b/src/operator/contrib/bounding_box-inl.h @@ -94,7 +94,8 @@ inline bool BoxNMSShape(const nnvm::NodeAttrs& attrs, const BoxNMSParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 2U); - if (in_attrs->at(0).ndim() == 0U && out_attrs->at(0).ndim() == 0U) { + if (mxnet::op::shape_is_none(in_attrs->at(0)) + && mxnet::op::shape_is_none(out_attrs->at(0))) { return false; } @@ -556,7 +557,7 @@ inline bool BoxOverlapShape(const nnvm::NodeAttrs& attrs, << rdim << " provided"; // assign output shape - mxnet::TShape oshape(lshape.ndim() + rshape.ndim() - 2); + mxnet::TShape oshape(lshape.ndim() + rshape.ndim() - 2, -1); int idx = 0; for (index_t i = 0; i < lshape.ndim() - 1; ++i) { oshape[idx++] = lshape[i]; @@ -565,7 +566,7 @@ inline bool BoxOverlapShape(const nnvm::NodeAttrs& attrs, oshape[idx++] = rshape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } struct compute_overlap { @@ -669,14 +670,14 @@ inline bool MatchingShape(const nnvm::NodeAttrs& attrs, << dshape.ndim() << " provided"; // assign output shape - mxnet::TShape oshape(dshape.ndim() - 1); + mxnet::TShape oshape(dshape.ndim() - 1, -1); for (index_t i = 0; i < dshape.ndim() - 1; ++i) { oshape[i] = dshape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); oshape[oshape.ndim() - 1] = dshape[dshape.ndim() - 1]; SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape); - return true; + return shape_is_known(oshape); } struct bipartite_matching { diff --git a/src/operator/contrib/count_sketch-inl.h b/src/operator/contrib/count_sketch-inl.h index f3a294f6ad46..3ea93e63d6fc 100644 --- a/src/operator/contrib/count_sketch-inl.h +++ b/src/operator/contrib/count_sketch-inl.h @@ -151,7 +151,7 @@ class CountSketchProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 3) <<"Input:[data, h, s]"; const mxnet::TShape &dshape = (*in_shape)[CountSketch::kData]; // require data to be known - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; out_shape->clear(); if (dshape.ndim() == 4) { diff --git a/src/operator/contrib/deformable_convolution-inl.h b/src/operator/contrib/deformable_convolution-inl.h index f50641fca6d6..000d703066d7 100644 --- a/src/operator/contrib/deformable_convolution-inl.h +++ b/src/operator/contrib/deformable_convolution-inl.h @@ -69,11 +69,11 @@ struct DeformableConvolutionParam : public dmlc::Parameter layout; DMLC_DECLARE_PARAMETER(DeformableConvolutionParam) { DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, -1)) .describe("Convolution stride: (h, w) or (d, h, w). Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, -1)) .describe("Convolution dilate: (h, w) or (d, h, w). Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, -1)) .describe("Zero pad for convolution: (h, w) or (d, h, w). 
Defaults to no padding."); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("Convolution filter(channel) number"); @@ -127,9 +127,9 @@ class DeformableConvolutionOp : public Operator { Tensor workspace = ctx.requested[conv::kTempSpace] .get_space_typed(Shape1(col_buffer_size_), s); // calculate the shape of col_buffer - mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1); + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_data[0].shape_[i + 1]; } // create a column buffer using workspace and col_buffer_shape @@ -189,7 +189,7 @@ class DeformableConvolutionOp : public Operator { Tensor workspace = ctx.requested[conv::kTempSpace] .get_space_typed(Shape1(col_buffer_size_), s); // calculate the shape of col_buffer - mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1); + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_grad[conv::kData].shape_[i + 1]; @@ -371,7 +371,7 @@ class DeformableConvolutionProp : public OperatorProperty { out_shape->resize(1, mxnet::TShape()); const mxnet::TShape &dshp = (*in_shape)[conv::kData]; const mxnet::TShape &oshp = (*in_shape)[conv::kOffset]; - if (dshp.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshp)) return false; if (param_.kernel.ndim() == 2) { // 2d conv CHECK_EQ(dshp.ndim(), 4U) \ diff --git a/src/operator/contrib/dgl_graph.cc b/src/operator/contrib/dgl_graph.cc index f19af84ce9c6..428899791a5d 100644 --- a/src/operator/contrib/dgl_graph.cc +++ b/src/operator/contrib/dgl_graph.cc @@ -259,34 +259,28 @@ static bool CSRNeighborUniformSampleShape(const nnvm::NodeAttrs& attrs, // Output bool success = true; - mxnet::TShape out_shape(1); + mxnet::TShape out_shape(1, -1); // We use the last element to store the actual // number of vertices in the subgraph. 
out_shape[0] = params.max_num_vertices + 1; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i, out_shape); - success = success && - out_attrs->at(i).ndim() != 0U && - out_attrs->at(i).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i)); } // sub_csr - mxnet::TShape out_csr_shape(2); + mxnet::TShape out_csr_shape(2, -1); out_csr_shape[0] = params.max_num_vertices; out_csr_shape[1] = in_attrs->at(0)[1]; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, out_csr_shape); - success = success && - out_attrs->at(i + num_subgraphs).ndim() != 0U && - out_attrs->at(i + num_subgraphs).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i + num_subgraphs)); } // sub_layer - mxnet::TShape out_layer_shape(1); + mxnet::TShape out_layer_shape(1, -1); out_layer_shape[0] = params.max_num_vertices; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, out_layer_shape); - success = success && - out_attrs->at(i + 2*num_subgraphs).ndim() != 0U && - out_attrs->at(i + 2*num_subgraphs).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i + 2 * num_subgraphs)); } return success; @@ -317,43 +311,35 @@ static bool CSRNeighborNonUniformSampleShape(const nnvm::NodeAttrs& attrs, // Output bool success = true; - mxnet::TShape out_shape(1); + mxnet::TShape out_shape(1, -1); // We use the last element to store the actual // number of vertices in the subgraph. out_shape[0] = params.max_num_vertices + 1; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i, out_shape); - success = success && - out_attrs->at(i).ndim() != 0U && - out_attrs->at(i).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i)); } // sub_csr - mxnet::TShape out_csr_shape(2); + mxnet::TShape out_csr_shape(2, -1); out_csr_shape[0] = params.max_num_vertices; out_csr_shape[1] = in_attrs->at(0)[1]; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i + num_subgraphs, out_csr_shape); - success = success && - out_attrs->at(i + num_subgraphs).ndim() != 0U && - out_attrs->at(i + num_subgraphs).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i + num_subgraphs)); } // sub_probability - mxnet::TShape out_prob_shape(1); + mxnet::TShape out_prob_shape(1, -1); out_prob_shape[0] = params.max_num_vertices; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i + 2*num_subgraphs, out_prob_shape); - success = success && - out_attrs->at(i + 2*num_subgraphs).ndim() != 0U && - out_attrs->at(i + 2*num_subgraphs).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i + 2 * num_subgraphs)); } // sub_layer - mxnet::TShape out_layer_shape(1); + mxnet::TShape out_layer_shape(1, -1); out_layer_shape[0] = params.max_num_vertices; for (size_t i = 0; i < num_subgraphs; i++) { SHAPE_ASSIGN_CHECK(*out_attrs, i + 3*num_subgraphs, out_prob_shape); - success = success && - out_attrs->at(i + 3*num_subgraphs).ndim() != 0U && - out_attrs->at(i + 3*num_subgraphs).Size() != 0U; + success = success && !mxnet::op::shape_is_none(out_attrs->at(i + 3 * num_subgraphs)); } return success; @@ -679,8 +665,8 @@ static void SampleSubgraph(const NDArray &csr, } } // Construct sub_csr_graph - mxnet::TShape shape_1(1); - mxnet::TShape shape_2(1); + mxnet::TShape shape_1(1, -1); + mxnet::TShape shape_2(1, -1); shape_1[0] = num_edges; shape_2[0] = max_num_vertices+1; 
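// An illustrative reading of the sampler output layout noted above (a sketch, not
// patch code): each vertex-id output reserves max_num_vertices + 1 slots and the
// tail slot carries the real count,
//   std::vector<int64_t> vertices(max_num_vertices + 1, -1);
//   // ids of the k sampled vertices occupy vertices[0..k-1]
//   vertices[max_num_vertices] = k;  // actual number of vertices in the subgraph
// so consumers read the count from the data rather than from the fixed shape.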
sub_csr.CheckAndAllocData(shape_1); @@ -960,13 +946,13 @@ static bool DGLSubgraphShape(const nnvm::NodeAttrs& attrs, size_t num_g = params.num_args - 1; for (size_t i = 0; i < num_g; i++) { - mxnet::TShape gshape(2); + mxnet::TShape gshape(2, -1); gshape[0] = in_attrs->at(i + 1)[0]; gshape[1] = in_attrs->at(i + 1)[0]; out_attrs->at(i) = gshape; } for (size_t i = num_g; i < out_attrs->size(); i++) { - mxnet::TShape gshape(2); + mxnet::TShape gshape(2, -1); gshape[0] = in_attrs->at(i - num_g + 1)[0]; gshape[1] = in_attrs->at(i - num_g + 1)[0]; out_attrs->at(i) = gshape; @@ -1081,9 +1067,9 @@ static void GetSubgraph(const NDArray &csr_arr, const NDArray &varr, row_idx[i + 1] = col_idx.size(); } - mxnet::TShape nz_shape(1); + mxnet::TShape nz_shape(1, -1); nz_shape[0] = col_idx.size(); - mxnet::TShape indptr_shape(1); + mxnet::TShape indptr_shape(1, -1); indptr_shape[0] = row_idx.size(); // Store the non-zeros in a subgraph with edge attributes of new edge ids. @@ -1199,7 +1185,7 @@ inline bool EdgeIDShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(1)); SHAPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0)); SHAPE_ASSIGN_CHECK(*in_attrs, 2, out_attrs->at(0)); - return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U; + return !mxnet::op::shape_is_none(out_attrs->at(0)); } inline bool EdgeIDType(const nnvm::NodeAttrs& attrs, @@ -1265,7 +1251,7 @@ void EdgeIDForwardCsrImpl(const OpContext& ctx, CHECK_EQ(req, kWriteTo) << "EdgeID with CSR only supports kWriteTo"; Stream *s = ctx.get_stream(); const NDArray& u = inputs[1]; - const nnvm::dim_t out_elems = u.shape().Size(); + const dim_t out_elems = u.shape().Size(); if (!inputs[0].storage_initialized()) { MSHADOW_TYPE_SWITCH(output.dtype(), DType, { Kernel, xpu>::Launch( @@ -1357,7 +1343,7 @@ inline bool DGLAdjacencyShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U; + return !mxnet::op::shape_is_none(out_attrs->at(0)); } inline bool DGLAdjacencyType(const nnvm::NodeAttrs& attrs, @@ -1422,7 +1408,7 @@ the data value of float32. 
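// The parameter tuples below migrate from nnvm::Tuple to mxnet::Tuple, in
// line with the rest of this change. A usage sketch, assuming mxnet::Tuple
// mirrors the nnvm API but with a signed ndim() that returns -1 when the
// tuple is unknown:
//   mxnet::Tuple<dim_t> locs({0, 2, 3});
//   for (int i = 0; i < locs.ndim(); ++i) {  // int, not size_t: ndim() is signed
//     dim_t loc = locs[i];
//   }
// Iterating with size_t would wrap around if ndim() were ever -1, which is
// why the loop counters throughout this change switch from size_t to int.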
struct SubgraphCompactParam : public dmlc::Parameter { int num_args; bool return_mapping; - nnvm::Tuple graph_sizes; + mxnet::Tuple graph_sizes; DMLC_DECLARE_PARAMETER(SubgraphCompactParam) { DMLC_DECLARE_FIELD(num_args).set_lower_bound(2) .describe("Number of input arguments."); @@ -1460,9 +1446,9 @@ static void CompactSubgraph(const NDArray &csr, const NDArray &vids, CHECK_NE(row_ids[i], -1); } - mxnet::TShape nz_shape(1); + mxnet::TShape nz_shape(1, -1); nz_shape[0] = num_elems; - mxnet::TShape indptr_shape(1); + mxnet::TShape indptr_shape(1, -1); CHECK_EQ(out_csr.shape()[0], graph_size); indptr_shape[0] = graph_size + 1; CHECK_GE(in_ptr_data.shape_[0], indptr_shape[0]); @@ -1540,7 +1526,7 @@ static bool SubgraphCompactShape(const nnvm::NodeAttrs& attrs, } for (size_t i = 0; i < num_g; i++) { - mxnet::TShape gshape(2); + mxnet::TShape gshape(2, -1); gshape[0] = params.graph_sizes[i]; gshape[1] = params.graph_sizes[i]; out_attrs->at(i) = gshape; diff --git a/src/operator/contrib/fft-inl.h b/src/operator/contrib/fft-inl.h index 247f6290c02a..a5471b4ba2e2 100644 --- a/src/operator/contrib/fft-inl.h +++ b/src/operator/contrib/fft-inl.h @@ -241,7 +241,7 @@ class FFTProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1) <<"Input:[data]"; const mxnet::TShape &dshape = (*in_shape)[fft::kData]; // require data to be known - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; out_shape->clear(); if (dshape.ndim() == 4) { diff --git a/src/operator/contrib/ifft-inl.h b/src/operator/contrib/ifft-inl.h index e53c0f60fa9e..7d8422e838b1 100644 --- a/src/operator/contrib/ifft-inl.h +++ b/src/operator/contrib/ifft-inl.h @@ -231,7 +231,7 @@ class IFFTProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1) <<"Input:[data]"; const mxnet::TShape &dshape = (*in_shape)[ifft::kData]; // require data to be known - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; out_shape->clear(); if (dshape.ndim() == 4) { diff --git a/src/operator/contrib/index_copy-inl.h b/src/operator/contrib/index_copy-inl.h index 903dee13272b..9f78f0593ed1 100644 --- a/src/operator/contrib/index_copy-inl.h +++ b/src/operator/contrib/index_copy-inl.h @@ -64,7 +64,7 @@ inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->at(1).ndim(), 1); // Shape matching CHECK_EQ(in_attrs->at(0).ndim(), in_attrs->at(2).ndim()); - for (size_t i = 0; i < in_attrs->at(0).ndim(); ++i) { + for (int i = 0; i < in_attrs->at(0).ndim(); ++i) { if (i == 0) { CHECK_GE(in_attrs->at(0)[i], in_attrs->at(2)[i]); } else { @@ -76,8 +76,7 @@ inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->at(1)[0], in_attrs->at(2)[0]); SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - return out_attrs->at(0).ndim() != 0U && - out_attrs->at(0).Size() != 0U; + return !mxnet::op::shape_is_none(out_attrs->at(0)); } } // namespace op diff --git a/src/operator/contrib/multi_proposal-inl.h b/src/operator/contrib/multi_proposal-inl.h index 4b9a41c2fa87..7010dadfedbc 100644 --- a/src/operator/contrib/multi_proposal-inl.h +++ b/src/operator/contrib/multi_proposal-inl.h @@ -56,8 +56,8 @@ struct MultiProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - nnvm::Tuple scales; - nnvm::Tuple ratios; + mxnet::Tuple scales; + mxnet::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -73,10 +73,10 @@ struct MultiProposalParam : public 
dmlc::Parameter { DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(mxnet::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(mxnet::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -108,7 +108,7 @@ class MultiProposalProp : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 3) << "Input:[cls_prob, bbox_pred, im_info]"; const mxnet::TShape &dshape = in_shape->at(proposal::kClsProb); - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; Shape<4> bbox_pred_shape; bbox_pred_shape = Shape4(dshape[0], dshape[1] * 2, dshape[2], dshape[3]); SHAPE_ASSIGN_CHECK(*in_shape, proposal::kBBoxPred, @@ -214,11 +214,11 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const nnvm::Tuple& ratios, - const nnvm::Tuple& scales, + const mxnet::Tuple& ratios, + const mxnet::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.ndim(); ++j) { - for (size_t k = 0; k < scales.ndim(); ++k) { + for (int j = 0; j < ratios.ndim(); ++j) { + for (int k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } diff --git a/src/operator/contrib/multibox_detection-inl.h b/src/operator/contrib/multibox_detection-inl.h index 977126ad269d..34ad4471dedc 100644 --- a/src/operator/contrib/multibox_detection-inl.h +++ b/src/operator/contrib/multibox_detection-inl.h @@ -52,7 +52,7 @@ struct MultiBoxDetectionParam : public dmlc::Parameter { bool force_suppress; int keep_topk; int nms_topk; - nnvm::Tuple variances; + mxnet::Tuple variances; DMLC_DECLARE_PARAMETER(MultiBoxDetectionParam) { DMLC_DECLARE_FIELD(clip).set_default(true) .describe("Clip out-of-boundary boxes."); @@ -161,7 +161,7 @@ class MultiBoxDetectionProp : public OperatorProperty { CHECK_EQ(cshape[2] * 4, lshape[1]) << "# anchors mismatch with # loc"; CHECK_GT(ashape[1], 0U) << "Number of anchors must > 0"; CHECK_EQ(ashape[2], 4U); - mxnet::TShape oshape = mxnet::TShape(3); + mxnet::TShape oshape = mxnet::TShape(3, -1); oshape[0] = cshape[0]; oshape[1] = ashape[1]; oshape[2] = 6; // [id, prob, xmin, ymin, xmax, ymax] diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc index 8d1082914df7..65fe5f1208bb 100644 --- a/src/operator/contrib/multibox_detection.cc +++ b/src/operator/contrib/multibox_detection.cc @@ -87,7 +87,7 @@ inline void MultiBoxDetectionForward(const Tensor &out, const Tensor &temp_space, const float threshold, const bool clip, - const nnvm::Tuple &variances, + const mxnet::Tuple &variances, const float nms_threshold, const bool force_suppress, const int nms_topk) { diff --git a/src/operator/contrib/multibox_detection.cu b/src/operator/contrib/multibox_detection.cu index 98151f8b8755..51b2aa7cdc77 100644 --- a/src/operator/contrib/multibox_detection.cu +++ 
b/src/operator/contrib/multibox_detection.cu @@ -213,7 +213,7 @@ inline void MultiBoxDetectionForward(const Tensor &out, const Tensor &temp_space, const float threshold, const bool clip, - const nnvm::Tuple &variances, + const mxnet::Tuple &variances, const float nms_threshold, const bool force_suppress, const int nms_topk) { diff --git a/src/operator/contrib/multibox_prior-inl.h b/src/operator/contrib/multibox_prior-inl.h index 3636a6016bd2..bfc244f77805 100644 --- a/src/operator/contrib/multibox_prior-inl.h +++ b/src/operator/contrib/multibox_prior-inl.h @@ -57,11 +57,11 @@ enum MultiBoxPriorOpOutputs {kOut}; } // namespace mboxprior_enum struct MultiBoxPriorParam : public dmlc::Parameter { - nnvm::Tuple sizes; - nnvm::Tuple ratios; + mxnet::Tuple sizes; + mxnet::Tuple ratios; bool clip; - nnvm::Tuple steps; - nnvm::Tuple offsets; + mxnet::Tuple steps; + mxnet::Tuple offsets; DMLC_DECLARE_PARAMETER(MultiBoxPriorParam) { DMLC_DECLARE_FIELD(sizes).set_default({1.0f}) .describe("List of sizes of generated MultiBoxPriores."); @@ -180,7 +180,7 @@ class MultiBoxPriorProp: public OperatorProperty { int in_width = dshape[3]; CHECK_GT(in_width, 0) << "Input width should > 0"; // since input sizes are same in each batch, we could share MultiBoxPrior - mxnet::TShape oshape = mxnet::TShape(3); + mxnet::TShape oshape = mxnet::TShape(3, -1); int num_sizes = param_.sizes.ndim(); int num_ratios = param_.ratios.ndim(); oshape[0] = 1; @@ -189,7 +189,7 @@ class MultiBoxPriorProp: public OperatorProperty { out_shape->clear(); out_shape->push_back(oshape); CHECK_EQ(param_.steps.ndim(), 2) << "Step ndim must be 2: (step_y, step_x)"; - return true; + return shape_is_known(oshape); } OperatorProperty* Copy() const override { diff --git a/src/operator/contrib/multibox_prior.cc b/src/operator/contrib/multibox_prior.cc index ee8f5bfac772..2ad173a2dd93 100644 --- a/src/operator/contrib/multibox_prior.cc +++ b/src/operator/contrib/multibox_prior.cc @@ -44,11 +44,12 @@ inline void MultiBoxPriorForward(const Tensor &out, float center_y = (r + offsets[0]) * step_y; for (int c = 0; c < in_width; ++c) { float center_x = (c + offsets[1]) * step_x; - // ratio = 1, various sizes + // ratio = first ratio, various sizes + float ratio = num_ratios > 0? sqrtf(ratios[0]) : 1.f; for (int i = 0; i < num_sizes; ++i) { float size = sizes[i]; - float w = size * in_height / in_width / 2; - float h = size / 2; + float w = size * in_height / in_width * ratio / 2; + float h = size / ratio / 2; out[count][0] = center_x - w; // xmin out[count][1] = center_y - h; // ymin out[count][2] = center_x + w; // xmax diff --git a/src/operator/contrib/multibox_prior.cu b/src/operator/contrib/multibox_prior.cu index 57901585b45a..54e93adba765 100644 --- a/src/operator/contrib/multibox_prior.cu +++ b/src/operator/contrib/multibox_prior.cu @@ -83,10 +83,11 @@ inline void MultiBoxPriorForward(const Tensor &out, const int stride = 4 * (num_sizes + num_ratios - 1); int offset = 0; - // ratio = 1, various sizes + // ratio = first ratio, various sizes + float ratio = num_ratios > 0? 
sqrtf(ratios[0]) : 1.f; for (int i = 0; i < num_sizes; ++i) { cuda::AssignPriors<<>>(out_ptr, - sizes[i], 1.f, in_width, in_height, step_x, step_y, offset_y, offset_x, stride, offset); + sizes[i], ratio, in_width, in_height, step_x, step_y, offset_y, offset_x, stride, offset); ++offset; } MULTIBOXPRIOR_CUDA_CHECK(cudaPeekAtLastError()); diff --git a/src/operator/contrib/multibox_target-inl.h b/src/operator/contrib/multibox_target-inl.h index f7a92882650c..6034f13ef734 100644 --- a/src/operator/contrib/multibox_target-inl.h +++ b/src/operator/contrib/multibox_target-inl.h @@ -62,7 +62,7 @@ struct MultiBoxTargetParam : public dmlc::Parameter { float negative_mining_ratio; float negative_mining_thresh; int minimum_negative_samples; - nnvm::Tuple variances; + mxnet::Tuple variances; DMLC_DECLARE_PARAMETER(MultiBoxTargetParam) { DMLC_DECLARE_FIELD(overlap_threshold).set_default(0.5f) .describe("Anchor-GT overlap threshold to be regarded as a positive match."); diff --git a/src/operator/contrib/multibox_target.cc b/src/operator/contrib/multibox_target.cc index a1f2aac250ff..a1808c5a7c81 100644 --- a/src/operator/contrib/multibox_target.cc +++ b/src/operator/contrib/multibox_target.cc @@ -81,7 +81,7 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, const float negative_mining_ratio, const float negative_mining_thresh, const int minimum_negative_samples, - const nnvm::Tuple &variances) { + const mxnet::Tuple &variances) { const DType *p_anchor = anchors.dptr_; const int num_batches = labels.size(0); const int num_labels = labels.size(1); diff --git a/src/operator/contrib/multibox_target.cu b/src/operator/contrib/multibox_target.cu index ca0428348a6c..a44c08b08923 100644 --- a/src/operator/contrib/multibox_target.cu +++ b/src/operator/contrib/multibox_target.cu @@ -349,7 +349,7 @@ inline void MultiBoxTargetForward(const Tensor &loc_target, const float negative_mining_ratio, const float negative_mining_thresh, const int minimum_negative_samples, - const nnvm::Tuple &variances) { + const mxnet::Tuple &variances) { const int num_batches = labels.size(0); const int num_labels = labels.size(1); const int label_width = labels.size(2); diff --git a/src/operator/contrib/nnvm_to_onnx.cc b/src/operator/contrib/nnvm_to_onnx.cc index 0417a085616a..0c8bd79490e3 100644 --- a/src/operator/contrib/nnvm_to_onnx.cc +++ b/src/operator/contrib/nnvm_to_onnx.cc @@ -417,7 +417,8 @@ std::unordered_map GetPlaceholderShapes( for (uint32_t i = 0; i < shape_inputs.size(); ++i) { std::string name = ig[ig.input_nodes()[i]].source->attrs.name; mxnet::TShape shp = shape_inputs[i]; - if (shp.ndim() > 0) { + if (!mxnet::op::shape_is_none(shp)) { + // TODO(@reminisce): confirm placeholder_shapes.emplace(name, shp); } } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 9f948bad81b6..83bbcdab833d 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -45,7 +45,7 @@ inline bool GroupAdagradShape(const nnvm::NodeAttrs &attrs, SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); SHAPE_ASSIGN_CHECK(*in_attrs, 1, out_attrs->at(0)); - return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U && + return !mxnet::op::shape_is_none(out_attrs->at(0)) && (in_attrs->at(0)[0] == in_attrs->at(1)[0]) && (in_attrs->at(0)[0] == in_attrs->at(2)[0]); } diff --git a/src/operator/contrib/proposal-inl.h b/src/operator/contrib/proposal-inl.h index 9908ca96ec5f..10f1f86806e4 100644 --- a/src/operator/contrib/proposal-inl.h +++ 
b/src/operator/contrib/proposal-inl.h @@ -54,8 +54,8 @@ struct ProposalParam : public dmlc::Parameter { int rpn_post_nms_top_n; float threshold; int rpn_min_size; - nnvm::Tuple scales; - nnvm::Tuple ratios; + mxnet::Tuple scales; + mxnet::Tuple ratios; int feature_stride; bool output_score; bool iou_loss; @@ -71,10 +71,10 @@ struct ProposalParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(rpn_min_size).set_default(16) .describe("Minimum height or width in proposal"); tmp[0] = 4.0f; tmp[1] = 8.0f; tmp[2] = 16.0f; tmp[3] = 32.0f; - DMLC_DECLARE_FIELD(scales).set_default(nnvm::Tuple(tmp, tmp + 4)) + DMLC_DECLARE_FIELD(scales).set_default(mxnet::Tuple(tmp, tmp + 4)) .describe("Used to generate anchor windows by enumerating scales"); tmp[0] = 0.5f; tmp[1] = 1.0f; tmp[2] = 2.0f; - DMLC_DECLARE_FIELD(ratios).set_default(nnvm::Tuple(tmp, tmp + 3)) + DMLC_DECLARE_FIELD(ratios).set_default(mxnet::Tuple(tmp, tmp + 3)) .describe("Used to generate anchor windows by enumerating ratios"); DMLC_DECLARE_FIELD(feature_stride).set_default(16) .describe("The size of the receptive field each unit in the convolution layer of the rpn," @@ -106,7 +106,7 @@ class ProposalProp : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 3) << "Input:[cls_prob, bbox_pred, im_info]"; const mxnet::TShape &dshape = in_shape->at(proposal::kClsProb); - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; Shape<4> bbox_pred_shape; bbox_pred_shape = Shape4(dshape[0], dshape[1] * 2, dshape[2], dshape[3]); SHAPE_ASSIGN_CHECK(*in_shape, proposal::kBBoxPred, @@ -212,11 +212,11 @@ inline void _Transform(float scale, // out_anchors must have shape (n, 5), where n is ratios.size() * scales.size() inline void GenerateAnchors(const std::vector& base_anchor, - const nnvm::Tuple& ratios, - const nnvm::Tuple& scales, + const mxnet::Tuple& ratios, + const mxnet::Tuple& scales, std::vector *out_anchors) { - for (size_t j = 0; j < ratios.ndim(); ++j) { - for (size_t k = 0; k < scales.ndim(); ++k) { + for (int j = 0; j < ratios.ndim(); ++j) { + for (int k = 0; k < scales.ndim(); ++k) { _Transform(scales[k], ratios[j], base_anchor, out_anchors); } } diff --git a/src/operator/contrib/quadratic_op-inl.h b/src/operator/contrib/quadratic_op-inl.h index e679fedc8e57..a7aca63de17a 100644 --- a/src/operator/contrib/quadratic_op-inl.h +++ b/src/operator/contrib/quadratic_op-inl.h @@ -60,7 +60,7 @@ inline bool QuadraticOpShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); - return out_attrs->at(0).ndim() != 0U && out_attrs->at(0).Size() != 0U; + return !mxnet::op::shape_is_none(out_attrs->at(0)); } inline bool QuadraticOpType(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/contrib/sync_batch_norm-inl.h b/src/operator/contrib/sync_batch_norm-inl.h index 1e6ab25db0e2..cd1a3285fe06 100644 --- a/src/operator/contrib/sync_batch_norm-inl.h +++ b/src/operator/contrib/sync_batch_norm-inl.h @@ -482,7 +482,7 @@ class SyncBatchNormProp : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (mxnet::op::shape_is_none(dshape)) return false; in_shape->at(1) = mxnet::TShape(Shape1(dshape[1])); in_shape->at(2) = mxnet::TShape(Shape1(dshape[1])); out_shape->clear(); diff --git a/src/operator/contrib/transformer-inl.h b/src/operator/contrib/transformer-inl.h index 
01faf244aff9..da3d14e33cf4 100644
--- a/src/operator/contrib/transformer-inl.h
+++ b/src/operator/contrib/transformer-inl.h
@@ -41,7 +41,9 @@ static void DivSqrtDimForward_(const nnvm::NodeAttrs& attrs,
                                const std::vector<OpReqType>& req,
                                const std::vector<TBlob>& outputs) {
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  double sqrt_dim = std::sqrt(static_cast<double>(inputs[0].shape_[inputs[0].ndim() - 1]));
+  CHECK_GE(inputs[0].ndim(), 1);
+  int last_idx = inputs[0].ndim() - 1;
+  double sqrt_dim = std::sqrt(static_cast<double>(inputs[0].shape_[last_idx]));
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
     MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
       mxnet_op::Kernel<op_with_req<mshadow_op::div, Req>, xpu>::Launch(
diff --git a/src/operator/control_flow.cc b/src/operator/control_flow.cc
index ac6fea7c143b..4c0d67bb08f7 100644
--- a/src/operator/control_flow.cc
+++ b/src/operator/control_flow.cc
@@ -37,11 +37,11 @@ struct ForeachParam : public dmlc::Parameter<ForeachParam> {
   int num_outputs;
   int num_out_data;
   // The location of states in the subgraph inputs.
-  nnvm::Tuple<dim_t> in_state_locs;
+  mxnet::Tuple<dim_t> in_state_locs;
   // The location of data arrays in the subgraph inputs.
-  nnvm::Tuple<dim_t> in_data_locs;
+  mxnet::Tuple<dim_t> in_data_locs;
   // The location of remaining arrays in the subgraph inputs.
-  nnvm::Tuple<dim_t> remain_locs;
+  mxnet::Tuple<dim_t> remain_locs;
   DMLC_DECLARE_PARAMETER(ForeachParam) {
     DMLC_DECLARE_FIELD(num_args).set_lower_bound(1)
     .describe("Number of inputs.");
@@ -82,7 +82,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr,
   CHECK_GT(params.in_data_locs.ndim(), 0);
   size_t len = inputs[0].shape()[iter_dim];
   state.num_iterations = len;
-  for (size_t i = 1; i < params.in_data_locs.ndim(); i++)
+  for (int i = 1; i < params.in_data_locs.ndim(); i++)
     CHECK_EQ(inputs[i].shape()[iter_dim], len);
   for (size_t i = 0; i < (size_t) params.num_out_data; i++)
     CHECK_EQ(len, outputs[i].shape()[iter_dim]);
@@ -120,7 +120,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr,
   // and the loop states.
   std::vector<NDArray> subg_inputs(inputs.size());
   // The remaining arrays (other than input data and states) only need to be set once.
-  for (size_t j = 0; j < params.remain_locs.ndim(); j++) {
+  for (int j = 0; j < params.remain_locs.ndim(); j++) {
     CHECK_LT(params.remain_locs[j], subg_inputs.size());
     subg_inputs[params.remain_locs[j]] =
       inputs[j + params.in_data_locs.ndim() + params.in_state_locs.ndim()];
@@ -148,7 +148,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr,
     // Initialize inputs for the subgraph.
     // Get a slice from the input data arrays.
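// Illustration with hypothetical shapes: for a loop of length len, an input
// data array of shape (len, d1, d2) contributes the slice inputs[j].At(i),
// of shape (d1, d2), to iteration i; the iteration axis is always axis 0.
// Only these per-iteration slices are re-bound below -- the remaining arrays
// were wired into subg_inputs once, before the loop started.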
-    for (size_t j = 0; j < params.in_data_locs.ndim(); j++) {
+    for (int j = 0; j < params.in_data_locs.ndim(); j++) {
       size_t loc = params.in_data_locs[j];
       subg_inputs[loc] = inputs[j].At(i);
     }
@@ -161,7 +161,7 @@ static void ForeachComputeExCPU(const OpStatePtr& state_ptr,
         subg_inputs[params.in_state_locs[idx]] = (*subg_out_prev)[j];
       }
     } else {
-      for (size_t j = 0; j < params.in_state_locs.ndim(); j++) {
+      for (int j = 0; j < params.in_state_locs.ndim(); j++) {
         CHECK_LT(params.in_state_locs[j], subg_inputs.size());
         subg_inputs[params.in_state_locs[j]] = inputs[j + params.in_data_locs.ndim()];
       }
@@ -203,7 +203,7 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr,
   // [data vars], [loop vars], [remaining vars]
   // [remaining vars]
-  for (size_t i = 0; i < params.remain_locs.ndim(); i++) {
+  for (int i = 0; i < params.remain_locs.ndim(); i++) {
     size_t loc = params.remain_locs[i];
     size_t orig_loc = i + params.in_data_locs.ndim() + params.in_state_locs.ndim();
     subg_igrads[loc] = outputs[orig_loc];
@@ -216,20 +216,20 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr,
     if (iter_num < len - 1) {
       // For the rest of the iterations, we should add gradients to the
       // remaining vars.
-      for (size_t i = 0; i < params.remain_locs.ndim(); i++) {
+      for (int i = 0; i < params.remain_locs.ndim(); i++) {
         size_t loc = params.remain_locs[i];
         subg_req[loc] = kAddTo;
       }
     }
     // [data vars]
-    for (size_t i = 0; i < params.in_data_locs.ndim(); i++) {
+    for (int i = 0; i < params.in_data_locs.ndim(); i++) {
       size_t loc = params.in_data_locs[i];
       subg_igrads[loc] = outputs[i].At(iter_num);
       subg_req[loc] = req[i];
     }
     // [loop vars]
-    for (size_t i = 0; i < params.in_state_locs.ndim(); i++) {
+    for (int i = 0; i < params.in_state_locs.ndim(); i++) {
       size_t loc = params.in_state_locs[i];
       const NDArray &output = outputs[i + params.in_data_locs.ndim()];
       if (iter_num != 0) {
@@ -258,9 +258,9 @@ static void ForeachGradComputeExCPU(const OpStatePtr& state_ptr,
 template<typename T>
 static void remap(const std::vector<T> &op_in, size_t start,
-                  const nnvm::Tuple<dim_t> &locs, std::vector<T> *subg_in) {
+                  const mxnet::Tuple<dim_t> &locs, std::vector<T> *subg_in) {
   auto op_in_it = op_in.begin() + start;
-  for (size_t i = 0; i < locs.ndim(); i++) {
+  for (int i = 0; i < locs.ndim(); i++) {
     dim_t loc = locs[i];
     subg_in->at(loc) = *(op_in_it + i);
   }
@@ -284,7 +284,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs,
   mxnet::ShapeVector subg_in_shape(in_shape->size());
   // data shape
   std::vector<bool> data_1d(params.in_data_locs.ndim(), false);
-  for (size_t i = 0; i < params.in_data_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_data_locs.ndim(); i++) {
     size_t loc = params.in_data_locs[i];
     if (in_shape->at(i).ndim() == 1)
       data_1d[i] = true;
@@ -301,7 +301,7 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs,
   for (int i = 0; i < params.num_out_data; i++) {
     mxnet::TShape shape = subg_out_shape[i];
     // If we don't have shape info, we don't need to do anything.
-    if (shape.ndim() == 0)
+    if (!mxnet::ndim_is_known(shape))
       continue;
     subg_out_shape[i] = SliceFirstDim(shape);
   }
@@ -317,12 +317,12 @@ static bool ForeachShape(const nnvm::NodeAttrs& attrs,
   for (int i = 0; i < params.num_out_data; i++) {
     // If the output shape isn't inferred, we don't need to propagate the info.
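// Sketch of the prepend step below: a per-iteration output of rank k becomes
// a rank k+1 operator output with the iteration count at the front.
//   g_out_shape = (d1, ..., dk)        // inferred for one loop step
//   out         = (len, d1, ..., dk)   // reported for the foreach output
// TShape(g_out_shape.ndim() + 1, -1) starts with every dimension unknown and
// each slot is then filled, so the shape handed to SHAPE_ASSIGN_CHECK below
// is fully known.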
     const auto& g_out_shape = subg_out_shape[i];
-    if (g_out_shape.ndim() == 0)
+    if (!mxnet::ndim_is_known(g_out_shape))
       continue;
-    auto out = mxnet::TShape(g_out_shape.ndim() + 1);
+    auto out = mxnet::TShape(g_out_shape.ndim() + 1, -1);
     out[0] = len;
-    for (size_t i = 1; i < out.ndim(); i++)
+    for (int i = 1; i < out.ndim(); i++)
       out[i] = g_out_shape[i - 1];
     SHAPE_ASSIGN_CHECK(*out_shape, i, out);
   }
@@ -331,34 +331,34 @@
     SHAPE_ASSIGN_CHECK(*out_shape, i, subg_out_shape[i]);
   // For the shape of input data.
-  for (size_t i = 0; i < params.in_data_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_data_locs.ndim(); i++) {
     size_t loc = params.in_data_locs[i];
     const auto &shape = subg_in_shape[loc];
     // If the input data shape isn't inferred, we don't need to propagate the
     // info.
-    if (shape.ndim() == 0)
+    if (!mxnet::ndim_is_known(shape))
       continue;
     if (data_1d[i]) {
-      mxnet::TShape s(1);
+      mxnet::TShape s(1, -1);
       s[0] = len;
       SHAPE_ASSIGN_CHECK(*in_shape, i, s);
     } else {
-      auto in = mxnet::TShape(shape.ndim() + 1);
+      auto in = mxnet::TShape(shape.ndim() + 1, -1);
       in[0] = len;
-      for (size_t i = 1; i < in.ndim(); i++)
+      for (int i = 1; i < in.ndim(); i++)
         in[i] = shape[i - 1];
       SHAPE_ASSIGN_CHECK(*in_shape, i, in);
     }
   }
   // For the shape of state.
-  for (size_t i = 0; i < params.in_state_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_state_locs.ndim(); i++) {
     size_t loc = params.in_state_locs[i];
     SHAPE_ASSIGN_CHECK(*in_shape, i + params.in_data_locs.ndim(),
                        subg_in_shape[loc]);
   }
   // For the shape of remaining data.
-  for (size_t i = 0; i < params.remain_locs.ndim(); i++) {
+  for (int i = 0; i < params.remain_locs.ndim(); i++) {
     size_t loc = params.remain_locs[i];
     SHAPE_ASSIGN_CHECK(*in_shape,
                        i + params.in_data_locs.ndim() + params.in_state_locs.ndim(),
@@ -387,15 +387,15 @@ static bool ForeachType(const nnvm::NodeAttrs& attrs,
   remap(*in_type, params.in_data_locs.ndim() + params.in_state_locs.ndim(),
         params.remain_locs, &subg_in_type);
   bool success = InferSubgraphDataType(*attrs.subgraphs[0], &subg_in_type, out_type);
-  for (size_t i = 0; i < params.in_data_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_data_locs.ndim(); i++) {
     size_t loc = params.in_data_locs[i];
     TYPE_ASSIGN_CHECK(*in_type, i, subg_in_type[loc]);
   }
-  for (size_t i = 0; i < params.in_state_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_state_locs.ndim(); i++) {
     size_t loc = params.in_state_locs[i];
     TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim(), subg_in_type[loc]);
   }
-  for (size_t i = 0; i < params.remain_locs.ndim(); i++) {
+  for (int i = 0; i < params.remain_locs.ndim(); i++) {
     size_t loc = params.remain_locs[i];
     TYPE_ASSIGN_CHECK(*in_type, i + params.in_data_locs.ndim() + params.in_state_locs.ndim(),
                       subg_in_type[loc]);
@@ -418,16 +418,16 @@ static bool ForeachStorageType(const nnvm::NodeAttrs& attrs,
                 params.remain_locs, &subg_in_attrs);
   bool success = InferSubgraphStorage(*attrs.subgraphs[0], dev_mask,
                                       dispatch_mode, &subg_in_attrs, out_attrs);
-  for (size_t i = 0; i < params.in_data_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_data_locs.ndim(); i++) {
     size_t loc = params.in_data_locs[i];
     STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i, subg_in_attrs[loc]);
   }
-  for (size_t i = 0; i < params.in_state_locs.ndim(); i++) {
+  for (int i = 0; i < params.in_state_locs.ndim(); i++) {
     size_t loc = params.in_state_locs[i];
     STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, i + params.in_data_locs.ndim(),
                               subg_in_attrs[loc]);
   }
-  for (size_t i = 0; i < params.remain_locs.ndim(); i++) {
+  for (int i = 0; i < params.remain_locs.ndim(); i++) {
     size_t loc = params.remain_locs[i];
     STORAGE_TYPE_ASSIGN_CHECK(*in_attrs,
                               i + params.in_data_locs.ndim() + params.in_state_locs.ndim(),
@@ -488,9 +488,9 @@ struct WhileLoopParam : public dmlc::Parameter<WhileLoopParam> {
   // `cond_input_locs' contains indices of inputs fed to `cond', and
   // `func_input_locs' contains indices of inputs fed to `func'.
   // `func_var_locs' are indices in which input "variables" are stored in func's inputs.
-  nnvm::Tuple<dim_t> cond_input_locs;
-  nnvm::Tuple<dim_t> func_input_locs;
-  nnvm::Tuple<dim_t> func_var_locs;
+  mxnet::Tuple<dim_t> cond_input_locs;
+  mxnet::Tuple<dim_t> func_input_locs;
+  mxnet::Tuple<dim_t> func_var_locs;
   DMLC_DECLARE_PARAMETER(WhileLoopParam) {
     DMLC_DECLARE_FIELD(num_args).set_lower_bound(2)
     .describe("Number of input arguments, including cond and func as two symbol inputs.");
@@ -538,12 +538,12 @@ class WhileLoopState: public LoopState {
     n_iterations(0U),
     cond_op(LoopState::MakeSharedOp(cond)),
     oi_map(params.func_var_locs.ndim(), -1) {
-    const nnvm::Tuple<dim_t> &func_input_locs = params.func_input_locs;
-    const nnvm::Tuple<dim_t> &func_var_locs = params.func_var_locs;
-    const nnvm::Tuple<dim_t> &cond_input_locs = params.cond_input_locs;
-    for (size_t i = 0; i < func_var_locs.ndim(); ++i) {
+    const mxnet::Tuple<dim_t> &func_input_locs = params.func_input_locs;
+    const mxnet::Tuple<dim_t> &func_var_locs = params.func_var_locs;
+    const mxnet::Tuple<dim_t> &cond_input_locs = params.cond_input_locs;
+    for (int i = 0; i < func_var_locs.ndim(); ++i) {
       dim_t pos_i = func_input_locs[func_var_locs[i]];
-      for (size_t j = 0; j < cond_input_locs.ndim(); ++j) {
+      for (int j = 0; j < cond_input_locs.ndim(); ++j) {
         dim_t pos_j = cond_input_locs[j];
         if (pos_i == pos_j) {
           this->oi_map[i] = j;
@@ -740,7 +740,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs,
   // infer shape for cond and func
   auto infer_subg = [&params, in_shape, out_shape](std::shared_ptr<Symbol> subg,
                                                    ShapeVector *_subg_out,
-                                                   const nnvm::Tuple<dim_t> &input_locs,
+                                                   const mxnet::Tuple<dim_t> &input_locs,
                                                    int num_out_data,
                                                    bool fill_out_shape) {
     // create subg_in
@@ -781,7 +781,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs,
     for (size_t i = 0; i < subg_in.size(); ++i) {
       auto eid = idx.entry_id(input_nids[i], 0);
       auto g_out_shape = new_shapes[eid];
-      if (g_out_shape.ndim() == 0 || g_out_shape.Size() == 0) {
+      if (!shape_is_known(g_out_shape)) {
        // when the shape is not fully inferred
        continue;
      }
@@ -795,13 +795,13 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs,
     for (int i = 0; i < num_out_data; ++i) {
       auto eid = idx.entry_id(g.outputs[i]);
       auto g_out_shape = new_shapes[eid];
-      if (g_out_shape.ndim() == 0 || g_out_shape.Size() == 0) {
+      if (!shape_is_known(g_out_shape)) {
         // when the shape is not fully inferred
         continue;
       }
-      auto out = mxnet::TShape(g_out_shape.ndim() + 1);
+      auto out = mxnet::TShape(g_out_shape.ndim() + 1, -1);
       out[0] = params.max_iterations;
-      for (size_t i = 1; i < out.ndim(); i++)
+      for (int i = 1; i < out.ndim(); i++)
         out[i] = g_out_shape[i - 1];
       SHAPE_ASSIGN_CHECK(*out_shape, i, out);
     }
@@ -809,7 +809,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs,
     for (size_t i = num_out_data; i < g.outputs.size(); ++i) {
       auto eid = idx.entry_id(g.outputs[i]);
       auto g_out_shape = new_shapes[eid];
-      if (g_out_shape.ndim() == 0 || g_out_shape.Size() == 0) {
+      if (!shape_is_known(g_out_shape)) {
         // when the shape is not fully inferred
         continue;
       }
@@ -817,7 +817,7 @@ static bool WhileLoopShape(const nnvm::NodeAttrs& attrs,
     }
     return g.GetAttr<size_t>("shape_num_unknown_nodes") == 0;
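// Note on the cond_out_shape change just below: the old TShape(1U) only fixed
// the rank, leaving the dimension value unspecified, whereas TShape(1, 1)
// spells out "rank 1, one element", i.e. the shape (1,) that the loop
// condition subgraph is expected to produce. Illustration:
//   mxnet::TShape cond_shape(1, 1);    // ndim = 1, dims = {1}
//   CHECK_EQ(cond_shape.Size(), 1U);   // a single boolean-like value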
   };
-  mxnet::ShapeVector cond_out_shape{mxnet::TShape(1U)};  // this means: [(1, )]
+  mxnet::ShapeVector cond_out_shape{mxnet::TShape(1, 1)};  // this means: [(1, )]
   mxnet::ShapeVector func_out_shape(params.num_outputs);
   CHECK(params.sync_in_out(in_shape, out_shape, is_udf));
   bool succ_0 = infer_subg(attrs.subgraphs[0], &cond_out_shape, params.cond_input_locs, 0, false);
@@ -915,9 +915,9 @@ WhileLoopGradient(const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& og
 struct CondParam : public dmlc::Parameter<CondParam> {
   int num_args;
   int num_outputs;
-  nnvm::Tuple<dim_t> cond_input_locs;
-  nnvm::Tuple<dim_t> then_input_locs;
-  nnvm::Tuple<dim_t> else_input_locs;
+  mxnet::Tuple<dim_t> cond_input_locs;
+  mxnet::Tuple<dim_t> then_input_locs;
+  mxnet::Tuple<dim_t> else_input_locs;
   DMLC_DECLARE_PARAMETER(CondParam) {
     DMLC_DECLARE_FIELD(num_args).set_lower_bound(3)
     .describe("Number of input arguments, including cond, then and else as three symbol inputs.");
@@ -992,7 +992,7 @@ static void CondComputeExCPU(const OpStatePtr& state_ptr,
   state.cond_op->Forward(nullptr, cond_input_ptr, cond_output_ptr);
   branch_selection = as_bool_scalar(*cond_output_ptr[0]);
   // select the right branch
-  const nnvm::Tuple<dim_t> &func_input_locs = branch_selection
+  const mxnet::Tuple<dim_t> &func_input_locs = branch_selection
                                             ? params.then_input_locs
                                             : params.else_input_locs;
   LoopState &loop_state = branch_selection
@@ -1017,7 +1017,7 @@ static void CondGradComputeExCPU(const OpStatePtr& state_ptr,
   // select the right branch
   int branch_selection = state.branch_selection;
   CHECK_NE(branch_selection, -1);
-  const nnvm::Tuple<dim_t> &func_input_locs = branch_selection
+  const mxnet::Tuple<dim_t> &func_input_locs = branch_selection
                                             ? params.then_input_locs
                                             : params.else_input_locs;
   LoopState &loop_state = branch_selection
@@ -1048,7 +1048,7 @@ static bool CondShape(const nnvm::NodeAttrs& attrs,
   // infer shape for cond, then and else
   auto infer_subg = [&params, in_shape, out_shape](std::shared_ptr<Symbol> subg,
                                                    ShapeVector *_subg_out,
-                                                   const nnvm::Tuple<dim_t> &input_locs,
+                                                   const mxnet::Tuple<dim_t> &input_locs,
                                                    bool fill_out_shape) {
     // create subg_in
     mxnet::ShapeVector subg_in;
@@ -1086,7 +1086,7 @@ static bool CondShape(const nnvm::NodeAttrs& attrs,
     for (size_t i = 0; i < subg_in.size(); ++i) {
       auto eid = idx.entry_id(input_nids[i], 0);
       auto g_out_shape = new_shapes[eid];
-      if (g_out_shape.ndim() == 0 || g_out_shape.Size() == 0) {
+      if (!shape_is_known(g_out_shape)) {
         // when the shape is not fully inferred
         continue;
       }
@@ -1099,7 +1099,7 @@ static bool CondShape(const nnvm::NodeAttrs& attrs,
     for (size_t i = 0; i < g.outputs.size(); ++i) {
       auto eid = idx.entry_id(g.outputs[i]);
       auto g_out_shape = new_shapes[eid];
-      if (g_out_shape.ndim() == 0 || g_out_shape.Size() == 0) {
+      if (!shape_is_known(g_out_shape)) {
         // when the shape is not fully inferred
         continue;
       }
@@ -1107,7 +1107,7 @@ static bool CondShape(const nnvm::NodeAttrs& attrs,
     }
     return g.GetAttr<size_t>("shape_num_unknown_nodes") == 0;
   };
-  ShapeVector cond_out_shape{mxnet::TShape(1U)};  // this means: [(1, )]
+  ShapeVector cond_out_shape{mxnet::TShape(1, 1)};  // this means: [(1, )]
   ShapeVector then_out_shape(params.num_outputs);
   ShapeVector else_out_shape(params.num_outputs);
   bool succ_0 = infer_subg(attrs.subgraphs[0], &cond_out_shape, \
@@ -1190,7 +1190,7 @@ static bool BackwardCondStorageType(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size() + 3U, (size_t) params.num_args);
   CHECK_EQ(attrs.subgraphs.size(), 3U);
   static const std::function<bool(const int &)> is_udf = is_stype_udf;
   auto sub_pass = [&](const
std::shared_ptr &subg, const mxnet::Tuple &input_locs) { // A. first construct subg_in_attrs // need subg_in_attrs as subg_bwd_out (copy), subg_fwd_in (extract), subg_fwd_out (copy) std::vector subg_in_attrs; diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h index ed6748a9c85c..d2126bd29d80 100644 --- a/src/operator/convolution_v1-inl.h +++ b/src/operator/convolution_v1-inl.h @@ -64,11 +64,11 @@ struct ConvolutionV1Param : public dmlc::Parameter { dmlc::optional layout; DMLC_DECLARE_PARAMETER(ConvolutionV1Param) { DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, 0)) .describe("convolution stride: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, 0)) .describe("convolution dilate: (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, 0)) .describe("pad for convolution: (h, w) or (d, h, w)"); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("convolution filter(channel) number"); @@ -336,7 +336,7 @@ class ConvolutionV1Op : public Operator { // param_.workspace is in elements of sizeof(DType) // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch) nstep_ = std::max( - std::min(static_cast(param_.workspace) / + std::min(param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), 1); @@ -405,7 +405,7 @@ class ConvolutionV1Prop : public OperatorProperty { // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; out_shape->resize(1, mxnet::TShape()); const mxnet::TShape &dshp = (*in_shape)[conv_v1::kData]; - if (dshp.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshp)) return false; if (param_.kernel.ndim() == 2) { // 2d conv_v1 CHECK_EQ(dshp.ndim(), 4U) \ diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h deleted file mode 100644 index cc8e4db404da..000000000000 --- a/src/operator/cudnn_rnn-inl.h +++ /dev/null @@ -1,863 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2016 by Contributors - * \file cudnn_rnn-inl.h - * \brief - * \author Sebastian Bodenstein -*/ -#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ -#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ - -#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 - -#include -#include -#include -#include -#include -#include -#include "./rnn-inl.h" - -namespace mxnet { -namespace op { -#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -template -class CuDNNRNNOp : public Operator { - public: - explicit CuDNNRNNOp(RNNParam param) { - this->param_ = param; - init_cudnn_ = false; - dtype_ = mshadow::DataType::kCudnnFlag; - // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. - // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. - cudnn_tensor_core_ = false; - // When fp16 RNN tests are introduced, we can enable TensorCore as follows: -// cudnn_tensor_core = -// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); - // Defaults - input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet - // RNN Mode - switch (param_.mode) { - case rnn_enum::kRnnRelu: - mode_ = CUDNN_RNN_RELU; - break; - case rnn_enum::kRnnTanh: - mode_ = CUDNN_RNN_TANH; - break; - case rnn_enum::kLstm: - mode_ = CUDNN_LSTM; - break; - case rnn_enum::kGru: - mode_ = CUDNN_GRU; - break; - default: - LOG(FATAL) << "Not implmented"; - } -#if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "Projection is only supported for LSTM."; - CHECK_GE(param_.state_size, param_.projection_size.value()) - << "State size must be larger than projection size."; - } -#else - CHECK(!param_.projection_size.has_value()) - << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; -#endif -#if USE_CUDNN_LSTM_PROJ - if (param_.lstm_state_clip_min.has_value() - || param_.lstm_state_clip_max.has_value()) { - CHECK_EQ(param_.mode, rnn_enum::kLstm) - << "State clipping is only supported for LSTM."; - CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) - << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; - CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) - << "lstm_state_clip_max must be greater or equal to lstm_state_clip_min"; - } -#else - CHECK(!param_.lstm_state_clip_min.has_value() - && !param_.lstm_state_clip_max.has_value()) - << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; -#endif - // RNN Direction - direction_ = param_.bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - // Other - if (param_.mode == rnn_enum::kLstm) - param_.lstm_q_ = true; - else - param_.lstm_q_ = false; - - // Create descriptors - CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); - CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); - CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); - #endif - } - - ~CuDNNRNNOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); - CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); - CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); - - if (init_cudnn_) { - for (size_t i = 0; i < x_desc_vec_.size(); ++i) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); - CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); - } - init_cudnn_ = false; - - Storage::Get()->Free(reserve_space_); - if (param_.p > 0) { - Storage::Get()->Free(dropout_states_); - } - } - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); - CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); - #endif - } - - virtual void Forward(const OpContext &ctx, const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 
3 : 2; - if (!param_.state_outputs) - out_expected = 1; - - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - - void * hy_ptr = NULL; - if (param_.state_outputs) - hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; - - DType * cx_ptr = NULL; - DType * cy_ptr = NULL; - - if (param_.lstm_q_) - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - if (param_.lstm_q_ && param_.state_outputs) - cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); - CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - int out_size = - (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; - out_size = (param_.bidirectional) ? (out_size * 2) : out_size; - CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - if (ctx.is_train) { - CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - seqLengthArray.data(), - nullptr)); - CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, - dtype_, - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, - param_.seq_length_, - param_.batch_size_, - out_size, - seqLengthArray.data(), - nullptr)); - } - #endif - - #if USE_CUDNN_LSTM_PROJ - bool clip_state = param_.lstm_state_clip_min.has_value(); - bool clip_nan = param_.lstm_state_clip_nan; - CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, - rnn_desc_, - clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, - clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, - clip_state ? param_.lstm_state_clip_min.value() : 0.0, - clip_state ? 
param_.lstm_state_clip_max.value() : 0.0)); - #endif - - if (ctx.is_train) { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - #endif - } else { - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_data_desc_, - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_)); - #else - CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - w_desc_, - w.dptr_, - y_desc_vec_.data(), - y.dptr_, - hy_desc_, - hy_ptr, - cy_desc_, - cy_ptr, - temp_space.dptr_, - workspace_byte_)); - #endif - } - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 
3 : 2; - if (!param_.state_outputs) - out_expected = 1; - - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); - CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; - CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - Stream *s = ctx.get_stream(); - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - if (req[rnn_enum::kParams] != kAddTo) { - dw = mshadow::expr::ScalarExp(0.0f); - } - // only need kStateOut grad output_states is true - void * dhy_ptr = NULL; - if (param_.state_outputs) - dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; - - // Deal with lstm - void * dcx_ptr = NULL; - void * dcy_ptr = NULL; - void * cx_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; - dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; - } - if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) - dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - - CHECK_EQ(x.CheckContiguous(), true); - CHECK_EQ(w.CheckContiguous(), true); - CHECK_EQ(dw.CheckContiguous(), true); - CHECK_EQ(hx.CheckContiguous(), true); - CHECK_EQ(dhx.CheckContiguous(), true); - CHECK_EQ(y.CheckContiguous(), true); - CHECK_EQ(dy.CheckContiguous(), true); - - if (!init_cudnn_) { - Init(s, in_data, out_data); - } - - // Get temp space - int temp_size = workspace_size_; - Tensor temp_space = - ctx.requested[rnn_enum::kTempSpace].get_space_typed( - mshadow::Shape1(temp_size), s); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, - rnn_desc_, - y_data_desc_, - y.dptr_, - dy_data_desc_, - dy.dptr_, - nullptr, - nullptr, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_data_desc_, - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - nullptr, - nullptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, - rnn_desc_, - x_data_desc_, - x.dptr_, - hx_desc_, - hx.dptr_, - y_data_desc_, - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - reserve_space_byte_)); - #else - CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - y_desc_vec_.data(), - y.dptr_, - dy_desc_vec_.data(), - dy.dptr_, - dhy_desc_, - dhy_ptr, - dcy_desc_, - dcy_ptr, - w_desc_, - w.dptr_, - hx_desc_, - hx.dptr_, - cx_desc_, - cx_ptr, - dx_desc_vec_.data(), - dx.dptr_, - dhx_desc_, - dhx.dptr_, - dcx_desc_, - dcx_ptr, - temp_space.dptr_, - workspace_byte_, - reserve_space_.dptr, - reserve_space_byte_)); - CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - x.dptr_, - hx_desc_, - hx.dptr_, - y_desc_vec_.data(), - y.dptr_, - temp_space.dptr_, - workspace_byte_, - dw_desc_, - dw.dptr_, - reserve_space_.dptr, - 
reserve_space_byte_)); - #endif - } - - private: - inline void Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { - using namespace mshadow; - #if CUDNN_MAJOR >= 5 - format_ = CUDNN_TENSOR_NCHW; - #endif - size_t in_expected = param_.lstm_q_ ? 4 : 3; - size_t out_expected = param_.lstm_q_ ? 3 : 2; - if (!param_.state_outputs) - out_expected = 1; - - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - if (!init_cudnn_) { - init_cudnn_ = true; - // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - param_.seq_length_ = x.shape_[0]; - param_.batch_size_ = x.shape_[1]; - param_.input_size_ = x.shape_[2]; - - // Tensor Descriptors - std::vector x_vec(param_.seq_length_); - std::vector y_vec(param_.seq_length_); - std::vector dx_vec(param_.seq_length_); - std::vector dy_vec(param_.seq_length_); - int dimA[3]; - int strideA[3]; - for (int i = 0; i < param_.seq_length_; i++) { - CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); - CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); - - dimA[0] = param_.batch_size_; - dimA[1] = param_.input_size_; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], - dtype_, - 3, - dimA, - strideA)); - dimA[0] = param_.batch_size_; - dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - - CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], - dtype_, - 3, - dimA, - strideA)); - CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], - dtype_, - 3, - dimA, - strideA)); - } - x_desc_vec_ = x_vec; - y_desc_vec_ = y_vec; - dx_desc_vec_ = dx_vec; - dy_desc_vec_ = dy_vec; - - // set the state tensors - dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimA[1] = param_.batch_size_; - dimA[2] = param_.state_size; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - #if USE_CUDNN_LSTM_PROJ - int dimB[3]; - int strideB[3]; - dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); - dimB[1] = param_.batch_size_; - dimB[2] = param_.projection_size.has_value() ? 
- param_.projection_size.value() : param_.state_size; - strideB[0] = dimB[2] * dimB[1]; - strideB[1] = dimB[2]; - strideB[2] = 1; - #endif - - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, - dtype_, - 3, - dimA, - strideA)); - #if USE_CUDNN_LSTM_PROJ - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimB, - strideB)); - #else - CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, - dtype_, - 3, - dimA, - strideA)); - #endif - CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, - dtype_, - 3, - dimA, - strideA)); - - // Create Dropout descriptors - if (param_.p > 0) { - CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); - dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); - } else { - dropout_states_ = {}; - dropout_byte_ = 0; - } - CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, - param_.p, // discard probability - dropout_states_.dptr, dropout_byte_, - seed_)); - // RNN descriptors - #if CUDNN_MAJOR >= 6 - cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; - CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, - rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - rnn_algo, - dtype_)); - #else - CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, - param_.state_size, - param_.num_layers, - dropout_desc_, - input_mode_, - direction_, - mode_, - dtype_)); - #endif - #if CUDNN_MAJOR >= 7 - cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; - if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { - math_type = CUDNN_TENSOR_OP_MATH; - } - #if CUDNN_VERSION >= 7200 - if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && - (DataType::kFlag != kFloat16)) - math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; - #endif - CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); - #endif - #if USE_CUDNN_LSTM_PROJ - if (param_.projection_size.has_value()) { - CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, - rnn_desc_, - param_.projection_size.value(), - 0)); - } - #endif - // Get temp space sizes - CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &workspace_byte_)); - CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, - rnn_desc_, - param_.seq_length_, - x_desc_vec_.data(), - &reserve_space_byte_)); - workspace_size_ = workspace_byte_ / sizeof(DType); - // Allocate the reserve space - reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); - - // Check that number of params are correct - size_t cudnn_param_size; - CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, - rnn_desc_, - x_desc_vec_[0], 
- &cudnn_param_size, - dtype_)); - CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); - - // Set param descriptors - int dim_w[3] = {1, 1, 1}; - dim_w[0] = w.shape_[0]; - CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, - dtype_, - format_, - 3, - dim_w)); - CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, - dtype_, - format_, - 3, - dim_w)); - - // Query weight layout - // cudnnFilterDescriptor_t m_desc; - // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); - // DType *p; - // int n = 2; - // int64_t last = 0; - // if (param_.mode == rnn_enum::kLstm) n = 8; - // else if (param_.mode == rnn_enum::kGru) n = 6; - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, - // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // cudnnDataType_t t; - // cudnnTensorFormat_t f; - // int ndim = 5; - // int dims[5] = {0, 0, 0, 0, 0}; - // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), - // CUDNN_STATUS_SUCCESS); - // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; - // } - // } - - // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { - // for (int j = 0; j < n; ++j) { - // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], - // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); - // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; - // last = ((int64_t)(p - NULL))/sizeof(DType); - // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); - // } - // } - } - } - - cudnnDataType_t dtype_; - bool init_cudnn_; - cudnnRNNDescriptor_t rnn_desc_; - cudnnRNNMode_t mode_; - cudnnDirectionMode_t direction_; - cudnnRNNInputMode_t input_mode_; - cudnnDropoutDescriptor_t dropout_desc_; - Storage::Handle dropout_states_, reserve_space_; - uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) - size_t workspace_byte_, reserve_space_byte_, dropout_byte_; - int workspace_size_, dropout_size_; - std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; - #if USE_CUDNN_LSTM_PROJ - cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; - #endif - cudnnTensorDescriptor_t hx_desc_, cx_desc_; - cudnnTensorDescriptor_t hy_desc_, cy_desc_; - cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; - - cudnnFilterDescriptor_t w_desc_, dw_desc_; - // Allow TensorCore algo policy - bool cudnn_tensor_core_; - - #if CUDNN_MAJOR >= 5 - cudnnTensorFormat_t format_; - #endif - RNNParam param_; -}; -#endif // __CUDACC__ && CUDNN -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/custom/custom-inl.h b/src/operator/custom/custom-inl.h index c5eaea13661e..3bf63b75cfdb 100644 --- a/src/operator/custom/custom-inl.h +++ b/src/operator/custom/custom-inl.h @@ -96,7 +96,12 @@ class CustomOperator { bool prev_recording = Imperative::Get()->set_is_recording(recording); bool prev_training = Imperative::Get()->set_is_training(training); - func(); + try { + func(); + } catch (dmlc::Error& e) { + exception_ = + std::make_shared(std::current_exception()); + } 
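// The try/catch added above implements deferred exception propagation: a
// dmlc::Error raised inside the user's custom-operator callback must not
// escape the worker thread, so the in-flight exception is captured as a
// std::exception_ptr and re-thrown later (see Throw() below) on the thread
// that completes the engine operation. A minimal sketch of the same
// capture-and-rethrow idiom, with illustrative names rather than MXNet's:
//
//   std::shared_ptr<std::exception_ptr> captured;
//   try {
//     user_callback();                  // may throw on the worker thread
//   } catch (...) {
//     captured = std::make_shared<std::exception_ptr>(std::current_exception());
//   }
//   // ... later, on the completing thread:
//   if (captured && *captured) {
//     std::exception_ptr tmp = *captured;
//     captured = nullptr;               // ensure the error is thrown only once
//     std::rethrow_exception(tmp);
//   }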
Imperative::Get()->set_is_training(prev_training); Imperative::Get()->set_is_recording(prev_recording); @@ -116,6 +121,16 @@ class CustomOperator { Engine::Get()->PushSync( [=](RunContext rctx) { + try { + Throw(); + for (const auto& i : arrs) { + Engine::Get()->Throw(i.var()); + } + } catch(dmlc::Error& err) { + ctx.async_on_complete(&err); + return; + } + for (size_t i = 0, out_idx = 0; i < arrs.size(); i++) { if (arrs[i].storage_type() == kDefaultStorage || arrs[i].storage_type() == kUndefinedStorage) @@ -125,14 +140,15 @@ class CustomOperator { out_idx++; } } + ctx.async_on_complete(); }, - ctx.run_ctx.ctx, vars, vars2, FnProperty::kNormal, 0, + ctx.run_ctx.ctx, vars, vars2, FnProperty::kNoSkip, 0, "CustomOperator"); }); // increase num_threads if there is not enough threads to execute custom operator - if (q_.size() > num_free_threads) - CreateThreads(q_.size() - num_free_threads); + if (q_.size() > num_free_threads_) + CreateThreads(q_.size() - num_free_threads_); cv_.notify_all(); } @@ -142,9 +158,10 @@ class CustomOperator { } void Start() { - num_free_threads = 0; + num_free_threads_ = 0; destructing_ = false; naive_engine_ = true; + exception_ = nullptr; if (std::string("NaiveEngine") != dmlc::GetEnv("MXNET_ENGINE_TYPE", std::string())) { naive_engine_ = false; } @@ -162,6 +179,14 @@ class CustomOperator { workers_.clear(); } + inline void Throw() { + if (exception_ && *exception_) { + std::exception_ptr tmp = *exception_; + exception_ = nullptr; + std::rethrow_exception(tmp); + } + } + private: CustomOperator() { this->Start(); @@ -171,21 +196,20 @@ class CustomOperator { while (!q_.empty() || !destructing_) { cv_.wait(lock, [&] {return !q_.empty() || destructing_;}); while (!q_.empty()) { - --num_free_threads; + --num_free_threads_; auto fn = q_.front(); q_.pop(); lock.unlock(); fn(); - ++num_free_threads; + ++num_free_threads_; lock.lock(); } } } void SetNumThreads(int num_threads) { - num_threads = std::min(dmlc::GetEnv("MXNET_CUSTOM_OP_NUM_THREADS", 16), num_threads); for (int i = workers_.size(); i < num_threads; ++i) { workers_.emplace_back(std::thread([this]{this->ThreadTarget();})); - ++num_free_threads; + ++num_free_threads_; } } void CreateThreads(int num_new_threads) { @@ -196,8 +220,9 @@ class CustomOperator { // async worker std::condition_variable cv_; std::vector workers_; - std::atomic num_free_threads; + std::atomic num_free_threads_; std::queue > q_; + std::shared_ptr exception_; bool naive_engine_; bool destructing_; }; diff --git a/src/operator/custom/custom.cc b/src/operator/custom/custom.cc index 46249c9bbcc6..412bfa1bc3aa 100644 --- a/src/operator/custom/custom.cc +++ b/src/operator/custom/custom.cc @@ -128,17 +128,21 @@ bool InferShape(const NodeAttrs& attrs, const CustomParam& params = nnvm::get(attrs.parsed); size_t total = params.num_args + params.num_outs + params.num_auxs; - std::vector shapes(total); + std::vector shapes(total); std::vector ndims(total); size_t buff_size = 0; - for (const auto& i : *in_shape) buff_size += i.ndim(); - std::vector buff(buff_size); - uint32_t *ptr = buff.data(); + for (const auto& i : *in_shape) { + if (i.ndim() > 0) { + buff_size += i.ndim(); + } + } + std::vector buff(buff_size); + int *ptr = buff.data(); for (size_t i = 0; i < in_shape->size(); ++i) { shapes[i] = ptr; ndims[i] = (*in_shape)[i].ndim(); - for (size_t j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { - *ptr = static_cast((*in_shape)[i][j]); + for (int j = 0; j < (*in_shape)[i].ndim(); ++j, ++ptr) { + *ptr = (*in_shape)[i][j]; } } @@ -263,7 +267,7 @@ 
OpStatePtr CreateState(const NodeAttrs& attrs, Context ctx, for (size_t i = 0; i < in_shape.size(); ++i) { shapes[i] = ptr; ndims[i] = in_shape[i].ndim(); - for (size_t j = 0; j < in_shape[i].ndim(); ++j, ++ptr) { + for (int j = 0; j < in_shape[i].ndim(); ++j, ++ptr) { *ptr = static_cast(in_shape[i][j]); } } diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h index c37324678120..aeb189f35b78 100644 --- a/src/operator/image/image_random-inl.h +++ b/src/operator/image/image_random-inl.h @@ -93,7 +93,7 @@ inline bool ToTensorShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape &shp = (*in_attrs)[0]; - if (!shp.ndim()) return false; + if (!shape_is_known(shp)) return false; CHECK((shp.ndim() == 3) || (shp.ndim() == 4)) << "Input image must have shape (height, width, channels), or " @@ -215,16 +215,16 @@ void ToTensorOpForward(const nnvm::NodeAttrs &attrs, } struct NormalizeParam : public dmlc::Parameter { - nnvm::Tuple mean; - nnvm::Tuple std; + mxnet::Tuple mean; + mxnet::Tuple std; DMLC_DECLARE_PARAMETER(NormalizeParam) { DMLC_DECLARE_FIELD(mean) - .set_default(nnvm::Tuple {0.0f, 0.0f, 0.0f, 0.0f}) + .set_default(mxnet::Tuple {0.0f, 0.0f, 0.0f, 0.0f}) .describe("Sequence of means for each channel. " "Default value is 0."); DMLC_DECLARE_FIELD(std) - .set_default(nnvm::Tuple {1.0f, 1.0f, 1.0f, 1.0f}) + .set_default(mxnet::Tuple {1.0f, 1.0f, 1.0f, 1.0f}) .describe("Sequence of standard deviations for each channel. " "Default value is 1."); } @@ -245,7 +245,7 @@ inline bool NormalizeOpShape(const nnvm::NodeAttrs& attrs, << "Input tensor must have shape (channels, height, width), or " << "(N, channels, height, width), but got " << dshape; - uint32_t nchannels; + int nchannels = 0; if (dshape.ndim() == 3) { nchannels = dshape[0]; CHECK(nchannels == 3 || nchannels == 1) @@ -549,7 +549,7 @@ template void FlipImpl(const mxnet::TShape &shape, DType *src, DType *dst) { int head = 1, mid = shape[axis], tail = 1; for (int i = 0; i < axis; ++i) head *= shape[i]; - for (uint32_t i = axis+1; i < shape.ndim(); ++i) tail *= shape[i]; + for (int i = axis+1; i < shape.ndim(); ++i) tail *= shape[i]; for (int i = 0; i < head; ++i) { for (int j = 0; j < (mid >> 1); ++j) { @@ -981,7 +981,7 @@ inline void RandomColorJitter(const nnvm::NodeAttrs &attrs, } struct AdjustLightingParam : public dmlc::Parameter { - nnvm::Tuple alpha; + mxnet::Tuple alpha; DMLC_DECLARE_PARAMETER(AdjustLightingParam) { DMLC_DECLARE_FIELD(alpha) .describe("The lighting alphas for the R, G, B channels."); @@ -997,7 +997,7 @@ struct RandomLightingParam : public dmlc::Parameter { } }; -inline void AdjustLightingImpl(const nnvm::Tuple& alpha, +inline void AdjustLightingImpl(const mxnet::Tuple& alpha, const OpContext &ctx, const std::vector &inputs, const std::vector &req, diff --git a/src/operator/image/resize-inl.h b/src/operator/image/resize-inl.h index de2189838d76..4ebebbfb272c 100644 --- a/src/operator/image/resize-inl.h +++ b/src/operator/image/resize-inl.h @@ -49,12 +49,12 @@ void ResizeImplCUDA(Stream *s, #endif // MXNET_USE_CUDA struct ResizeParam : public dmlc::Parameter { - nnvm::Tuple size; + mxnet::Tuple size; bool keep_ratio; int interp; DMLC_DECLARE_PARAMETER(ResizeParam) { DMLC_DECLARE_FIELD(size) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("Size of new image. 
Could be (width, height) or (size)"); DMLC_DECLARE_FIELD(keep_ratio) .describe("Whether to resize the short edge or both edges to `size`, " diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h index cfdd1064d6fb..7f8638630145 100644 --- a/src/operator/leaky_relu-inl.h +++ b/src/operator/leaky_relu-inl.h @@ -315,7 +315,7 @@ class LeakyReLUOp : public Operator { return a < b ? (a < c ? a : c) : (b < c ? b : c); } static inline mxnet::TShape expand_shape(const mxnet::TShape& src, const mxnet::TShape& dst) { - mxnet::TShape result(dst.ndim()); + mxnet::TShape result(dst.ndim(), -1); int s = src.ndim() - 1; for (int i = dst.ndim() - 1; i >= 0; i--) { if (s >= 0 && i <= 1 && (dst[i] == src[s] || src[s] == 1)) { @@ -355,10 +355,10 @@ class LeakyReLUProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; } const mxnet::TShape &dshape = in_shape->at(leakyrelu::kData); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; if (param_.act_type == leakyrelu::kPReLU) { const mxnet::TShape &gshape = in_shape->at(leakyrelu::kGamma); - if (gshape.ndim() == 0) { + if (!mxnet::ndim_is_known(gshape)) { in_shape->at(leakyrelu::kGamma) = mxnet::TShape(Shape1(dshape[1])); } if (dshape == gshape) { diff --git a/src/operator/loss_binary_op-inl.h b/src/operator/loss_binary_op-inl.h index a3853c56359a..1d71993da515 100644 --- a/src/operator/loss_binary_op-inl.h +++ b/src/operator/loss_binary_op-inl.h @@ -43,7 +43,7 @@ inline bool SoftmaxCrossEntropyShape(const nnvm::NodeAttrs& attrs, << "SoftmaxCrossEntropy only accept 1D label"; CHECK_EQ((*in_attrs)[0][0], (*in_attrs)[1][0]) << "SoftmaxCrossEntropy: data label shape mismatch"; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1)); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); return true; } diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index c27a98ac1940..ab53e7733066 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -45,14 +45,12 @@ namespace mshadow_op { __constant__ const float PI = 3.14159265358979323846; __constant__ const float SELU_ALPHA = 1.6732632423543772848170429916717; __constant__ const float SELU_LAMBDA = 1.0507009873554804934193349852946; -__constant__ const float GELU_CUBIC_CONSTANT = 0.044715; -__constant__ const float GELU_ROOT_2_OVER_PI = 0.7978845608028654; +__constant__ const float SQRT_2 = 1.4142135623730950488016887242096; #else const float PI = 3.14159265358979323846; const float SELU_ALPHA = 1.6732632423543772848170429916717; const float SELU_LAMBDA = 1.0507009873554804934193349852946; -const float GELU_CUBIC_CONSTANT = 0.044715; -const float GELU_ROOT_2_OVER_PI = 0.7978845608028654; +const float SQRT_2 = 1.4142135623730950488016887242096; using std::isnan; #endif using std::enable_if; @@ -131,21 +129,6 @@ MXNET_UNARY_MATH_OP(softsign, a / (1.0f + math::fabs(a))); MXNET_UNARY_MATH_OP(softsign_grad, 1.0f / math::sqr(1.0f + math::fabs(a))); -#define MXNET_GELU_GX(a) \ - a * (DType(1.0f) + DType(GELU_CUBIC_CONSTANT) * a * a) - -#define MXNET_GELU_GX_GRAD(a) \ - (DType(1.0f) + DType(3.0f * GELU_CUBIC_CONSTANT) * a * a) - -#define MXNET_GELU_TANH(a) \ - math::tanh(DType(GELU_ROOT_2_OVER_PI) * MXNET_GELU_GX(a)) - -MXNET_UNARY_MATH_OP(gelu, DType(0.5f) * a * (DType(1.0f) + MXNET_GELU_TANH(a))); - -MXNET_BINARY_MATH_OP_NC(gelu_grad, - b / a + b * (DType(1.0f) - MXNET_GELU_TANH(a)) * - DType(GELU_ROOT_2_OVER_PI) * MXNET_GELU_GX_GRAD(a)); - MXNET_UNARY_MATH_OP_NC(selu, DType(SELU_LAMBDA) * (a > DType(0) ? 
a : DType(math::id(SELU_ALPHA) * math::expm1(a)))); @@ -191,6 +174,13 @@ MXNET_UNARY_MATH_OP(erf_grad, 2.0 / math::sqrt(PI) * math::exp(-(a * a))); MXNET_SIMPLE_UNARY_MATH_OP(erf); +MXNET_UNARY_MATH_OP(gelu, + DType(0.5f * static_cast(a) * (1.0f + math::erf(static_cast(a) / SQRT_2)))); + +MXNET_BINARY_MATH_OP_NC(gelu_grad, + DType(0.5f * (1.0f + math::erf(static_cast(a) / SQRT_2) + + static_cast(a) * erf_grad::Map(static_cast(a) / SQRT_2) / SQRT_2))); + MXNET_SIMPLE_UNARY_MATH_OP(exp); MXNET_SIMPLE_UNARY_MATH_OP(expm1); @@ -355,7 +345,6 @@ MXNET_BINARY_MATH_OP(logical_xor, (a || b) && !(a && b) ? DType(1) : DType(0)); MXNET_UNARY_MATH_OP(square_root, math::sqrt(a)); MXNET_UNARY_MATH_OP(square_root_grad, 0.5f / math::id(a)); - MXNET_UNARY_MATH_OP(reciprocal_square_root, 1.0f / math::sqrt(a)); MXNET_UNARY_MATH_OP(reciprocal_square_root_grad, -0.5f / (math::sqrt(a) * math::id(a))); @@ -945,13 +934,13 @@ struct nanprod { /*! \brief compute l2 norm */ struct nrm2 { /*! \brief do reduction into dst */ - template - MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares, volatile DType src) { // NOLINT(*) + template + MSHADOW_XINLINE static void Reduce(volatile AType& sum_of_squares, volatile DType src) { // NOLINT(*) sum_of_squares += src * src; } /*! \brief do stable reduction into dst */ - template - MSHADOW_XINLINE static void Reduce(volatile DType& sum_of_squares, volatile DType src, volatile DType& scale) { // NOLINT(*) + template + MSHADOW_XINLINE static void Reduce(volatile AType& sum_of_squares, volatile DType src, volatile DType& scale) { // NOLINT(*) if (src != 0) { DType abs = mshadow_op::abs::Map(src); if (scale < abs) { @@ -1012,6 +1001,66 @@ struct nrm2 { } }; +/*! \brief sum reducer */ +struct sum { + /*! \brief do reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile AType& dst, volatile DType src) { // NOLINT(*) + dst += src; + } + /*! \brief do stable reduction into dst */ + template + MSHADOW_XINLINE static void Reduce(volatile AType& dst, volatile DType src, volatile DType& residual) { // NOLINT(*) + DType y = src - residual; + DType t = dst + y; + residual = (t - dst) - y; + dst = t; + } + /*! \brief combine the results of two reducers */ + template + MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& src_val) { // NOLINT(*) + Reduce(dst_val, src_val); + } + /*! \brief combine the results of two reducers */ + template + MSHADOW_XINLINE static void Merge(volatile DType& dst_val, volatile DType& dst_residual, volatile DType& src_val, volatile DType& src_residual) { // NOLINT(*) + DType t1 = dst_val + src_val; + DType e = t1 - dst_val; + DType t2 = ((src_val - e) + (dst_val - (t1 - e))) + dst_residual + src_residual; + dst_val = t1 + t2; + dst_residual = t2 - (dst_val - t1); + } + /*! \brief finalize reduction */ + template + MSHADOW_XINLINE static void Finalize(volatile DType& dst) {} // NOLINT(*) + /*! \brief finalize reduction */ + template + MSHADOW_XINLINE static void Finalize(volatile DType& dst, volatile DType& residual) {} // NOLINT(*) + /*! + *\brief calculate gradient of redres with respect to redsrc, + * redres: reduced result, redsrc: one of reduction element + */ + template + MSHADOW_XINLINE static DType PartialGrad(DType redres, DType redsrc) { + return 1; + } + /*! + *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv) { // NOLINT(*) + initv = 0; + } + /*! 
+ *\brief set the initial value during reduction + */ + template + MSHADOW_XINLINE static void SetInitValue(DType &initv, DType &residual) { // NOLINT(*) + SetInitValue(initv); + residual = 0; + } +}; + struct nanprod_grad : public mxnet_op::tunable { template MSHADOW_XINLINE static DType Map(DType a, DType b) { diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h index d8fc5031e4ff..f17b708a7687 100644 --- a/src/operator/mxnet_op.h +++ b/src/operator/mxnet_op.h @@ -273,20 +273,91 @@ inline int get_num_threads(const int N) { } \ break; \ case mshadow::kUint8: \ - LOG(FATAL) << "This operation only support " \ - "floating point types not uint8"; \ + { \ + typedef uint8_t DType; \ + typedef uint8_t AType; \ + LOG(FATAL) << "This operation only support " \ + "floating point types not uint8"; \ + } \ + break; \ + case mshadow::kInt8: \ + { \ + typedef int8_t DType; \ + typedef int8_t AType; \ + LOG(FATAL) << "This operation only support " \ + "floating point types not int8"; \ + } \ + break; \ + case mshadow::kInt32: \ + { \ + typedef int32_t DType; \ + typedef int32_t AType; \ + LOG(FATAL) << "This operation only support " \ + "floating point types, not int32"; \ + } \ + break; \ + case mshadow::kInt64: \ + { \ + typedef int64_t DType; \ + typedef int64_t AType; \ + LOG(FATAL) << "This operation only support " \ + "floating point types, not int64"; \ + } \ + break; \ + default: \ + LOG(FATAL) << "Unknown type enum " << type; \ + } + +#define MXNET_ACC_TYPE_SWITCH(type, DType, AType, ...)\ + switch (type) { \ + case mshadow::kFloat32: \ + { \ + typedef float DType; \ + typedef double AType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat64: \ + { \ + typedef double DType; \ + typedef double AType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kFloat16: \ + { \ + typedef mshadow::half::half_t DType; \ + typedef float AType; \ + {__VA_ARGS__} \ + } \ + break; \ + case mshadow::kUint8: \ + { \ + typedef uint8_t DType; \ + typedef uint32_t AType; \ + {__VA_ARGS__} \ + } \ break; \ case mshadow::kInt8: \ - LOG(FATAL) << "This operation only support " \ - "floating point types not int8"; \ + { \ + typedef int8_t DType; \ + typedef int32_t AType; \ + {__VA_ARGS__} \ + } \ break; \ case mshadow::kInt32: \ - LOG(FATAL) << "This operation only support " \ - "floating point types, not int32"; \ + { \ + typedef int32_t DType; \ + typedef int64_t AType; \ + {__VA_ARGS__} \ + } \ break; \ case mshadow::kInt64: \ - LOG(FATAL) << "This operation only support " \ - "floating point types, not int64"; \ + { \ + typedef int64_t DType; \ + typedef int64_t AType; \ + {__VA_ARGS__} \ + } \ break; \ default: \ LOG(FATAL) << "Unknown type enum " << type; \ @@ -714,6 +785,7 @@ struct Kernel { /*! \brief Launch GPU kernel */ template inline static void Launch(mshadow::Stream *s, int N, Args... args) { + if (0 == N) return; using namespace mshadow::cuda; int ngrid = std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); mxnet_generic_kernel @@ -724,6 +796,7 @@ struct Kernel { template inline static void LaunchEx(mshadow::Stream *s, const int N, Args... 
args) { + if (0 == N) return; using namespace mshadow::cuda; int ngrid = std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum); mxnet_generic_kernel_ex diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index 511fe455e946..622952cc4bc5 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -332,7 +332,7 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs, const int channelCount = dshape[channelAxis]; - if (dshape.ndim() == 0) { + if (!mxnet::ndim_is_known(dshape)) { return false; } diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc index fa441c45321e..8fb229889332 100644 --- a/src/operator/nn/concat.cc +++ b/src/operator/nn/concat.cc @@ -39,39 +39,40 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, const ConcatParam& param_ = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); mxnet::TShape dshape; - index_t size = 0; - bool has_zero = false; + dim_t size = 0; + bool has_unknown_dim_size = false; int axis = -1; for (int i = 0; i < param_.num_args; ++i) { mxnet::TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - has_zero = tmp[axis] == 0 || has_zero; + has_unknown_dim_size = !mxnet::dim_size_is_known(tmp, axis) || has_unknown_dim_size; size += tmp[axis]; - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } } mxnet::TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } - if (dshape.ndim() == 0) return false; + if (dshape.ndim() == -1) return false; + CHECK_NE(dshape.ndim(), 0) << "zero-dimensional arrays cannot be concatenated"; for (int i = 0; i < param_.num_args; ++i) { CHECK(shape_assign(&(*in_shape)[i], dshape)) << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; } - if (!has_zero) dshape[axis] = size; + if (!has_unknown_dim_size) dshape[axis] = size; CHECK(shape_assign(&(*out_shape)[0], dshape)) << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; - return dshape.Size() != 0; + return shape_is_known(dshape); } // Concat for RNN param deals with the reverse shape inference from output @@ -90,26 +91,27 @@ static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, int axis = -1; for (int i = 0; i < param_.num_args; ++i) { mxnet::TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - if (tmp[axis] == 0) { + if (!mxnet::dim_size_is_known(tmp, axis)) { zero_indices.emplace_back(i); } else { + CHECK_GE(tmp[axis], 0); size += tmp[axis]; } - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } } mxnet::TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; for (int i = 0; i < param_.num_args; ++i) { CHECK(shape_assign(&(*in_shape)[i], dshape)) @@ -119,21 +121,21 @@ static bool RNNParamConcatShape(const nnvm::NodeAttrs& attrs, if (zero_indices.empty()) dshape[axis] = size; CHECK(shape_assign(&(*out_shape)[0], dshape)) << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; - if ((*out_shape)[0][axis] != 0 && !zero_indices.empty()) { + if ((*out_shape)[0][axis] != -1 && !zero_indices.empty()) { int 
residual = (*out_shape)[0][axis] - size; CHECK_GE(residual, 0) << "Input size already exceeds output size. Residual: " << residual; - CHECK(zero_indices.size() <= 2 && zero_indices.size() >= 0) + CHECK(zero_indices.size() <= 2 && zero_indices.size() > 0) << "Expecting 1 or 2 inputs that need shape inference. Got: " << zero_indices.size(); - bool need_infer = !(*out_shape)[0].Size(); + bool need_infer = !shape_is_known((*out_shape)[0]); for (int i : zero_indices) { (*in_shape)[i][axis] = residual / zero_indices.size(); - need_infer = need_infer || !(*in_shape)[i].Size(); + need_infer = need_infer || !shape_is_known((*in_shape)[i]); } return !need_infer; } - return dshape.Size() != 0; + return shape_is_known(dshape); } static bool ConcatType(const nnvm::NodeAttrs& attrs, @@ -232,9 +234,10 @@ bool SupportMKLDNNConcat(const std::vector &arrs) { for (auto &arr : arrs) { if (arr.IsView()) return false; if (arr.dtype() != mshadow::kFloat32) return false; - unsigned ndim = arr.shape().ndim(); - unsigned mkldnn_ndims = - static_cast(arr.GetMKLDNNData()->get_primitive_desc().desc().data.ndims); + // DO not support zero-size tensors. + if (arr.shape().Size() == 0) return false; + int ndim = arr.shape().ndim(); + const int mkldnn_ndims = arr.GetMKLDNNData()->get_primitive_desc().desc().data.ndims; if (!(ndim == 2 || ndim == 4) || ndim != mkldnn_ndims) return false; } return true; diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 7ae34ae363b4..7d5f7c7d5757 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -69,11 +69,11 @@ struct ConvolutionParam : public dmlc::Parameter { dmlc::optional layout; DMLC_DECLARE_PARAMETER(ConvolutionParam) { DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (w,), (h, w) or (d, h, w)"); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, 0)) .describe("Convolution stride: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, 0)) .describe("Convolution dilate: (w,), (h, w) or (d, h, w). Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, 0)) .describe("Zero pad for convolution: (w,), (h, w) or (d, h, w). 
Defaults to no padding."); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("Convolution filter(channel) number"); @@ -209,9 +209,9 @@ class ConvolutionOp { Tensor workspace = ctx.requested[conv::kTempSpace] .get_space_typed(Shape1(col_buffer_size_), s); // calculate the shape of col_buffer - mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1); + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_data[0].shape_[i+1]; } // create a column buffer using workspace and col_buffer_shape @@ -295,9 +295,9 @@ class ConvolutionOp { Tensor workspace = ctx.requested[conv::kTempSpace] .get_space_typed(Shape1(col_buffer_size_), s); // calculate the shape of col_buffer - mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1); + mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, 1); col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size(); - for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) { + for (int i = 1; i < col_buffer_shape.ndim(); ++i) { col_buffer_shape[i] = out_grad[conv::kData].shape_[i+1]; } // create a column buffer using workspace and col_buffer_shape @@ -342,10 +342,10 @@ class ConvolutionOp { void LayerSetUp(const mxnet::TShape& ishape, const mxnet::TShape& oshape) { channel_axis_ = 1; // hard code channel axis const index_t first_spatial_axis = channel_axis_ + 1; - const index_t num_axes = param_.kernel.ndim() + 2; + const int num_axes = param_.kernel.ndim() + 2; num_spatial_axes_ = num_axes - first_spatial_axis; is_1x1_ = true; - for (index_t i = 0; i < param_.kernel.ndim(); ++i) { + for (int i = 0; i < param_.kernel.ndim(); ++i) { is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0; if (!is_1x1_) break; } diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 527a0073930f..536e9a731171 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -96,24 +96,28 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; out_shape->resize(1, mxnet::TShape()); const mxnet::TShape &dshp = (*in_shape)[conv::kData]; - if (dshp.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshp)) return false; if (param_.kernel.ndim() == 1) { // 1d conv CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, + mxnet::dim_size_is_known(dshape, 1) ? 
dshape[1] / param_.num_group : -1, param_.kernel[0]); wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; + if (wshape[0] >= 0) { + wshape[0] *= param_.num_group; + } SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); } const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; + if (dshape[1] != -1) { + CHECK_EQ(dshape[1] % param_.num_group, 0U) << "input num_filter must divide group size"; + } CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ << "output num_filter must divide group size"; CHECK_GT(param_.kernel.Size(), 0U) \ @@ -125,21 +129,21 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<3> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + oshape[2] = dshape[2] != -1 ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. // 2) We can back-calculate the input height/width if the corresponding stride is 1. oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { + if (oshape[2] != -1 && param_.stride[0] == 1) { dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; } SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, ConvertLayout(dshape, kNCW, param_.layout.value())); // Check whether the kernel sizes are valid - if (dshape[2] != 0) { + if (dshape[2] != -1) { CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; } return true; @@ -149,10 +153,12 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, << "Input data should be 4D in batch-num_filter-y-x"; Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, - dshape[1] / param_.num_group, + mxnet::dim_size_is_known(dshape, 1) ? dshape[1] / param_.num_group : -1, param_.kernel[0], param_.kernel[1]); wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; + if (wshape[0] >= 0) { + wshape[0] *= param_.num_group; + } SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); @@ -160,8 +166,9 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, const index_t dilated_ksize_y = param_.DilatedKernelSize(0); const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; + if (dshape[1] != -1) { + CHECK_EQ(dshape[1] % param_.num_group, 0U) << "input num_filter must divide group size"; + } CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ << "output num_filter must divide group size"; CHECK_GT(param_.kernel.Size(), 0U) \ @@ -173,29 +180,29 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<4> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? 
- (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; + oshape[2] = dshape[2] != -1 ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : -1; + oshape[3] = dshape[3] != -1 ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. // 2) We can back-calculate the input height/width if the corresponding stride is 1. oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { + if (oshape[2] != -1 && param_.stride[0] == 1) { dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; } - if (oshape[3] && param_.stride[1] == 1) { + if (oshape[3] != -1 && param_.stride[1] == 1) { dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; } SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, ConvertLayout(dshape, kNCHW, param_.layout.value())); // Check whether the kernel sizes are valid - if (dshape[2] != 0) { + if (dshape[2] != -1) { CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; } - if (dshape[3] != 0) { + if (dshape[3] != -1) { CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; } return true; @@ -204,10 +211,13 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(dshp.ndim(), 5U) \ << "Input data should be 5D in batch-num_filter-depth-y-x"; Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, + mxnet::dim_size_is_known(dshape, 1) ? dshape[1] / param_.num_group : -1, param_.kernel[0], param_.kernel[1], param_.kernel[2]); wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - wshape[0] *= param_.num_group; + if (wshape[0] >= 0) { + wshape[0] *= param_.num_group; + } SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); @@ -218,8 +228,9 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, const index_t dilated_ksize_d = param_.DilatedKernelSize(0); const index_t dilated_ksize_y = param_.DilatedKernelSize(1); const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - CHECK_EQ(dshape[1] % param_.num_group, 0U) - << "input num_filter must divide group size"; + if (dshape[1] >= 0) { + CHECK_EQ(dshape[1] % param_.num_group, 0U) << "input num_filter must divide group size"; + } CHECK_EQ(param_.num_filter % param_.num_group, 0U) << "output num_filter must divide group size"; CHECK_GT(param_.kernel.Size(), 0U) \ @@ -233,37 +244,37 @@ static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, Shape<5> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; - oshape[4] = dshape[4] ? - (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; + oshape[2] = dshape[2] != -1 ? 
+ (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : -1; + oshape[3] = dshape[3] != -1 ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : -1; + oshape[4] = dshape[4] != -1 ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : -1; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { + if (oshape[2] != -1 && param_.stride[0] == 1) { dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; } - if (oshape[3] && param_.stride[1] == 1) { + if (oshape[3] != -1 && param_.stride[1] == 1) { dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; } - if (oshape[4] && param_.stride[2] == 1) { + if (oshape[4] != -1 && param_.stride[2] == 1) { dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; } SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, ConvertLayout(dshape, kNCDHW, param_.layout.value())); // Check whether the kernel sizes are valid - if (dshape[2] != 0) { + if (dshape[2] != -1) { CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; } - if (dshape[3] != 0) { + if (dshape[3] != -1) { CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; } - if (dshape[4] != 0) { + if (dshape[4] != -1) { CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; } return true; diff --git a/src/operator/nn/ctc_loss-inl.h b/src/operator/nn/ctc_loss-inl.h index 357888dc30f1..8c841dfc24b4 100644 --- a/src/operator/nn/ctc_loss-inl.h +++ b/src/operator/nn/ctc_loss-inl.h @@ -239,7 +239,7 @@ inline bool CTCLossOpShape(const nnvm::NodeAttrs &attrs, "the maximum sequence length of the " "data."; - mxnet::TShape oshape(1); + mxnet::TShape oshape(1, -1); oshape[0] = dshape[1]; // batch size SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); // forward output SHAPE_ASSIGN_CHECK(*out_attrs, 1, dshape); // grad output diff --git a/src/operator/nn/cudnn/cudnn_algoreg-inl.h b/src/operator/nn/cudnn/cudnn_algoreg-inl.h index cef9d6f86940..3f2d24c5bf7e 100644 --- a/src/operator/nn/cudnn/cudnn_algoreg-inl.h +++ b/src/operator/nn/cudnn/cudnn_algoreg-inl.h @@ -96,7 +96,7 @@ class CuDNNAlgoReg { if (param.cudnn_tune.value() && reg_.size() % 50 == 0) { LOG(INFO) << "Running performance tests to find the best convolution " "algorithm, " - "this can take a while... (setting env variable " + "this can take a while... (set the environment variable " "MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)"; if (reg_.size() >= 1000) { // Many people are very concerned about this warning, so change the warning once. 
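The convolution shape-inference hunks above switch the sentinel for an unknown dimension from 0 to -1 and guard every arithmetic step behind that check: an output extent is derived only when the input extent it depends on is known, otherwise -1 is propagated so a later inference pass can fill it in. A minimal self-contained sketch of the rule (ConvOutDim is an illustrative helper; AddPad mirrors the symmetric-padding helper called in convolution.cc):

    #include <cstdio>

    inline int AddPad(int dsize, int pad) { return dsize + 2 * pad; }

    // Mirrors the patched pattern:
    //   oshape[i] = dshape[i] != -1 ?
    //       (AddPad(dshape[i], pad) - dilated_ksize) / stride + 1 : -1;
    int ConvOutDim(int in_dim, int pad, int dilated_ksize, int stride) {
      return in_dim != -1 ? (AddPad(in_dim, pad) - dilated_ksize) / stride + 1
                          : -1;  // unknown stays unknown
    }

    int main() {
      std::printf("%d\n", ConvOutDim(224, 1, 3, 1));  // known input  -> 224
      std::printf("%d\n", ConvOutDim(-1, 1, 3, 1));   // unknown (-1) -> -1
      return 0;
    }

The same -1 propagation appears in the concat and deconvolution hunks, where dim_size_is_known()/ndim_is_known() replace the old comparisons against 0.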
diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc index 5632028dd769..cb35ce170e8e 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cc +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc @@ -37,7 +37,7 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_ using namespace mshadow; CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; in_shape->at(1) = mxnet::TShape(Shape1(dshape[1])); in_shape->at(2) = mxnet::TShape(Shape1(dshape[1])); in_shape->at(3) = mxnet::TShape(Shape1(dshape[1])); diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 55b263896339..679e0cd1057b 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -1015,9 +1015,9 @@ class CuDNNConvolutionOp { // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1} template inline Shape Strides(const mxnet::TShape &s) { - uint32_t ndim = s.ndim(); - mxnet::TShape strides(ndim); - for (uint32_t i = 0; i != ndim; ++i) + int ndim = s.ndim(); + mxnet::TShape strides(ndim, -1); + for (int i = 0; i != ndim; ++i) strides[i] = s.ProdShape(i+1, ndim); return strides.get(); } diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 47f688c8ab9c..adb6caf1c028 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -933,9 +933,9 @@ class CuDNNDeconvolutionOp { // e.g. {shape[0], shape[1], shape[2]} -> {shape[1]*shape[2], shape[2], 1} template inline Shape Strides(const mxnet::TShape &s) { - uint32_t ndim = s.ndim(); - mxnet::TShape strides(ndim); - for (uint32_t i = 0; i != ndim; ++i) + int ndim = s.ndim(); + mxnet::TShape strides(ndim, -1); + for (int i = 0; i != ndim; ++i) strides[i] = s.ProdShape(i+1, ndim); return strides.get(); } diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index 5248c1211ac7..58f9be702396 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -65,13 +65,13 @@ struct DeconvolutionParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(DeconvolutionParam) { DMLC_DECLARE_FIELD(kernel).describe("Deconvolution kernel size: (w,), (h, w) or (d, h, w). " "This is same as the kernel size used for the corresponding convolution"); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, 0)) .describe("The stride used for the corresponding convolution: (w,), (h, w) or (d, h, w). " "Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, 0)) .describe("Dilation factor for each dimension of the input: (w,), (h, w) or (d, h, w). " "Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, 0)) .describe("The amount of implicit zero padding added during convolution for each " "dimension of the input: " "(w,), (h, w) or (d, h, w). " @@ -79,11 +79,11 @@ struct DeconvolutionParam : public dmlc::Parameter { "If `target_shape` is set, " "`pad` will be ignored and a padding that will generate the target shape " "will be used. 
Defaults to no padding."); - DMLC_DECLARE_FIELD(adj).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(adj).set_default(mxnet::TShape(0, 0)) .describe("Adjustment for output shape: (w,), (h, w) or (d, h, w). " "If `target_shape` is set, " "`adj` will be ignored and computed accordingly."); - DMLC_DECLARE_FIELD(target_shape).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(target_shape).set_default(mxnet::TShape(0, 0)) .describe("Shape of the output tensor: (w,), (h, w) or (d, h, w)."); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("Number of output filters."); @@ -134,16 +134,18 @@ struct DeconvolutionParam : public dmlc::Parameter { for (size_t i = 0; i < ndim; i++) { // input.ndim() can be larger than ndim, in case that the complete input // shape was passed and not only the ndim last ones - o_pad[i] = stride[i] * (input[(input_ndim - ndim) + i] - 1) + DilatedKernelSize(i); - CHECK_GE(o_pad[i], target_shape[i]) << "too big target shape"; - o_pad[i] -= target_shape[i]; - o_adj[i] = o_pad[i] % 2; - o_pad[i] = (o_pad[i] + 1) / 2; + if (mxnet::dim_size_is_known(input, input_ndim - ndim + i)) { + o_pad[i] = stride[i] * (input[(input_ndim - ndim) + i] - 1) + DilatedKernelSize(i); + CHECK_GE(o_pad[i], target_shape[i]) << "too big target shape"; + o_pad[i] -= target_shape[i]; + o_adj[i] = o_pad[i] % 2; + o_pad[i] = (o_pad[i] + 1) / 2; + } } } else { - for (size_t i = 0; i < ndim; i++) { - o_pad[i] = pad[i]; - o_adj[i] = adj[i]; + for (int i = 0; i < static_cast(ndim); i++) { + o_pad[i] = i < pad.ndim() ? pad[i] : 0; + o_adj[i] = i < adj.ndim() ? adj[i] : 0; } } } @@ -460,7 +462,7 @@ class DeconvolutionOp { oshape[2] * oshape[3]); // See convolution for workspace calculations. nstep_ will be the effective batch size nstep_ = std::max( - std::min(static_cast(param_.workspace) / + std::min(param_.workspace / (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]), 1); diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 27928b9b41c3..09b255d009e0 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -54,7 +54,7 @@ static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, } out_shape->resize(1, mxnet::TShape()); const mxnet::TShape &dshape = (*in_shape)[deconv::kData]; - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; if (param_.kernel.ndim() == 1) { // 1d conv @@ -90,8 +90,12 @@ static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, Shape<3> oshape; oshape[0] = dshape_ncw[0]; oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + - dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; + if (mxnet::dim_size_is_known(dshape_ncw[2])) { + oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + + dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; + } else { + oshape[2] = -1; + } if (param_.target_shape.ndim() > 0) { if (param_.target_shape[0] > 0) { @@ -141,10 +145,18 @@ static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, Shape<4> oshape; oshape[0] = dshape_nchw[0]; oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + - dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + - dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; + if (mxnet::dim_size_is_known(dshape_nchw[2])) { + oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + + dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; + } else { + oshape[2] = -1; + } + if (mxnet::dim_size_is_known(dshape_nchw[3])) { + oshape[3] = 
param_.stride[1] * (dshape_nchw[3] - 1) + + dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; + } else { + oshape[3] = -1; + } if (param_.target_shape.ndim() > 1) { if (param_.target_shape[0] > 0) { @@ -203,12 +215,24 @@ static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, Shape<5> oshape; oshape[0] = dshape_ncdhw[0]; oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + - dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + - dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; - oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + - dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; + if (mxnet::dim_size_is_known(dshape_ncdhw[2])) { + oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + + dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; + } else { + oshape[2] = -1; + } + if (mxnet::dim_size_is_known(dshape_ncdhw[3])) { + oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + + dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; + } else { + oshape[3] = -1; + } + if (mxnet::dim_size_is_known(dshape_ncdhw[4])) { + oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + + dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; + } else { + oshape[4] = -1; + } if (param_.target_shape.ndim() > 2) { if (param_.target_shape[0] > 0) { diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 01611dfce191..a34d2992c8c6 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -78,7 +78,7 @@ struct DropoutParam : public dmlc::Parameter { .add_enum("always", dropout::kAlways) .set_default(dropout::kTraining) .describe("Whether to only turn on dropout during training or to also turn on for inference."); - DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape(0, 0)) .describe("Axes for variational dropout kernel."); DMLC_DECLARE_FIELD(cudnn_off).set_default(dmlc::optional(false)) .describe("Whether to turn off cudnn in dropout operator. 
" diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index 5fdc672d766e..afad6fd5cc80 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -95,10 +95,10 @@ Example:: CHECK_EQ(in_shape->size(), 1U); const DropoutParam& param = nnvm::get(attrs.parsed); mxnet::TShape dshape(in_shape->at(0)); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; out_shape->clear(); out_shape->push_back(dshape); - for (index_t i = 0; i < param.axes.ndim(); ++i) { + for (int i = 0; i < param.axes.ndim(); ++i) { dshape[param.axes[i]] = 1; } out_shape->push_back(dshape); diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 2bc321832af6..a097357ef5a3 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -52,7 +52,7 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, mxnet::TShape dshape = (*in_shape)[fullc::kData]; mxnet::TShape oshape = (*out_shape)[0]; // require data to be known - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; index_t num_input; if (!param.flatten) { @@ -75,7 +75,7 @@ static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, } else { SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); } - if (oshape.ndim() != 0) { + if (oshape.ndim() > 0) { dshape[0] = oshape[0]; SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); } diff --git a/src/operator/nn/im2col.h b/src/operator/nn/im2col.h index 0059a420726d..06a4e1b75b33 100644 --- a/src/operator/nn/im2col.h +++ b/src/operator/nn/im2col.h @@ -152,7 +152,7 @@ inline void im2col_nd_core_cpu(const DType* data_input, const bool im2col, const mxnet::TShape& kernel_shape, const mxnet::TShape& pad, const mxnet::TShape& stride, const mxnet::TShape& dilation, DType* data_output, OpReqType req = mxnet::kWriteTo) { if (mxnet::kNullOp == req) return; - index_t num_spatial_axes = kernel_shape.ndim(); + int num_spatial_axes = kernel_shape.ndim(); if (!im2col) { index_t im_size = im_shape[1]; // skip batch dim for (index_t i = 0; i < num_spatial_axes; ++i) { @@ -319,7 +319,7 @@ inline void col2im(mshadow::Stream* s, const mxnet::TShape& col_shape, const mxnet::TShape& kernel_shape, const mxnet::TShape& pad, const mxnet::TShape& stride, const mxnet::TShape& dilation, DType* data_im, OpReqType req) { - index_t num_spatial_axes = kernel_shape.ndim(); + int num_spatial_axes = kernel_shape.ndim(); if (2 == num_spatial_axes) { col2im_cpu(data_col, im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], diff --git a/src/operator/nn/layer_norm-inl.h b/src/operator/nn/layer_norm-inl.h index dc4914bf2457..c7de7d734521 100644 --- a/src/operator/nn/layer_norm-inl.h +++ b/src/operator/nn/layer_norm-inl.h @@ -167,7 +167,7 @@ void LayerNormGradCompute(const nnvm::NodeAttrs& attrs, const LayerNormParam& param = nnvm::get(attrs.parsed); int axis = param.axis; if (axis < 0) { - axis += static_cast(inputs[0].ndim()); + axis += inputs[0].ndim(); } CHECK(axis >= 0 && axis < inputs[0].ndim()) << "Channel axis out of range: " << param.axis; Stream *s = ctx.get_stream(); diff --git a/src/operator/nn/layer_norm.cc b/src/operator/nn/layer_norm.cc index d4c308398cb7..2e47503a3318 100644 --- a/src/operator/nn/layer_norm.cc +++ b/src/operator/nn/layer_norm.cc @@ -41,14 +41,14 @@ static bool LayerNormShape(const nnvm::NodeAttrs& attrs, const mxnet::TShape &dshape = in_shape->at(layernorm::kData); int axis = param.axis; if (axis < 0) { - 
axis += static_cast(dshape.ndim()); + axis += dshape.ndim(); } - CHECK(axis >= 0 && axis < static_cast(dshape.ndim())) + CHECK(axis >= 0 && axis < dshape.ndim()) << "Channel axis out of range: axis=" << param.axis; const int channelCount = dshape[axis]; - if (dshape.ndim() == 0) { + if (!mxnet::ndim_is_known(dshape)) { return false; } diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc index 410bdab667e5..b632e35b57fe 100644 --- a/src/operator/nn/lrn.cc +++ b/src/operator/nn/lrn.cc @@ -40,7 +40,7 @@ bool LRNShape(const nnvm::NodeAttrs& attrs, using namespace mshadow; CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; out_shape->clear(); out_shape->push_back(dshape); out_shape->push_back(dshape); diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h new file mode 100644 index 000000000000..6bf30e3f3bbe --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file mkldnn_act-inl.h + * \brief MKLDNN(Quantized) Activation operator based on subgraph + * /author Zhiyuan Huang +*/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ + + +#if MXNET_USE_MKLDNN == 1 +#include +#include +#include "../activation-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param); +mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( + const ActivationParam& param, bool is_train, + const mkldnn::memory &input_mem, int dtype); + +class MKLDNNActForward { + public: + const mkldnn::eltwise_forward::primitive_desc fwd_pd; + + MKLDNNActForward(const ActivationParam& param, bool is_train, + const NDArray &data, const mkldnn::memory &mem): fwd_pd( + GetActFwdDescImpl(param, is_train, mem, data.dtype())) {} + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output); + const mkldnn::eltwise_forward &GetFwd() const; + + private: + std::shared_ptr fwd_; + std::shared_ptr data_; + std::shared_ptr out_; +}; + +typedef ParamOpSign MKLDNNActSignature; +MKLDNNActForward &GetActForward(const ActivationParam& param, + const OpContext &ctx, const NDArray &in_data, + const mkldnn::memory &in_mem); + +void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc index 8c64888b4608..9ce27fad4b19 100644 --- a/src/operator/nn/mkldnn/mkldnn_act.cc +++ b/src/operator/nn/mkldnn/mkldnn_act.cc @@ -32,8 +32,7 @@ #include #include #include "../../operator_common.h" -#include "../activation-inl.h" -#include "./mkldnn_base-inl.h" +#include "mkldnn_act-inl.h" #if MXNET_USE_MKLDNN == 1 @@ -58,7 +57,7 @@ bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) { return SupportMKLDNNAct(param); } -static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { +mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { switch (param.act_type) { case activation::kReLU: return mkldnn::algorithm::eltwise_relu; @@ -74,9 +73,7 @@ static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { } } -typedef std::shared_ptr mkldnn_act_pdesc_ptr; - -static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( +mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( const ActivationParam& param, bool is_train, const mkldnn::memory &input_mem, int dtype) { mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); @@ -84,65 +81,41 @@ static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( auto cpu_engine = data_mpd.get_engine(); auto alg = GetMKLDNNActAlgo(param); - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - DType alpha = 0; - mkldnn::eltwise_forward::desc desc = is_train - ? 
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 8c64888b4608..9ce27fad4b19 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -32,8 +32,7 @@
 #include
 #include
 #include "../../operator_common.h"
-#include "../activation-inl.h"
-#include "./mkldnn_base-inl.h"
+#include "mkldnn_act-inl.h"
 
 #if MXNET_USE_MKLDNN == 1
@@ -58,7 +57,7 @@ bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
   return SupportMKLDNNAct(param);
 }
 
-static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
+mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
   switch (param.act_type) {
     case activation::kReLU:
       return mkldnn::algorithm::eltwise_relu;
@@ -74,9 +73,7 @@ static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) {
   }
 }
 
-typedef std::shared_ptr<mkldnn::eltwise_forward::primitive_desc> mkldnn_act_pdesc_ptr;
-
-static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
     const ActivationParam& param, bool is_train,
     const mkldnn::memory &input_mem, int dtype) {
   mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc();
@@ -84,65 +81,41 @@ static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
   auto cpu_engine = data_mpd.get_engine();
   auto alg = GetMKLDNNActAlgo(param);
 
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    DType alpha = 0;
-    mkldnn::eltwise_forward::desc desc = is_train
-        ? mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training,
-                                        alg, data_md, alpha)
-        : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring,
-                                        alg, data_md, alpha);
-    return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
-  });
-  LOG(FATAL) << "Unsupported data type for MKLDNN activation";
-  mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc(
-      mkldnn::prop_kind::forward_training, alg, data_md, 0.0);
+
+  auto prop = is_train ? mkldnn::prop_kind::forward_training :
+                         mkldnn::prop_kind::forward_scoring;
+  auto desc = mkldnn::eltwise_forward::desc(prop, alg, data_md, 0.0f);
   return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
 }
 
-typedef ParamOpSign<ActivationParam> MKLDNNActSignature;
-
-class MKLDNNActForward {
-  std::shared_ptr<mkldnn::eltwise_forward> fwd;
-  std::shared_ptr<mkldnn::memory> data;
-  std::shared_ptr<mkldnn::memory> out;
-
- public:
-  const mkldnn::eltwise_forward::primitive_desc fwd_pd;
-
-  MKLDNNActForward(const ActivationParam& param, bool is_train,
-                   const NDArray &data, const mkldnn::memory &mem): fwd_pd(
-                       GetActFwdDescImpl(param, is_train, mem, data.dtype())) {
-  }
-
-  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
-    if (this->data == nullptr)
-      this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-          data.get_primitive_desc(), data.get_data_handle()));
-    else
-      this->data->set_data_handle(data.get_data_handle());
-
-    CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc());
-    if (this->out == nullptr)
-      this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
-          fwd_pd.dst_primitive_desc(), output.get_data_handle()));
-    else
-      this->out->set_data_handle(output.get_data_handle());
-
-    if (this->fwd == nullptr) {
-      this->fwd = std::shared_ptr<mkldnn::eltwise_forward>(
-          new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data),
-                                      *this->out));
-    }
+void MKLDNNActForward::SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
+  if (this->data_ == nullptr)
+    this->data_ = std::make_shared<mkldnn::memory>(data.get_primitive_desc(),
+                                                   data.get_data_handle());
+  else
+    this->data_->set_data_handle(data.get_data_handle());
+
+  CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc());
+  if (this->out_ == nullptr)
+    this->out_ = std::make_shared<mkldnn::memory>(fwd_pd.dst_primitive_desc(),
+                                                  output.get_data_handle());
+  else
+    this->out_->set_data_handle(output.get_data_handle());
+
+  if (this->fwd_ == nullptr) {
+    this->fwd_ = std::shared_ptr<mkldnn::eltwise_forward>(
+        new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data_),
+                                    *this->out_));
   }
+}
 
-  const mkldnn::eltwise_forward &GetFwd() const {
-    return *fwd;
-  }
-};
+const mkldnn::eltwise_forward &MKLDNNActForward::GetFwd() const {
+  return *fwd_;
+}
 
-static MKLDNNActForward &GetActForward(const ActivationParam& param,
-                                       const OpContext &ctx, const NDArray &in_data,
-                                       const mkldnn::memory &in_mem) {
+MKLDNNActForward &GetActForward(const ActivationParam& param,
+                                const OpContext &ctx, const NDArray &in_data,
+                                const mkldnn::memory &in_mem) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
 #else
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index a460e33fa548..3da3f23d7683 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -464,7 +464,7 @@ mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc p
                                                 mkldnn_memory_format_t format);
 
 inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int ndims) {
-  if (shape.ndim() != (size_t)ndims)
+  if (shape.ndim() != ndims)
     return false;
   for
(int i = 0; i < ndims; i++) if (shape[i] != dims[i]) diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc index 8e2b57781a18..7b266efc2a14 100644 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -92,13 +92,13 @@ void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, auto gz_mem = inputs[0].GetMKLDNNData(); mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); /* init the offset */ - mkldnn::memory::dims offsets = {0, 0, 0, 0}; + mkldnn::memory::dims offsets(outputs[0].shape().ndim()); + for (auto &v : offsets) { + v = 0; + } + for (int i = 0; i < num_in_data; i++) { - mkldnn::memory::dims diff_src_tz - = {static_cast(outputs[i].shape()[0]), - static_cast(outputs[i].shape()[1]), - static_cast(outputs[i].shape()[2]), - static_cast(outputs[i].shape()[3])}; + mkldnn::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end()); auto diff_src_mpd = outputs[i].GetMKLDNNData()->get_primitive_desc(); auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); // create view from gy to gxs[i] diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc index 3f3d82020598..2a817a25a5b8 100644 --- a/src/operator/nn/mkldnn/mkldnn_slice.cc +++ b/src/operator/nn/mkldnn/mkldnn_slice.cc @@ -37,12 +37,12 @@ MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam ¶m, const NDArray &out) { const mxnet::TShape ishape = in.shape(); const mxnet::TShape oshape = out.shape(); - uint32_t N = ishape.ndim(); + const int N = ishape.ndim(); mkldnn::memory::dims dims(N); mkldnn::memory::dims offsets(N); - for (uint32_t i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { int s = 0; - if (param.begin[i]) { + if (i < param.begin.ndim() && param.begin[i]) { s = *param.begin[i]; if (s < 0) s += ishape[i]; } diff --git a/src/operator/nn/mkldnn/mkldnn_transpose.cc b/src/operator/nn/mkldnn/mkldnn_transpose.cc index 0986d0616f75..eec19bababb7 100644 --- a/src/operator/nn/mkldnn/mkldnn_transpose.cc +++ b/src/operator/nn/mkldnn/mkldnn_transpose.cc @@ -55,9 +55,9 @@ class MKLDNNTransposeForward { auto shape = data.shape(); auto data_ndim = shape.ndim(); auto axes_ndim = param.axes.ndim(); - auto axes = mxnet::TShape(data_ndim); + auto axes = mxnet::TShape(data_ndim, -1); if (axes_ndim == 0) { - for (size_t i = 0; i < data_ndim; i++) { + for (int i = 0; i < data_ndim; i++) { axes[i] = data_ndim - i - 1; } } else { @@ -79,7 +79,7 @@ class MKLDNNTransposeForward { dst_fmt.data_type = mkldnn_f32; dst_fmt.format = mkldnn_blocked; - for (size_t i = 0; i < data_ndim; i++) + for (int i = 0; i < data_ndim; i++) dst_fmt.dims[i] = shape[i]; unsigned int total_stride = 1; diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 9e1e73bf19e2..03f0fa8edd6c 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -55,7 +55,7 @@ struct PoolingParam : public dmlc::Parameter { dmlc::optional count_include_pad; dmlc::optional layout; DMLC_DECLARE_PARAMETER(PoolingParam) { - DMLC_DECLARE_FIELD(kernel).set_default(mxnet::TShape()) // add default value here + DMLC_DECLARE_FIELD(kernel).set_default(mxnet::TShape(0, 0)) // add default value here .enforce_nonzero() .describe("Pooling kernel size: (y, x) or (d, y, x)"); @@ -78,11 +78,11 @@ struct PoolingParam : public dmlc::Parameter { .add_enum("same", pool_enum::kSame) .describe("Pooling convention to be applied."); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + 
DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, 0)) .enforce_nonzero() .describe("Stride: for pooling (y, x) or (d, y, x). Defaults to 1 for each dimension."); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, 0)) .describe("Pad for pooling: (y, x) or (d, y, x). Defaults to no padding."); DMLC_DECLARE_FIELD(p_value).set_default(dmlc::optional()) @@ -200,11 +200,11 @@ class PoolingOp { kernel = mxnet::TShape(ishape.data() + 2, ishape.data() + ishape.ndim()); } - padding = mxnet::TShape(ishape.ndim() - 2); + padding = mxnet::TShape(ishape.ndim() - 2, 0); for (index_t i = 0; i < ishape.ndim() - 2; i++) { padding[i] = 0; } - stride = mxnet::TShape(ishape.ndim() - 2); + stride = mxnet::TShape(ishape.ndim() - 2, 1); } const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ? param_.p_value.value() : 1; @@ -257,11 +257,11 @@ class PoolingOp { kernel = mxnet::TShape(ishape.data() + 2, ishape.data() + ishape.ndim()); } - padding = mxnet::TShape(ishape.ndim() - 2); + padding = mxnet::TShape(ishape.ndim() - 2, 0); for (index_t i = 0; i < ishape.ndim() - 2; i++) { padding[i] = 0; } - stride = mxnet::TShape(ishape.ndim() - 2); + stride = mxnet::TShape(ishape.ndim() - 2, 1); } const int p_value = (param_.pool_type == pool_enum::kLpPooling && param_.p_value.has_value()) ? diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 2d16604baa20..3e081c9a0552 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -114,11 +114,11 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, << "Pooling: Input data should be 3D in (batch, channel, x)" << " Or 4D in (batch, channel, y, x) " << " Or 5D in (batch, channel, d, y, x)"; - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; int layout = param.GetLayout(dshape.ndim()); if (param.global_pool) { mxnet::TShape oshape = dshape; - size_t c_index = 0; + int c_index = 0; switch (layout) { case mshadow::kNCW: case mshadow::kNCHW: @@ -133,7 +133,7 @@ static bool PoolingShape(const nnvm::NodeAttrs &attrs, default: LOG(FATAL) << "Unsupported tensor layout " << param.layout.value(); } - for (size_t i{1}; i < dshape.ndim(); i++) + for (int i = 1; i < dshape.ndim(); i++) if (i != c_index) oshape[i] = 1; out_shape->clear(); diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index 662ba78cd84a..8219e3e9bd8d 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -59,7 +59,9 @@ struct UpSamplingParam : public dmlc::Parameter { .set_range(1, 1000) .describe("Up sampling scale"); DMLC_DECLARE_FIELD(num_filter) - .describe("Input filter. Only used by bilinear sample_type.") + .describe("Input filter. Only used by bilinear sample_type." 
+ "Since bilinear upsampling uses deconvolution, num_filters " + "is set to the number of channels.") .set_default(0); DMLC_DECLARE_FIELD(sample_type) .add_enum("nearest", up_enum::kNearest) diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index d09017bf713e..cb57b1b2d16f 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -60,7 +60,7 @@ static bool UpSamplingShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; CHECK_EQ(dshape.ndim(), 4U) << \ "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; int kernel = 2 * param_.scale - param_.scale % 2; SHAPE_ASSIGN_CHECK(*in_shape, up_enum::kWeight, @@ -121,7 +121,9 @@ struct UpSamplingGrad { DMLC_REGISTER_PARAMETER(UpSamplingParam); NNVM_REGISTER_OP(UpSampling) -.describe("Performs nearest neighbor/bilinear up sampling to inputs.") +.describe("Performs nearest neighbor/bilinear up sampling to inputs. " + "Bilinear upsampling makes use of deconvolution. Therefore, " + "provide 2 inputs - data and weight. ") .set_num_inputs([](const NodeAttrs& attrs) { const UpSamplingParam& params = nnvm::get(attrs.parsed); return params.sample_type == up_enum::kNearest ? params.num_args : 2; @@ -149,7 +151,8 @@ NNVM_REGISTER_OP(UpSampling) .set_attr("FCompute", UpSamplingCompute) .set_attr("FGradient", UpSamplingGrad{"_backward_UpSampling"}) .set_attr("key_var_num_args", "num_args") -.add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample") +.add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample. " + "For bilinear upsampling, there should be 2 inputs - 1 data and 1 weight.") .add_arguments(UpSamplingParam::__FIELDS__()) .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index f629534dabd0..59f572211d0e 100644 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -103,9 +103,10 @@ struct InferStorageTypeError : public dmlc::Error { : dmlc::Error(msg_), msg(msg_), index(index) {} }; -/*! \brief check if shape is empty or contains unknown (0) dim. */ +/*! \brief check if shape is empty or contains unknown (0) dim. + * DEPRECATED. */ inline bool shape_is_none(const mxnet::TShape& x) { - return x.ndim() == 0 || x.Size() == 0; + return !mxnet::shape_is_known(x); } /*! \brief check if type is none (-1) */ @@ -120,7 +121,7 @@ inline bool storage_type_is_none(const int& x) { /*! \brief check if shape is scalar({1}). */ inline bool shape_is_scalar(const mxnet::TShape& x) { - return x.ndim() == 1 && x.Size() == 1; + return x.ndim() == 0; } /*! \brief get string representation of shape */ @@ -159,16 +160,16 @@ inline std::string type_string(const int& x) { * \return whether x and y are compatible. 
*/ inline bool shape_assign(mxnet::TShape *y, const mxnet::TShape& x) { - if (y->ndim() == 0) { + if (!mxnet::ndim_is_known(*y)) { *y = x; return true; } else if (y->ndim() != x.ndim()) { - return x.ndim() == 0; + return !mxnet::ndim_is_known(x); } else { - for (size_t i = 0; i < y->ndim(); ++i) { - if ((*y)[i] == 0) { + for (int i = 0; i < y->ndim(); ++i) { + if (!mxnet::dim_size_is_known(*y, i)) { (*y)[i] = x[i]; - } else if ((*y)[i] != x[i] && x[i] != 0) { + } else if ((*y)[i] != x[i] && x[i] >= 0) { return false; } } @@ -563,7 +564,7 @@ class OpSignature { } void AddSign(const mxnet::TShape &shape) { - for (size_t i = 0; i < shape.ndim(); i++) { + for (int i = 0; i < shape.ndim(); i++) { hash = hash * 2 + shape[i]; eles.push_back(shape[i]); } diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc index b87428ca2b64..bc097a5b0c1c 100644 --- a/src/operator/operator_util.cc +++ b/src/operator/operator_util.cc @@ -774,7 +774,7 @@ class SimpleUnaryOpProp : public SimpleOpPropBase { using namespace mshadow; CHECK_EQ(in_shape->size(), 1) << "Input:[data]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; out_shape->clear(); if (source->unary_shape_ == nullptr) { out_shape->push_back(dshape); diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 49eb96b9f8b2..bd923aebbb80 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -83,8 +83,8 @@ struct SGDParam : public dmlc::Parameter { }; struct MultiSGDParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; + mxnet::Tuple lrs; + mxnet::Tuple wds; float rescale_grad; float clip_gradient; int num_weights; @@ -110,8 +110,8 @@ struct MultiSGDParam : public dmlc::Parameter { }; struct MultiSGDMomParam : public dmlc::Parameter { - nnvm::Tuple lrs; - nnvm::Tuple wds; + mxnet::Tuple lrs; + mxnet::Tuple wds; float momentum; float rescale_grad; float clip_gradient; diff --git a/src/operator/pad-inl.h b/src/operator/pad-inl.h index 140d7099e817..89b0ab7780b6 100644 --- a/src/operator/pad-inl.h +++ b/src/operator/pad-inl.h @@ -230,7 +230,7 @@ class PadProp : public OperatorProperty { } } mxnet::TShape oshape = dshape; - for (size_t i = 0; i < dshape.ndim(); ++i) { + for (int i = 0; i < dshape.ndim(); ++i) { oshape[i] = param_.pad_width[2 * i] + param_.pad_width[2 * i + 1] + dshape[i]; } diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h index 4e0ccc1caeb9..4241b08a0c5e 100644 --- a/src/operator/pooling_v1-inl.h +++ b/src/operator/pooling_v1-inl.h @@ -55,7 +55,7 @@ struct PoolingV1Param : public dmlc::Parameter { int pooling_convention; bool global_pool; DMLC_DECLARE_PARAMETER(PoolingV1Param) { - DMLC_DECLARE_FIELD(kernel).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(kernel).set_default(mxnet::TShape(0, -1)) .enforce_nonzero() .describe("pooling kernel size: (y, x) or (d, y, x)"); @@ -73,11 +73,11 @@ struct PoolingV1Param : public dmlc::Parameter { .add_enum("valid", pool_v1_enum::kValid) .describe("Pooling convention to be applied."); - DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, -1)) .enforce_nonzero() .describe("stride: for pooling (y, x) or (d, y, x)"); - DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, -1)) .describe("pad for pooling: (y, x) or (d, y, x)"); } }; @@ -217,19 +217,20 @@ class PoolingV1Prop : public 
OperatorProperty {
   void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
     using namespace mshadow;
     param_.Init(kwargs);
-    if (!param_.global_pool) {
-      if (param_.kernel.ndim() == 2) {
-        if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-        if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-      } else {
-        CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported";
-        if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-        if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-      }
-      CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim())
-        << "stride and kernel should have the same length";
-      CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim())
-        << "pad and kernel should have the same length";
+    if (param_.kernel.ndim() == 1) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
+    } else if (param_.kernel.ndim() == 2) {
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    } else {
+      // only enforce a 3D kernel size when global pooling is off
+      if (!param_.global_pool) {
+        CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim()
+                                           << "D pooling not supported";
+      }
+      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
     }
   }
 
@@ -247,7 +248,7 @@ class PoolingV1Prop : public OperatorProperty {
     CHECK_LE(dshape.ndim(), 5U) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
                                 << "Or 5D in (batch, channel, d, y, x)";
     mxnet::TShape oshape = dshape;
-    if (dshape.ndim() == 0) return false;
+    if (!mxnet::ndim_is_known(dshape)) return false;
     if (param_.global_pool) {
       if (dshape.ndim() == 4) {
         oshape[2] = 1;
diff --git a/src/operator/quantization/dequantize-inl.h b/src/operator/quantization/dequantize-inl.h
index dcda5a8b4bef..92b74b787141 100644
--- a/src/operator/quantization/dequantize-inl.h
+++ b/src/operator/quantization/dequantize-inl.h
@@ -68,30 +68,6 @@ struct dequantize_zero_centered {
   }
 };
 
-template<typename xpu>
-void DequantizeCompute(const nnvm::NodeAttrs& attrs,
-                       const OpContext& ctx,
-                       const std::vector<TBlob>& inputs,
-                       const std::vector<OpReqType>& req,
-                       const std::vector<TBlob>& outputs) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  using mshadow::red::limits::MinValue;
-  using mshadow::red::limits::MaxValue;
-  Stream<xpu> *s = ctx.get_stream<xpu>();
-  if (inputs[0].type_flag_ == mshadow::kUint8) {
-    Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-        inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-        MinValue<uint8_t>(), MaxValue<uint8_t>());
-  } else if (inputs[0].type_flag_ == mshadow::kInt8) {
-    Kernel<dequantize_zero_centered, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
-        inputs[0].dptr<int8_t>(), inputs[1].dptr<float>(), inputs[2].dptr<float>(),
-        MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
-  } else {
-    LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
-  }
-}
-
 inline bool DequantizeShape(const nnvm::NodeAttrs& attrs,
                             mxnet::ShapeVector *in_attrs,
                             mxnet::ShapeVector *out_attrs) {
@@ -99,11 +75,11 @@ inline bool DequantizeShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(out_attrs->size(), 1U);
 
   for (size_t i = 1; i < 3; ++i) {
-    SHAPE_ASSIGN_CHECK(*in_attrs, i, mxnet::TShape({1}));
+    SHAPE_ASSIGN_CHECK(*in_attrs, i, mxnet::TShape(1, 1));
   }
 
   SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-  return !shape_is_none(out_attrs->at(0));
+  return shape_is_known(out_attrs->at(0));
 }
 
 inline bool DequantizeType(const nnvm::NodeAttrs& attrs,
@@ -119,6 +95,44 @@
   return (*in_attrs)[0] != -1;
 }
 
+template <typename xpu>
+class DequantizeOperator {
+ public:
+  explicit DequantizeOperator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {}
+  void Forward(const OpContext &ctx, const std::vector<TBlob> &inputs,
+               const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) {
+    using namespace mshadow;
+    using namespace mxnet_op;
+    using mshadow::red::limits::MaxValue;
+    using mshadow::red::limits::MinValue;
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (inputs[0].type_flag_ == mshadow::kUint8) {
+      Kernel<dequantize_unsigned, xpu>::Launch(s, outputs[0].Size(), outputs[0].dptr<float>(),
+                                               inputs[0].dptr<uint8_t>(), inputs[1].dptr<float>(),
+                                               inputs[2].dptr<float>(), MinValue<uint8_t>(),
+                                               MaxValue<uint8_t>());
+    } else if (inputs[0].type_flag_ == mshadow::kInt8) {
+      Kernel<dequantize_zero_centered, xpu>::Launch(
+          s, outputs[0].Size(), outputs[0].dptr<float>(), inputs[0].dptr<int8_t>(),
+          inputs[1].dptr<float>(), inputs[2].dptr<float>(),
+          MinAbs(MaxValue<int8_t>(), MinValue<int8_t>()));
+    } else {
+      LOG(FATAL) << "dequantize op only supports input type int8 or uint8";
+    }
+  }
+
+ private:
+  nnvm::NodeAttrs attrs_;
+};
+
+template <typename xpu>
+static void DequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx,
+                              const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req,
+                              const std::vector<TBlob> &outputs) {
+  auto &op = state_ptr.get_state<DequantizeOperator<xpu>>();
+  op.Forward(ctx, inputs, req, outputs);
+}
+
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_QUANTIZATION_DEQUANTIZE_INL_H_
diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc
index 7c84673095f0..e8e2cd90b86c 100644
--- a/src/operator/quantization/dequantize.cc
+++ b/src/operator/quantization/dequantize.cc
@@ -48,6 +48,22 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+static OpStatePtr CreateDequantizeState(const nnvm::NodeAttrs &attrs, Context ctx,
+                                        const std::vector<TShape> &in_shapes,
+                                        const std::vector<int> &in_types) {
+  OpStatePtr state;
+  if (ctx.dev_type == kGPU) {
+    state = OpStatePtr::Create<DequantizeOperator<gpu>>(attrs);
+  } else {
+#if MXNET_USE_MKLDNN == 1
+    state = OpStatePtr::Create<SgMKLDNNDequantizeOperator>(attrs);
+#else
+    state = OpStatePtr::Create<DequantizeOperator<cpu>>(attrs);
+#endif
+  }
+  return state;
+}
+
 NNVM_REGISTER_OP(_contrib_dequantize)
 .describe(R"code(Dequantize the input tensor into a float tensor.
 min_range and max_range are scalar floats that specify the range for
@@ -68,17 +84,22 @@ by keep zero centered for the quantized value:
 .set_attr_parser(ParamParser<DequantizeParam>)
 .set_num_inputs(3)
 .set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data", "min_range", "max_range"};
+  })
 .set_attr<mxnet::FInferShape>("FInferShape", DequantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", DequantizeType)
 .set_attr<FInferStorageType>("FInferStorageType", DequantizeStorageType)
 // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow,
 // will be reverted after the improvement of CachedOP is done.
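// (Pattern note: FCreateOpState, registered below, builds the state object
// once per operator node; the stateful compute functions then just unpack
// and reuse it on every call, as DequantizeForward above does:
//   auto &op = state_ptr.get_state<DequantizeOperator<xpu>>();
//   op.Forward(ctx, inputs, req, outputs);
// so per-node setup cost is paid only once.)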
.set_attr("FGradient", MakeZeroGradNodes) +.set_attr("FCreateOpState", CreateDequantizeState) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FComputeEx", MKLDNNDequantizeCompute) +.set_attr("FStatefulComputeEx", SgMKLDNNDequantizeForward) #endif -.set_attr("FCompute", DequantizeCompute) +.set_attr("FStatefulCompute", DequantizeForward) .add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `uint8`") .add_argument("min_range", "NDArray-or-Symbol", "The minimum scalar value " "possibly produced for the input in float32") diff --git a/src/operator/quantization/dequantize.cu b/src/operator/quantization/dequantize.cu index ca5f91c5def9..dee8b2207e01 100644 --- a/src/operator/quantization/dequantize.cu +++ b/src/operator/quantization/dequantize.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_dequantize) -.set_attr("FCompute", DequantizeCompute); +.set_attr("FStatefulCompute", DequantizeForward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h index b66adf787fef..27fa070afbe0 100644 --- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h @@ -26,82 +26,104 @@ #ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ #define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_DEQUANTIZE_INL_H_ #if MXNET_USE_MKLDNN == 1 -#include #include +#include #include #include "../../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -template -static void MKLDNNDequantizeComputeKer(const std::vector &inputs, - const std::vector &outputs, - const std::vector &req) { - using namespace mshadow; - using namespace mxnet_op; - using red::limits::MaxValue; - using red::limits::MinValue; - float real_range = 0.0; - float quantized_range = 0.0; - if (inputs[0].dtype() == mshadow::kUint8) { - quantized_range = MaxAbs(MaxValue(), MinValue()); - real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); - } else if (inputs[0].dtype() == mshadow::kInt8) { - quantized_range = MinAbs(MaxValue(), MinValue()); - real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); - } else { - LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type"; - } - float scale = real_range / quantized_range; - primitive_attr attr; - const int mask = 0; - std::vector scales = {scale}; - attr.set_output_scales(mask, scales); - attr.set_int_output_round_mode(round_nearest); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); - NDArray in_buffer = inputs[0]; - if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) - in_buffer = inputs[0].Reorder2Default(); +class SgMKLDNNDequantizeOperator { + public: + explicit SgMKLDNNDequantizeOperator(const nnvm::NodeAttrs &attrs) + : param_(nnvm::get(attrs.parsed)) {} + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs); + + private: + bool initialized_{false}; + DequantizeParam param_; + float cached_data_min_{0.f}; + float cached_data_max_{0.f}; + std::shared_ptr i_mem_; + std::shared_ptr o_mem_; + std::shared_ptr fwd_pd_; +}; + +void SgMKLDNNDequantizeOperator::Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + NDArray in_buffer = inputs[0]; + if (inputs[0].IsView() && inputs[0].IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); auto i_mem = 
in_buffer.GetMKLDNNData(); - auto i_mpd = i_mem->get_primitive_desc(); - auto i_desc = i_mpd.desc(); - size_t i_ndim = in_buffer.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); - for (size_t i = 0; i < i_ndim; i++) { - i_dims[i] = static_cast(in_buffer.shape()[i]); - } - mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); - if (i_fmt == mkldnn::memory::format::nhwc) { - // For 4d tensor, nchw is the default format - i_fmt = mkldnn::memory::format::nchw; + float data_min = *inputs[1].data().dptr(); + float data_max = *inputs[2].data().dptr(); + + if (initialized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max)) + initialized_ = false; + + if (!initialized_) { + cached_data_min_ = data_min; + cached_data_max_ = data_max; + float real_range = MaxAbs(cached_data_min_, cached_data_max_); + float quantized_range = 0.0; + if (inputs[0].dtype() == mshadow::kUint8) { + quantized_range = kUint8Range; + } else if (inputs[0].dtype() == mshadow::kInt8) { + quantized_range = kInt8Range; + real_range = MaxAbs(*inputs[1].data().dptr(), *inputs[2].data().dptr()); + } else { + LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as output type"; + } + float scale = real_range / quantized_range; + primitive_attr attr; + const int mask = 0; + std::vector scales = {scale}; + attr.set_output_scales(mask, scales); + attr.set_int_output_round_mode(round_nearest); + mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_mpd = i_mem->get_primitive_desc(); + auto i_desc = i_mpd.desc(); + size_t i_ndim = in_buffer.shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_buffer.shape()[i]); + } + mkldnn::memory::format o_fmt = static_cast(i_desc.data.format); + if (o_fmt == mkldnn::memory::format::nhwc) { + // For 4d tensor, nchw is the default format + o_fmt = mkldnn::memory::format::nchw; + } + auto o_desc = + mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum::type, o_fmt); + auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); + auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); + i_mem_ = std::make_shared(i_mpd, nullptr); + o_mem_ = std::make_shared(o_mpd, nullptr); + fwd_pd_ = std::make_shared(reorder_pd, *i_mem_, *o_mem_); + initialized_ = true; } - auto o_desc = mkldnn::memory::desc(i_dims, - (mkldnn::memory::data_type)data_type_enum::type, - i_fmt); - auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); - auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second)); + auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]); + i_mem_->set_data_handle(i_mem->get_data_handle()); + o_mem_->set_data_handle(o_mem.second->get_data_handle()); + MKLDNNStream::Get()->RegisterPrim(*fwd_pd_); CommitOutput(outputs[0], o_mem); MKLDNNStream::Get()->Submit(); } -static void MKLDNNDequantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (inputs[0].dtype() == mshadow::kUint8) { - MKLDNNDequantizeComputeKer(inputs, outputs, req); - } else if (inputs[0].dtype() == mshadow::kInt8) { - MKLDNNDequantizeComputeKer(inputs, outputs, req); - } else { - LOG(FATAL) << "mkldnn dequantize op only supports int8 and uint8 as input type"; - } +static 
void SgMKLDNNDequantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + SgMKLDNNDequantizeOperator &op = state_ptr.get_state(); + op.Forward(ctx, inputs, req, outputs); } + + } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h index d6060e54a82c..bd1b47e4c2de 100644 --- a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h +++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h @@ -34,99 +34,37 @@ namespace mxnet { namespace op { -template -static void MKLDNNQuantizeComputeKer(const std::vector& inputs, - const std::vector& outputs, - const QuantizeV2Param& param, - const std::vector& req) { - using namespace mshadow; - using namespace mxnet_op; - using red::limits::MaxValue; - using red::limits::MinValue; - SrcType real_range = 0.f; - DstType quantized_range = 0; - NDArray in_buffer = inputs[0]; - SrcType data_min = red::limits::MaxValue(); - SrcType data_max = red::limits::MinValue(); - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - data_min = param.min_calib_range.value(); - data_max = param.max_calib_range.value(); - } else { - // no calib info - in_buffer = inputs[0].Reorder2Default(); - auto in_ptr = in_buffer.data().dptr(); - auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - std::vector data_maxs(nthreads, data_max); - std::vector data_mins(nthreads, data_min); -#pragma omp parallel for num_threads(nthreads) - for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { - int tid = omp_get_thread_num(); - if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; - if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; - } - for (index_t i = 0; i < nthreads; i++) { - if (data_maxs[i] > data_max) data_max = data_maxs[i]; - if (data_mins[i] < data_min) data_min = data_mins[i]; - } - } +class SgMKLDNNQuantizeOperator { + public: + explicit SgMKLDNNQuantizeOperator(const nnvm::NodeAttrs &attrs) + : param_(nnvm::get(attrs.parsed)) {} - auto out_type = GetOutputType(param); - if (out_type == mshadow::kUint8) { - real_range = std::max(0.f, data_max); - quantized_range = MaxValue(); - *outputs[1].data().dptr() = 0.f; - *outputs[2].data().dptr() = real_range; - } else if (out_type == mshadow::kInt8) { - real_range = MaxAbs(data_min, data_max); - quantized_range = MinAbs(MaxValue(), MinValue()); - *outputs[1].data().dptr() = -real_range; - *outputs[2].data().dptr() = real_range; - } else { - LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; - } - float scale = static_cast(quantized_range) / real_range; + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs); - primitive_attr attr; - const int mask = 0; - std::vector scales = {scale}; - attr.set_output_scales(mask, scales); - attr.set_int_output_round_mode(round_nearest); - mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + private: + bool initalized_{false}; + QuantizeV2Param param_; + float cached_data_min_{0.f}; + float cached_data_max_{0.f}; + std::shared_ptr i_mem_; + std::shared_ptr o_mem_; + std::shared_ptr fwd_pd_; +}; - if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); - auto i_mem = in_buffer.GetMKLDNNData(); - auto i_mpd = i_mem->get_primitive_desc(); - auto i_desc = i_mpd.desc(); - 
mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); - if (i_fmt == mkldnn::memory::format::nchw || - i_fmt == mkldnn::memory::format::nChw8c || - i_fmt == mkldnn_nChw16c) { - i_fmt = mkldnn::memory::format::nhwc; - } - size_t i_ndim = in_buffer.shape().ndim(); - mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); - for (size_t i = 0; i < i_ndim; i++) { - i_dims[i] = static_cast(in_buffer.shape()[i]); - } - auto o_desc = - mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum::type, i_fmt); - auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); - auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); - auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second)); - CommitOutput(outputs[0], o_mem); - MKLDNNStream::Get()->Submit(); -} +void SgMKLDNNQuantizeOperator::Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + float quantized_range = 0.0; + NDArray in_buffer = inputs[0]; + float data_min = mshadow::red::limits::MaxValue(); + float data_max = mshadow::red::limits::MinValue(); -static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const QuantizeV2Param& param = nnvm::get(attrs.parsed); + // Pass through quantized data if (inputs[0].dtype() == mshadow::kUint8 || inputs[0].dtype() == mshadow::kInt8) { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - *outputs[1].data().dptr() = param.min_calib_range.value(); - *outputs[2].data().dptr() = param.max_calib_range.value(); + if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) { + *outputs[1].data().dptr() = param_.min_calib_range.value(); + *outputs[2].data().dptr() = param_.max_calib_range.value(); } else { if (inputs[0].dtype() == mshadow::kUint8) { *outputs[1].data().dptr() = 0; @@ -137,21 +75,101 @@ static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContex } } if (req[0] != kWriteInplace) { - const_cast(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData()); + const_cast(outputs[0]).CopyFrom(*inputs[0].GetMKLDNNData()); MKLDNNStream::Get()->Submit(); } } else { - auto out_type = GetOutputType(param); + if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); + auto i_mem = in_buffer.GetMKLDNNData(); + + if (param_.min_calib_range.has_value() && param_.max_calib_range.has_value()) { + data_min = param_.min_calib_range.value(); + data_max = param_.max_calib_range.value(); + } else { + // no calib info + in_buffer = inputs[0].Reorder2Default(); + auto in_ptr = in_buffer.data().dptr(); + auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + std::vector data_maxs(nthreads, data_max); + std::vector data_mins(nthreads, data_min); +#pragma omp parallel for num_threads(nthreads) + for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { + int tid = omp_get_thread_num(); + if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; + if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; + } + for (index_t i = 0; i < nthreads; i++) { + if (data_maxs[i] > data_max) data_max = data_maxs[i]; + if (data_mins[i] < data_min) data_min = data_mins[i]; + } + + if (initalized_ && (cached_data_min_ != data_min || cached_data_max_ != data_max)) + initalized_ = false; + } + + // Write output min/max + 
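+    // (uint8 output keeps the observed asymmetric [data_min, data_max]
+    // range, while int8 symmetrizes it to +/-MaxAbs(data_min, data_max)
+    // so that zero stays exactly representable.)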
auto out_type = GetQuantizeOutputType(param_); if (out_type == mshadow::kUint8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + quantized_range = kUint8Range; + *outputs[1].data().dptr() = data_min; + *outputs[2].data().dptr() = data_max; } else if (out_type == mshadow::kInt8) { - MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + float real_range = MaxAbs(data_min, data_max); + quantized_range = kInt8Range; + *outputs[1].data().dptr() = -real_range; + *outputs[2].data().dptr() = real_range; } else { LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; } + + if (!initalized_) { + cached_data_min_ = data_min; + cached_data_max_ = data_max; + float real_range = MaxAbs(data_min, data_max); + float scale = quantized_range / real_range; + primitive_attr attr; + const int mask = 0; + std::vector scales = {scale}; + attr.set_output_scales(mask, scales); + attr.set_int_output_round_mode(round_nearest); + mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + auto i_mpd = i_mem->get_primitive_desc(); + auto i_desc = i_mpd.desc(); + mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); + if (i_fmt == mkldnn::memory::format::nchw || i_fmt == mkldnn::memory::format::nChw8c || + i_fmt == mkldnn_nChw16c) { + i_fmt = mkldnn::memory::format::nhwc; + } + size_t i_ndim = in_buffer.shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_buffer.shape()[i]); + } + auto o_desc = mkldnn::memory::desc(i_dims, get_mkldnn_type(out_type), i_fmt); + auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); + auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); + i_mem_ = std::make_shared(i_mpd, nullptr); + o_mem_ = std::make_shared(o_mpd, nullptr); + fwd_pd_ = std::make_shared(reorder_pd, *i_mem_, *o_mem_); + initalized_ = true; + } + auto o_mem = CreateMKLDNNMem(outputs[0], o_mem_->get_primitive_desc(), req[0]); + i_mem_->set_data_handle(i_mem->get_data_handle()); + o_mem_->set_data_handle(o_mem.second->get_data_handle()); + MKLDNNStream::Get()->RegisterPrim(*fwd_pd_); + CommitOutput(outputs[0], o_mem); + MKLDNNStream::Get()->Submit(); } } +static void SgMKLDNNQuantizeForward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + SgMKLDNNQuantizeOperator &op = state_ptr.get_state(); + op.Forward(ctx, inputs, req, outputs); +} + } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc new file mode 100644 index 000000000000..bc69cb5e9bf7 --- /dev/null +++ b/src/operator/quantization/mkldnn/mkldnn_quantized_act.cc @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2019 by Contributors
+ * \file mkldnn_quantized_act.cc
+ * \brief MKLDNN(Quantized) Activation operator based on subgraph
+ * \author Zhiyuan Huang
+*/
+#if MXNET_USE_MKLDNN == 1
+
+#include "../../nn/mkldnn/mkldnn_act-inl.h"
+#include "../quantization_utils.h"
+
+namespace mxnet {
+namespace op {
+
+static void MKLDNNQuantizedActForward(const nnvm::NodeAttrs& attrs,
+                                      const OpContext& ctx,
+                                      const std::vector<NDArray>& in_data,
+                                      const std::vector<OpReqType>& req,
+                                      const std::vector<NDArray>& out_data) {
+  CHECK(in_data[0].dtype() == mshadow::kUint8 ||
+        in_data[0].dtype() == mshadow::kInt8)
+      << "_contrib_quantized_act op only supports uint8 and int8 as input "
+         "type";
+
+  MKLDNNActivationForward(attrs, ctx, in_data[0], req[0], out_data[0]);
+  out_data[1].data().dptr<float>()[0] = in_data[1].data().dptr<float>()[0];
+  out_data[2].data().dptr<float>()[0] = in_data[2].data().dptr<float>()[0];
+}
+
+NNVM_REGISTER_OP(_contrib_quantized_act)
+.set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FComputeEx>("FComputeEx", MKLDNNQuantizedActForward);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
index b8c47c3af11b..55028d8c8ccc 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_conv.cc
@@ -72,7 +72,7 @@ static void MKLDNNQuantizedConvForward(const nnvm::NodeAttrs& attrs,
   MKLDNNStream::Get()->Submit();
   Stream<cpu> *s = ctx.get_stream<cpu>();
   const size_t num_inputs = param.no_bias ? 2 : 3;
-  mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+  mxnet_op::Kernel<QuantizationRangeForS8U8MultiplicationStruct, cpu>::Launch(s, 1,
           out_data[1].data().dptr<float>(), out_data[2].data().dptr<float>(),
           in_data[num_inputs].data().dptr<float>(),
           in_data[num_inputs+1].data().dptr<float>(),
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
index 71daf2ec2c16..e8abab22446e 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -74,13 +74,13 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
     int32_t *quantized_bias_ptr = quantized_bias.data().dptr<int32_t>();
     size_t bias_size = bias.shape().Size();
 #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-    for (size_t i = 0; i < bias_size; ++i) {
+    for (index_t i = 0; i < static_cast<index_t>(bias_size); ++i) {
       quantized_bias_ptr[i] = bias_ptr[i] * bias_int32_rescale;
     }
   }
 
   Stream<cpu> *s = ctx.get_stream<cpu>();
-  mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
+  mxnet_op::Kernel<QuantizationRangeForS8U8MultiplicationStruct, cpu>::Launch(s, 1,
           min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight,
           &max_weight);
   bool is_train = false;
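The requantize kernel in the next file maps int32 accumulators back to a narrow integer type with a single MKLDNN reorder whose scale is composed of two factors. A worked example with made-up numbers, assuming int8 output and a calibrated real range of 2.0:

  first_scale  = first_real_range / first_quantized_range   = 2.0 / 2147483647
  second_scale = second_quantized_range / second_real_range = 127 / 2.0
  scale        = first_scale * second_scale                 ~= 5.91e-8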
diff --git a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
index 45713589dd48..03d9b9067b57 100644
--- a/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_requantize-inl.h
@@ -34,6 +34,7 @@
 namespace mxnet {
 namespace op {
 
+template <typename DstType>
 static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
                                        const OpContext& ctx,
                                        const std::vector<NDArray>& inputs,
@@ -45,7 +46,6 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
   using red::limits::MaxValue;
   using red::limits::MinValue;
   typedef int32_t SrcDType;
-  typedef int8_t DstDType;
   // check shapes
   size_t i_dim = inputs[0].shape().ndim();
   size_t o_dim = outputs[0].shape().ndim();
@@ -56,12 +56,21 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
                                     *inputs[2].data().dptr<float>());
   float first_scale = first_real_range / first_quantized_range;
   float second_real_range = real_range;
-  float second_quantized_range = MinAbs(MaxValue<DstDType>(),
-                                        MinValue<DstDType>());
+  float second_quantized_range = 0.f;
+  if (std::is_same<DstType, int8_t>::value) {
+    second_quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
+    *outputs[1].data().dptr<float>() = -second_real_range;
+    *outputs[2].data().dptr<float>() = second_real_range;
+  } else if (std::is_same<DstType, uint8_t>::value) {
+    second_quantized_range = MaxValue<DstType>();
+    *outputs[1].data().dptr<float>() = 0.f;
+    *outputs[2].data().dptr<float>() = second_real_range;
+  } else {
+    LOG(FATAL) << "Unsupported requantize output type";
+  }
   float second_scale = second_quantized_range / second_real_range;
   float scale = first_scale * second_scale;
-  *outputs[1].data().dptr<float>() = -second_real_range;
-  *outputs[2].data().dptr<float>() = second_real_range;
+
   primitive_attr attr;
   const int mask = 0;
   std::vector<float> scales = {scale};
@@ -82,7 +91,7 @@ static void MKLDNNRequantizeForwardKer(const nnvm::NodeAttrs& attrs,
     i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
   }
   auto o_desc = mkldnn::memory::desc(i_dims,
-                                     (mkldnn::memory::data_type)data_type_enum<DstDType>::type,
+                                     (mkldnn::memory::data_type)data_type_enum<DstType>::type,
                                      i_fmt);
   auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
   auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr);
@@ -99,55 +108,47 @@ static void MKLDNNRequantizeForward(const nnvm::NodeAttrs& attrs,
                                     const std::vector<NDArray>& outputs) {
   using namespace mshadow;
   using namespace mxnet_op;
+  using red::limits::MaxValue;
+  using red::limits::MinValue;
   typedef int32_t SrcDType;
   typedef int8_t DstDType;
-  Stream<cpu> *s = ctx.get_stream<cpu>();
   const RequantizeParam& param = nnvm::get<RequantizeParam>(attrs.parsed);
   float real_range;
   // Model is calibrated
   if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
     real_range = MaxAbs(param.min_calib_range.value(), param.max_calib_range.value());
-    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
   // Model is not calibrated
   } else {
-    mxnet::TShape src_shape, dst_shape;
-    const size_t actual_float_size = sizeof(float);
-    const size_t actual_quantized_size = sizeof(SrcDType);
-    const size_t temp_reduce_size = ConfigReduce<cpu, SrcDType>(s,
-        inputs[0].shape(), mxnet::TShape({1}), &src_shape, &dst_shape);
-    Tensor<cpu, 1, char> temp_space =
-        ctx.requested[0].get_space_typed<cpu, 1, char>(
-            Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s);
-    Tensor<cpu, 1, float> actual_min_float(
-        reinterpret_cast<float*>(temp_space.dptr_), Shape1(1), s);
-    Tensor<cpu, 1, float> actual_max_float(
-        reinterpret_cast<float*>(temp_space.dptr_) + 1, Shape1(1), s);
-    const int dev_id = ctx.run_ctx.ctx.dev_id;
-    TBlob actual_min_quantized(reinterpret_cast<SrcDType*>(
-        temp_space.dptr_ + 8), Shape1(1), cpu::kDevMask, dev_id);
-    TBlob actual_max_quantized(reinterpret_cast<SrcDType*>(
-        temp_space.dptr_ + 8) + 1, Shape1(1), cpu::kDevMask, dev_id);
-    Tensor<cpu, 1, char> workspace(
-        temp_space.dptr_+2*actual_float_size+2*actual_quantized_size,
-        Shape1(temp_reduce_size), s);
-    broadcast::Reduce<red::minimum, 2, SrcDType, mshadow::op::identity>(
-        s, actual_min_quantized.reshape(dst_shape), kWriteTo,
-        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
-    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
-        actual_min_float.dptr_, actual_min_quantized.dptr<SrcDType>(),
-        inputs[1].Reorder2Default().data().dptr<float>(),
-        inputs[2].Reorder2Default().data().dptr<float>());
-    broadcast::Reduce<red::maximum, 2, SrcDType, mshadow::op::identity>(
-        s, actual_max_quantized.reshape(dst_shape), kWriteTo,
-        workspace, inputs[0].Reorder2Default().data().reshape(src_shape));
-    Kernel<QuantizedToFloatStruct, cpu>::Launch(s, 1,
-        actual_max_float.dptr_, actual_max_quantized.dptr<SrcDType>(),
-        inputs[1].Reorder2Default().data().dptr<float>(),
-        inputs[2].Reorder2Default().data().dptr<float>());
-
-    real_range = MaxAbs(*actual_min_float.dptr_, *actual_max_float.dptr_);
-    MKLDNNRequantizeForwardKer(attrs, ctx, inputs, req, outputs, real_range);
+    NDArray in_buffer = inputs[0].Reorder2Default();
+    auto in_ptr = in_buffer.data().dptr<SrcDType>();
+    auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    SrcDType data_min = MaxValue<SrcDType>();
+    SrcDType data_max = MinValue<SrcDType>();
+    std::vector<SrcDType> data_maxs(nthreads, data_max);
+    std::vector<SrcDType> data_mins(nthreads, data_min);
+#pragma omp parallel for num_threads(nthreads)
+    for (index_t i = 0; i < static_cast<index_t>(in_buffer.shape().Size()); i++) {
+      int tid = omp_get_thread_num();
+      if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i];
+      if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i];
+    }
+    for (index_t i = 0; i < nthreads; i++) {
+      if (data_maxs[i] > data_max) data_max = data_maxs[i];
+      if (data_mins[i] < data_min) data_min = data_mins[i];
+    }
+    float src_range = MinAbs(MinValue<SrcDType>(), MaxValue<SrcDType>());
+    SrcDType data_range = MaxAbs(data_min, data_max);
+    float data_scale = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
+    real_range = data_range * data_scale / src_range;
+  }
+  auto out_type = GetQuantizeOutputType(param);
+  if (out_type == mshadow::kUint8) {
+    MKLDNNRequantizeForwardKer<uint8_t>(attrs, ctx, inputs, req, outputs, real_range);
+  } else if (out_type == mshadow::kInt8) {
+    MKLDNNRequantizeForwardKer<int8_t>(attrs, ctx, inputs, req, outputs, real_range);
+  } else {
+    LOG(FATAL) << "mkldnn requantize op only supports int8 and uint8 as output type";
   }
 }
diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h
index c540ea441431..86018e65713f 100644
--- a/src/operator/quantization/quantization_utils.h
+++ b/src/operator/quantization/quantization_utils.h
@@ -127,39 +127,31 @@ MSHADOW_XINLINE void RequantizeManyInNewRange(size_t count, T2* output, const T1
  * \brief Get the scaling factor for converting type T to float.
  */
 template<typename T>
-MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max) {
+MSHADOW_XINLINE float FloatForOneQuantizedLevel(float range_min, float range_max, bool all_sign) {
   using mshadow::red::limits::MinValue;
   using mshadow::red::limits::MaxValue;
-  const int64_t highest = static_cast<int64_t>(MaxValue<T>());
-  const int64_t lowest = static_cast<int64_t>(MinValue<T>());
-  const float float_for_one_quantized_level =
-    (range_max - range_min) / (highest - lowest);
-  return float_for_one_quantized_level;
+  float range_data = MaxAbs(range_min, range_max);
+  float range_T = all_sign ? MinAbs(MinValue<T>(), MaxValue<T>()) : MaxValue<T>();
+  return range_data / range_T;
 }
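+// Worked example of the formulas above, assuming int8 inputs (all_sign is
+// true) and a float range of [-127, 127]: range_data = 127 and
+// range_T = MinAbs(-128, 127) = 127, so one quantized level spans
+// 127 / 127 = 1.0f. For an int8*int8 product accumulated into int32,
+// QuantizationRangeForMultiplication below then gives
+//   max_c = 1.0f * 1.0f * MinAbs(INT32_MIN, INT32_MAX) = 2147483647.0f
+// and min_c = -max_c.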
 
 template<typename Ta, typename Tb, typename Tc>
-MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a,
-                                                        float min_b, float max_b,
-                                                        float* min_c, float* max_c) {
-  using mshadow::red::limits::MinValue;
+MSHADOW_XINLINE void QuantizationRangeForMultiplication(float min_a, float max_a, float min_b,
+                                                        float max_b, float *min_c, float *max_c,
+                                                        bool all_sign) {
   using mshadow::red::limits::MaxValue;
-  const float a_float_for_one_quant_level =
-    FloatForOneQuantizedLevel<Ta>(min_a, max_a);
-  const float b_float_for_one_quant_level =
-    FloatForOneQuantizedLevel<Tb>(min_b, max_b);
-
-  const int64_t c_highest =
-    static_cast<int64_t>(MaxValue<Tc>());
-  const int64_t c_lowest =
-    static_cast<int64_t>(MinValue<Tc>());
+  using mshadow::red::limits::MinValue;
+  const float a_float_for_one_quant_level = FloatForOneQuantizedLevel<Ta>(min_a, max_a, all_sign);
+  const float b_float_for_one_quant_level = FloatForOneQuantizedLevel<Tb>(min_b, max_b, all_sign);
+  const float range_c =
+      MinAbs(static_cast<int64_t>(MinValue<Tc>()), static_cast<int64_t>(MaxValue<Tc>()));
   const float c_float_for_one_quant_level =
-    a_float_for_one_quant_level * b_float_for_one_quant_level;
-
-  *min_c = c_float_for_one_quant_level * c_lowest;
-  *max_c = c_float_for_one_quant_level * c_highest;
+      a_float_for_one_quant_level * b_float_for_one_quant_level;
+  *max_c = c_float_for_one_quant_level * range_c;
+  *min_c = -*max_c;
 }
 
-struct QuantizationRangeForMultiplicationStruct {
+struct QuantizationRangeForS8S8MultiplicationStruct {
   MSHADOW_XINLINE static void Map(int i,
                                   float *min_c,
                                   float *max_c,
@@ -168,7 +160,20 @@ struct QuantizationRangeForMultiplicationStruct {
                                   const float *min_b,
                                   const float *max_b) {
     QuantizationRangeForMultiplication<int8_t, int8_t, int32_t>(
-      min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c);
+      min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, true);
   }
 };
 
+struct QuantizationRangeForS8U8MultiplicationStruct {
+  MSHADOW_XINLINE static void Map(int i,
+                                  float *min_c,
+                                  float *max_c,
+                                  const float *min_a,
+                                  const float *max_a,
+                                  const float *min_b,
+                                  const float *max_b) {
+    QuantizationRangeForMultiplication<int8_t, uint8_t, int32_t>(
+      min_a[i], max_a[i], min_b[i], max_b[i], min_c, max_c, false);
+  }
+};
+
@@ -186,6 +191,29 @@ inline size_t ConfigReduce(mshadow::Stream<xpu>* s,
   return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape);
 }
 
+enum QuantizeOutType { kAuto = 0, kInt8, kUint8 };
+
+template <typename Param>
+static mshadow::TypeFlag GetQuantizeOutputType(const Param &param) {
+  auto out_type = mshadow::kInt8;
+  if (param.out_type == QuantizeOutType::kAuto) {
+    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+      if (param.min_calib_range.value() >= 0.0) {
+        out_type = mshadow::kUint8;
+      } else {
+        out_type = mshadow::kInt8;
+      }
+    }
+  } else if (param.out_type == QuantizeOutType::kInt8) {
+    out_type = mshadow::kInt8;
+  } else if (param.out_type == QuantizeOutType::kUint8) {
+    out_type = mshadow::kUint8;
+  } else {
+    LOG(FATAL) << "Unsupported out_type in params: " << param.out_type;
+  }
+  return out_type;
+}
diff --git a/src/operator/quantization/quantize-inl.h b/src/operator/quantization/quantize-inl.h
@@ ... @@ inline bool QuantizeShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 3U);
 
   for (size_t i = 1; i < 3; ++i) {
-    SHAPE_ASSIGN_CHECK(*in_attrs, i, mxnet::TShape({1}));
+    SHAPE_ASSIGN_CHECK(*in_attrs, i, mxnet::TShape(1, 1));
   }
 
   SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
   SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape{1});
   SHAPE_ASSIGN_CHECK(*out_attrs, 2, mxnet::TShape{1});
-  return !shape_is_none(out_attrs->at(0));
+  return shape_is_known(out_attrs->at(0));
 }
 
 inline bool QuantizeType(const nnvm::NodeAttrs& attrs,
diff --git a/src/operator/quantization/quantize_graph_pass.cc
b/src/operator/quantization/quantize_graph_pass.cc index 5bd9e8af9038..7591477b1081 100644 --- a/src/operator/quantization/quantize_graph_pass.cc +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -89,11 +89,12 @@ std::vector OfflineParams(std::vector&& outputs, return outputs; } -inline bool NeedQuantize(const NodePtr node, - const std::unordered_set& excluded_nodes) { +inline NodePtr NeedQuantize(NodePtr node, const std::unordered_set& excluded_nodes) { + std::unordered_map quantized_node; static auto& quantized_op_map = Op::GetAttr("FQuantizedOp"); static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); const auto& op = node->op(); + if (op && quantized_op_map.count(op)) { bool need = true; if (excluded_nodes.count(node->attrs.name)) { @@ -112,14 +113,24 @@ inline bool NeedQuantize(const NodePtr node, }); } } - return need; + + if (need) { + auto n_ptr = quantized_op_map[node->op()]; + auto tmp_node = n_ptr(node->attrs); + if (tmp_node->op()) { + quantized_node[node] = tmp_node; + } else { + quantized_node[node] = nullptr; + } + } else { + quantized_node[node] = nullptr; + } } - return false; + return quantized_node[node]; } Graph QuantizeGraph(Graph &&src) { static const auto& flist_outputs = nnvm::Op::GetAttr("FListOutputNames"); - static const auto& quantized_op_map = Op::GetAttr("FQuantizedOp"); static const auto& need_requantize_map = Op::GetAttr("FNeedRequantize"); static const auto& avoid_quantize_input_map = Op::GetAttr("FAvoidQuantizeInput"); @@ -136,11 +147,9 @@ Graph QuantizeGraph(Graph &&src) { NodePtr new_node = Node::Create(); // If the currently visited node needs quantization, insert a quantize op node before the // current node and replace the current node with the quantized version in the new graph. - if (NeedQuantize(node, excluded_nodes)) { - auto fquantized_op = quantized_op_map[node->op()]; - // If the currently visited node's op registered the FQuantizedOp property, new_node is a - // quantizated version of a that op, such as quantized_conv2d. 
- new_node = fquantized_op(node->attrs); + auto tmp_node = NeedQuantize(node, excluded_nodes); + if (tmp_node) { + new_node = tmp_node; // add data into quantized op input for (size_t i = 0; i < node->inputs.size(); ++i) { @@ -239,6 +248,7 @@ Graph QuantizeGraph(Graph &&src) { NodePtr requantize_node = Node::Create(); requantize_node->attrs.op = Op::Get("_contrib_requantize"); requantize_node->attrs.name = "requantize_" + node->attrs.name; + requantize_node->attrs.dict["out_type"] = quantized_dtype; if (requantize_node->op()->attr_parser != nullptr) { requantize_node->op()->attr_parser(&(requantize_node->attrs)); } @@ -389,7 +399,7 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) { node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); node->op()->attr_parser(&(node->attrs)); const QuantizeV2Param& param = nnvm::get(node->attrs.parsed); - if (param.out_type == QuantizeV2Param::OutType::kUint8 && + if (param.out_type == QuantizeOutType::kUint8 && param.min_calib_range.value() < 0.0f) { LOG(WARNING) << "Calibration statistics indicates that node `" << node->attrs.name << "` has negative input, consider use `auto` or `int8` as out_type"; diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h index 02ace6c39fac..a8cbc0b6fdf5 100644 --- a/src/operator/quantization/quantize_v2-inl.h +++ b/src/operator/quantization/quantize_v2-inl.h @@ -38,16 +38,15 @@ namespace mxnet { namespace op { struct QuantizeV2Param : public dmlc::Parameter { - enum OutType { kAuto = 0, kInt8, kUint8 }; int out_type; dmlc::optional min_calib_range; dmlc::optional max_calib_range; DMLC_DECLARE_PARAMETER(QuantizeV2Param) { DMLC_DECLARE_FIELD(out_type) - .add_enum("auto", kAuto) - .add_enum("int8", kInt8) - .add_enum("uint8", kUint8) - .set_default(kInt8) + .add_enum("auto", QuantizeOutType::kAuto) + .add_enum("int8", QuantizeOutType::kInt8) + .add_enum("uint8", QuantizeOutType::kUint8) + .set_default(QuantizeOutType::kInt8) .describe("Output data type. 
`auto` can be specified to automatically determine output type " "according to min_calib_range."); DMLC_DECLARE_FIELD(min_calib_range) @@ -61,26 +60,6 @@ struct QuantizeV2Param : public dmlc::Parameter { } }; -static mshadow::TypeFlag GetOutputType(const QuantizeV2Param &param) { - auto out_type = mshadow::kInt8; - if (param.out_type == QuantizeV2Param::OutType::kAuto) { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - if (param.min_calib_range.value() >= 0.0) { - out_type = mshadow::kUint8; - } else { - out_type = mshadow::kInt8; - } - } - } else if (param.out_type == QuantizeV2Param::OutType::kInt8) { - out_type = mshadow::kInt8; - } else if (param.out_type == QuantizeV2Param::OutType::kUint8) { - out_type = mshadow::kUint8; - } else { - LOG(FATAL) << "Unsupported out_type in params: " << param.out_type; - } - return out_type; -} @@ -125,95 +104,14 @@ struct quantize_v2_zero_centered { } }; -template -void QuantizeV2Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &inputs, const std::vector &req, - const std::vector &outputs) { - using namespace mshadow; - using namespace mxnet_op; - typedef float SrcDType; - using mshadow::red::limits::MaxValue; - using mshadow::red::limits::MinValue; - Stream *s = ctx.get_stream(); - const QuantizeV2Param &param = nnvm::get(attrs.parsed); - auto out_type = GetOutputType(param); - if (out_type == mshadow::kUint8 && std::is_same::value) { - LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " - "please switch to the context of CPU or int8 data type for GPU."; - } - - if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - *outputs[1].dptr() = param.min_calib_range.value(); - *outputs[2].dptr() = param.max_calib_range.value(); - } else { - if (inputs[0].type_flag_ == mshadow::kUint8) { - *outputs[1].dptr() = 0; - *outputs[2].dptr() = 255; - } else { - *outputs[1].dptr() = -127; - *outputs[2].dptr() = 127; - } - } - UnaryOp::IdentityCompute(attrs, ctx, {inputs[0]}, req, outputs); - } else { - if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { - if (out_type == mshadow::kUint8) { - Kernel::Launch( - s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), - outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), - param.max_calib_range.value(), MinValue(), MaxValue()); - } else if (out_type == mshadow::kInt8) { // zero-centered quantization - Kernel::Launch( - s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), - outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), - param.max_calib_range.value(), MinAbs(MaxValue(), MinValue())); - } else { - LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; - } - } else { // model is not calibrated - mxnet::TShape src_shape, dst_shape; - const size_t actual_float_size = sizeof(float); - const size_t temp_reduce_size = ConfigReduce( - s, inputs[0].shape_, mxnet::TShape({1}), &src_shape, &dst_shape); - Tensor temp_space = ctx.requested[0].get_space_typed( - Shape1(2 * actual_float_size + temp_reduce_size), s); - const int dev_id = ctx.run_ctx.ctx.dev_id; - TBlob in_min_t(reinterpret_cast(temp_space.dptr_), Shape1(1), xpu::kDevMask, - dev_id); - TBlob in_max_t(reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask, - dev_id); - Tensor workspace(temp_space.dptr_ + 2 * actual_float_size, - Shape1(temp_reduce_size), s); - broadcast::Reduce( - s, in_min_t.reshape(dst_shape), kWriteTo,
workspace, inputs[0].reshape(src_shape)); - broadcast::Reduce( - s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); - if (out_type == mshadow::kUint8) { - Kernel::Launch( - s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), - outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), - in_max_t.dptr(), MinValue(), MaxValue()); - } else if (out_type == mshadow::kInt8) { // zero-centered quantization - Kernel::Launch( - s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), - outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), - in_max_t.dptr(), MinAbs(MaxValue(), MinValue())); - } else { - LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; - } - } - } -} - -static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, mxnet::ShapeVector *in_attrs, - mxnet::ShapeVector *out_attrs) { +static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 3U); SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); - SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape{1}); - SHAPE_ASSIGN_CHECK(*out_attrs, 2, mxnet::TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1}); return !shape_is_none(out_attrs->at(0)); } @@ -224,7 +122,7 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector const QuantizeV2Param &param = nnvm::get(attrs.parsed); CHECK(in_attrs->at(0) == mshadow::kFloat32 || in_attrs->at(0) == mshadow::kUint8 || in_attrs->at(0) == mshadow::kInt8); - auto out_type = GetOutputType(param); + auto out_type = GetQuantizeOutputType(param); if (out_type == mshadow::kUint8) { TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); } else if (out_type == mshadow::kInt8) { @@ -237,6 +135,102 @@ static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector return (*in_attrs)[0] != -1; } +template +class QuantizeV2Operator { + public: + explicit QuantizeV2Operator(const nnvm::NodeAttrs &attrs) : attrs_(attrs) {} + + void Forward(const OpContext &ctx, const std::vector &inputs, + const std::vector &req, const std::vector &outputs) { + using namespace mshadow; + using namespace mxnet_op; + typedef float SrcDType; + using mshadow::red::limits::MaxValue; + using mshadow::red::limits::MinValue; + Stream *s = ctx.get_stream(); + const QuantizeV2Param &param = nnvm::get(attrs_.parsed); + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8 && std::is_same::value) { + LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " + "please switch to the context of CPU or int8 data type for GPU."; + } + + if (inputs[0].type_flag_ == mshadow::kUint8 || inputs[0].type_flag_ == mshadow::kInt8) { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + *outputs[1].dptr() = param.min_calib_range.value(); + *outputs[2].dptr() = param.max_calib_range.value(); + } else { + if (inputs[0].type_flag_ == mshadow::kUint8) { + *outputs[1].dptr() = 0; + *outputs[2].dptr() = 255; + } else { + *outputs[1].dptr() = -127; + *outputs[2].dptr() = 127; + } + } + UnaryOp::IdentityCompute(attrs_, ctx, {inputs[0]}, req, outputs); + } else { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(),
param.max_calib_range.value(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), + param.max_calib_range.value(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } else { // model is not calibrated + mxnet::TShape src_shape, dst_shape; + const size_t actual_float_size = sizeof(float); + const size_t temp_reduce_size = ConfigReduce( + s, inputs[0].shape_, mxnet::TShape(1, 1), &src_shape, &dst_shape); + Tensor temp_space = ctx.requested[0].get_space_typed( + Shape1(2 * actual_float_size + temp_reduce_size), s); + const int dev_id = ctx.run_ctx.ctx.dev_id; + TBlob in_min_t(reinterpret_cast(temp_space.dptr_), Shape1(1), xpu::kDevMask, + dev_id); + TBlob in_max_t(reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask, + dev_id); + Tensor workspace(temp_space.dptr_ + 2 * actual_float_size, + Shape1(temp_reduce_size), s); + broadcast::Reduce( + s, in_min_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); + broadcast::Reduce( + s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } + } + } + + private: + nnvm::NodeAttrs attrs_; +}; + +template +static void QuantizeV2Forward(const OpStatePtr &state_ptr, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs) { + auto &op = state_ptr.get_state>(); + op.Forward(ctx, inputs, req, outputs); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_ diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc index 920100bc9f8b..e9017a58a82c 100644 --- a/src/operator/quantization/quantize_v2.cc +++ b/src/operator/quantization/quantize_v2.cc @@ -47,6 +47,22 @@ static bool QuantizeV2StorageType(const nnvm::NodeAttrs& attrs, const int dev_ma return true; } +static OpStatePtr CreateQuantizeV2State(const nnvm::NodeAttrs& attrs, Context ctx, + const std::vector& in_shapes, + const std::vector& in_types) { + OpStatePtr state; + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(attrs); + } else { +#if MXNET_USE_MKLDNN == 1 + state = OpStatePtr::Create(attrs); +#else + state = OpStatePtr::Create>(attrs); +#endif + } + return state; +} + NNVM_REGISTER_OP(_contrib_quantize_v2) .describe(R"code(Quantize a input tensor from float to `out_type`, with user-specified `min_calib_range` and `max_calib_range` or the input range collected at runtime. @@ -86,11 +102,12 @@ If min_calib_range isn't presented, the output type will be int8. // TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, // will be reverted after the improvement of CachedOP is done. 
.set_attr("FGradient", MakeZeroGradNodes) +.set_attr("FCreateOpState", CreateQuantizeV2State) #if MXNET_USE_MKLDNN == 1 .set_attr("TIsMKLDNN", true) -.set_attr("FComputeEx", MKLDNNQuantizeV2Compute) +.set_attr("FStatefulComputeEx", SgMKLDNNQuantizeForward) #endif -.set_attr("FCompute", QuantizeV2Compute) +.set_attr("FStatefulCompute", QuantizeV2Forward) .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) diff --git a/src/operator/quantization/quantize_v2.cu b/src/operator/quantization/quantize_v2.cu index ab0cf9c5ad0e..7acdf56082e5 100644 --- a/src/operator/quantization/quantize_v2.cu +++ b/src/operator/quantization/quantize_v2.cu @@ -28,7 +28,7 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_contrib_quantize_v2) -.set_attr("FCompute", QuantizeV2Compute); +.set_attr("FStatefulCompute", QuantizeV2Forward); } // namespace op } // namespace mxnet diff --git a/src/operator/quantization/quantized_activation.cc b/src/operator/quantization/quantized_activation.cc new file mode 100644 index 000000000000..4ab74d0b1c3f --- /dev/null +++ b/src/operator/quantization/quantized_activation.cc @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2019 by Contributors + * \file quantized_activation.cc +*/ +#include +#include "../nn/activation-inl.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { + +bool QuantizedActivationShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + CHECK_EQ(in_shape->size(), 3U); + if (shape_is_none(in_shape->at(0))) return false; + SHAPE_ASSIGN_CHECK(*in_shape, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*in_shape, 2, TShape{1}); + out_shape->clear(); + out_shape->push_back((*in_shape)[0]); + out_shape->push_back(TShape{1}); + out_shape->push_back(TShape{1}); + return true; +} + +bool QuantizedActivationType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const ActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_type->size(), 3U); + CHECK_EQ(out_type->size(), 3U); + if (param.act_type == activation::kReLU) { + TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "_contrib_quantized_act only supports act_type=relu for now"; + } + TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32); + return true; +} + +inline static bool QuantizedActivationStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 3); + + *dispatch_mode = DispatchMode::kFCompute; +#if MXNET_USE_MKLDNN == 1 + const ActivationParam &param = nnvm::get(attrs.parsed); + if (dev_mask == mshadow::cpu::kDevMask && param.act_type == activation::kReLU) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#else + CHECK_EQ(out_attrs->size(), 3); +#endif + for (int& out_attr : *out_attrs) + out_attr = kDefaultStorage; + return true; +} + +NNVM_REGISTER_OP(_contrib_quantized_act) +.describe(R"code(Activation operator for input and output data type of int8. +The input and output data comes with min and max thresholds for quantizing +the float32 data into int8. + +.. Note:: + This operator only supports forward propagation. DO NOT use it in training.
+ This operator only supports `relu`)code" ADD_FILELINE) +.set_num_inputs(3) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "min_data", "max_data"}; + }) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "min_output", "max_output"}; + }) +.set_attr("FInferType", QuantizedActivationType) +.set_attr("FInferShape", QuantizedActivationShape) +.set_attr("FInferStorageType", QuantizedActivationStorageType) +.set_attr("FNeedRequantize", + [](const NodeAttrs& attrs) { + const ActivationParam& param = nnvm::get(attrs.parsed); + CHECK(param.act_type == activation::kReLU) + << "_contrib_quantized_act only supports act_type=relu for now"; + return false; + }) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_argument("min_data", "NDArray-or-Symbol", "Minimum value of data.") +.add_argument("max_data", "NDArray-or-Symbol", "Maximum value of data.") +.add_arguments(ActivationParam::__FIELDS__()); + +NNVM_REGISTER_OP(Activation) +.set_attr("FQuantizedOp", [](const NodeAttrs& attrs) { + ActivationParam param; + param.Init(attrs.dict); + nnvm::NodePtr node = nnvm::Node::Create(); + if (param.act_type == activation::kReLU) { + node->attrs.op = Op::Get("_contrib_quantized_act"); + node->attrs.name = "quantized_" + attrs.name; + } else { + node->attrs.op = nullptr; + node->attrs.name = attrs.name; + } + node->attrs.dict = attrs.dict; + if (node->op()->attr_parser != nullptr) { + node->op()->attr_parser(&(node->attrs)); + } + return node; +}); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantized_concat.cc b/src/operator/quantization/quantized_concat.cc index e32bb5a18e1a..d6aeb41da1f8 100644 --- a/src/operator/quantization/quantized_concat.cc +++ b/src/operator/quantization/quantized_concat.cc @@ -35,34 +35,34 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_sha CHECK_EQ(out_shape->size(), 3U); mxnet::TShape dshape; index_t size = 0; - bool has_zero = false; + bool has_unknown_dim_size = false; int axis = -1; for (int i = 0; i < param_.num_args; ++i) { mxnet::TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - has_zero = tmp[axis] == 0 || has_zero; + has_unknown_dim_size = !mxnet::dim_size_is_known(tmp, axis) || has_unknown_dim_size; size += tmp[axis]; - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } } mxnet::TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { + if (tmp.ndim() > 0) { axis = CheckAxis(param_.dim, tmp.ndim()); - tmp[axis] = 0; + tmp[axis] = -1; shape_assign(&dshape, tmp); } - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; for (int i = 0; i < param_.num_args; ++i) { CHECK(shape_assign(&(*in_shape)[i], dshape)) << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; } - if (!has_zero) dshape[axis] = size; + if (!has_unknown_dim_size) dshape[axis] = size; CHECK(shape_assign(&(*out_shape)[0], dshape)) << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; @@ -71,7 +71,7 @@ static bool ConcatShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector* in_sha } SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape{1}); SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape{1}); - return dshape.Size() != 0; + return shape_is_known(dshape); } static bool ConcatType(const nnvm::NodeAttrs& attrs, std::vector* in_type, diff --git 
a/src/operator/quantization/quantized_conv.cc b/src/operator/quantization/quantized_conv.cc index 1a801ee50744..aa3f5ce1ad61 100644 --- a/src/operator/quantization/quantized_conv.cc +++ b/src/operator/quantization/quantized_conv.cc @@ -78,8 +78,8 @@ bool QuantizedConvShape(const nnvm::NodeAttrs& attrs, oshape[W] = (AddPad(dshape[W], param.pad[1]) - wshape[W]) / param.stride[1] + 1; SHAPE_ASSIGN_CHECK(*out_shape, 0, oshape); - SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape({1})); - SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape({1})); + SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape(1, 1)); return true; } diff --git a/src/operator/quantization/quantized_conv.cu b/src/operator/quantization/quantized_conv.cu index ee688c0648c8..23c41a17ef4a 100644 --- a/src/operator/quantization/quantized_conv.cu +++ b/src/operator/quantization/quantized_conv.cu @@ -174,7 +174,7 @@ class QuantizedCuDNNConvOp { // of in_data[0] and in_data[1]. Need to rescale the min/max range of out_data // based on the min/max ranges of in_data[0] and in_data[1]. const size_t num_inputs = param_.no_bias ? 2 : 3; - mxnet_op::Kernel::Launch(s, 1, + mxnet_op::Kernel::Launch(s, 1, out_data[1].dptr(), out_data[2].dptr(), in_data[num_inputs].dptr(), in_data[num_inputs+1].dptr(), in_data[num_inputs+2].dptr(), in_data[num_inputs+3].dptr()); diff --git a/src/operator/quantization/quantized_flatten-inl.h b/src/operator/quantization/quantized_flatten-inl.h index 99a262de19ca..de051b969659 100644 --- a/src/operator/quantization/quantized_flatten-inl.h +++ b/src/operator/quantization/quantized_flatten-inl.h @@ -86,10 +86,10 @@ inline bool QuantizedFlattenShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 3U); const mxnet::TShape &dshape = (*in_attrs)[0]; - if (shape_is_none(dshape)) return false; + if (!shape_is_known(dshape)) return false; - uint32_t target_dim = 1; - for (uint32_t i = 1; i < dshape.ndim(); ++i) { + dim_t target_dim = 1; + for (int i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc index 0a04e71b9093..ceac0b6ec9a0 100644 --- a/src/operator/quantization/quantized_fully_connected.cc +++ b/src/operator/quantization/quantized_fully_connected.cc @@ -47,7 +47,7 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_shape->size(), num_inputs * 3); CHECK_EQ(out_shape->size(), 3U); - CHECK(!shape_is_none(in_shape->at(0))) + CHECK(shape_is_known(in_shape->at(0))) << "QuantizedFullyConnectedOp input data shape must be given"; const mxnet::TShape& dshape = in_shape->at(0); index_t num_input; @@ -75,8 +75,8 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs, } else { SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); } - SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape({1})); - SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape({1})); + SHAPE_ASSIGN_CHECK(*out_shape, 1, mxnet::TShape(1, 1)); + SHAPE_ASSIGN_CHECK(*out_shape, 2, mxnet::TShape(1, 1)); return true; } @@ -233,7 +233,7 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs, Tensor max_weight = in_data[num_inputs + quantized_fullc::kWeightMax].get(s); - Kernel::Launch(s, 1, min_output.dptr_, + Kernel::Launch(s, 1, min_output.dptr_, max_output.dptr_, min_data.dptr_, max_data.dptr_, min_weight.dptr_, max_weight.dptr_); if (!param.no_bias) { Tensor bias = in_data[fullc::kBias].get_with_shape( diff 
--git a/src/operator/quantization/quantized_fully_connected.cu b/src/operator/quantization/quantized_fully_connected.cu index d1cbdc98d535..04680c8c2b78 100644 --- a/src/operator/quantization/quantized_fully_connected.cu +++ b/src/operator/quantization/quantized_fully_connected.cu @@ -109,7 +109,7 @@ void QuantizedFullyConnectedForwardGPU(const nnvm::NodeAttrs& attrs, cmp_type, CUBLAS_GEMM_DFALT)); - Kernel::Launch(s, 1, + Kernel::Launch(s, 1, outputs[1].dptr(), outputs[2].dptr(), inputs[num_inputs].dptr(), inputs[num_inputs+1].dptr(), inputs[num_inputs+2].dptr(), inputs[num_inputs+3].dptr()); diff --git a/src/operator/quantization/quantized_pooling.cc b/src/operator/quantization/quantized_pooling.cc index af604080a756..1839e2a29d77 100644 --- a/src/operator/quantization/quantized_pooling.cc +++ b/src/operator/quantization/quantized_pooling.cc @@ -35,7 +35,7 @@ bool QuantizedPoolingShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *out_shape) { const PoolingParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), 3U); - if (shape_is_none(in_shape->at(0))) return false; + if (!shape_is_known(in_shape->at(0))) return false; const mxnet::TShape &dshape = (*in_shape)[0]; CHECK_EQ(dshape.ndim(), 4U) << "quantized_pooling: Input data should be 4D in " @@ -45,7 +45,7 @@ bool QuantizedPoolingShape(const nnvm::NodeAttrs& attrs, << "QuantizedPoolingOp only supports NCHW layout for now, saw " << layout; // NCHW layout const int N = 0, H = 2, W = 3, C = 1; - mxnet::TShape oshape(4); + mxnet::TShape oshape(4, -1); CHECK_EQ(param.kernel.ndim(), 2) << "QuantizedPoolingOp only supports 2D pooling for now"; CHECK(param.kernel[0] <= dshape[H] + 2 * param.pad[0]) << "kernel size (" << param.kernel[0] diff --git a/src/operator/quantization/requantize-inl.h b/src/operator/quantization/requantize-inl.h index 21d58d4607eb..2bdc3a712961 100644 --- a/src/operator/quantization/requantize-inl.h +++ b/src/operator/quantization/requantize-inl.h @@ -38,9 +38,17 @@ namespace mxnet { namespace op { struct RequantizeParam : public dmlc::Parameter { + int out_type; dmlc::optional min_calib_range; // min float value calculated from calibration dataset dmlc::optional max_calib_range; // max float value calculated from calibration dataset DMLC_DECLARE_PARAMETER(RequantizeParam) { + DMLC_DECLARE_FIELD(out_type) + .add_enum("auto", QuantizeOutType::kAuto) + .add_enum("int8", QuantizeOutType::kInt8) + .add_enum("uint8", QuantizeOutType::kUint8) + .set_default(QuantizeOutType::kInt8) + .describe("Output data type. 
`auto` can be specified to automatically determine output type " + "according to min_calib_range."); DMLC_DECLARE_FIELD(min_calib_range) .set_default(dmlc::optional()) .describe("The minimum scalar value in the form of float32 obtained " @@ -59,10 +67,18 @@ inline bool RequantizeType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 3U); CHECK_EQ(out_attrs->size(), 3U); + const RequantizeParam &param = nnvm::get(attrs.parsed); TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt32); TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32); - TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); + } else if (out_type == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "requantize op only supports int8 and uint8 as output type"; + } TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); return (*in_attrs)[0] != -1; @@ -100,6 +116,11 @@ void RequantizeForward(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); const RequantizeParam& param = nnvm::get(attrs.parsed); + auto out_type = GetQuantizeOutputType(param); + if (out_type == mshadow::kUint8 && std::is_same::value) { + LOG(FATAL) << "currently, uint8 quantization is only supported by CPU, " + "please switch to the context of CPU or int8 data type for GPU."; + } if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { Kernel::Launch(s, inputs[0].Size(), @@ -111,7 +132,7 @@ void RequantizeForward(const nnvm::NodeAttrs& attrs, const size_t actual_float_size = sizeof(float); const size_t actual_quantized_size = sizeof(SrcDType); const size_t temp_reduce_size = ConfigReduce( - s, inputs[0].shape_, mxnet::TShape({1}), &src_shape, &dst_shape); + s, inputs[0].shape_, mxnet::TShape(1, 1), &src_shape, &dst_shape); Tensor temp_space = ctx.requested[0].get_space_typed( Shape1(2*actual_float_size+2*actual_quantized_size+temp_reduce_size), s); diff --git a/src/operator/quantization/requantize.cc b/src/operator/quantization/requantize.cc index 4807226e464c..43682383b0d6 100644 --- a/src/operator/quantization/requantize.cc +++ b/src/operator/quantization/requantize.cc @@ -61,6 +61,10 @@ inference accuracy. .set_attr_parser(ParamParser) .set_num_inputs(3) .set_num_outputs(3) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "min_range", "max_range"}; + }) .set_attr("FInferShape", QuantizeShape) .set_attr("FInferType", RequantizeType) .set_attr("FInferStorageType", RequantizeStorageType) diff --git a/src/operator/random/multisample_op.h b/src/operator/random/multisample_op.h index e9f266932e13..7d5e256297ad 100644 --- a/src/operator/random/multisample_op.h +++ b/src/operator/random/multisample_op.h @@ -66,7 +66,7 @@ inline bool MultiSampleOpShape(const nnvm::NodeAttrs& attrs, // Get shape to be sampled for each parameter set. const MultiSampleParam& param = nnvm::get(attrs.parsed); mxnet::TShape sshape = param.shape; - for (size_t i = 0; i < sshape.ndim(); ++i) { + for (int i = 0; i < sshape.ndim(); ++i) { CHECK_GT(sshape[i], 0) << "shape parameter must be non-zero within each dimension"; } // Examine output shape whether it is already defined.
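Both RequantizeType/RequantizeForward above and the quantize_v2 type inference earlier call GetQuantizeOutputType in place of the removed per-op GetOutputType. The helper itself is not shown in this diff; the following is a sketch reconstructed from the deleted GetOutputType body and the two call sites, under the assumption that it is shared (e.g. in quantization_utils.h, next to the QuantizeOutType enum) as a function template over any param type exposing out_type, min_calib_range, and max_calib_range:

template <typename QParam>
static mshadow::TypeFlag GetQuantizeOutputType(const QParam& param) {
  auto out_type = mshadow::kInt8;
  if (param.out_type == QuantizeOutType::kAuto) {
    // `auto` resolves from the calibrated range: a non-negative minimum
    // means the data fits in uint8, otherwise fall back to int8.
    if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
      out_type = (param.min_calib_range.value() >= 0.0)
                     ? mshadow::kUint8
                     : mshadow::kInt8;
    }
  } else if (param.out_type == QuantizeOutType::kInt8) {
    out_type = mshadow::kInt8;
  } else if (param.out_type == QuantizeOutType::kUint8) {
    out_type = mshadow::kUint8;
  } else {
    LOG(FATAL) << "Unsupported out_type in params: " << param.out_type;
  }
  return out_type;
}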
diff --git a/src/operator/random/sample_multinomial_op.h b/src/operator/random/sample_multinomial_op.h index e76cd646b850..b38aefbc1634 100644 --- a/src/operator/random/sample_multinomial_op.h +++ b/src/operator/random/sample_multinomial_op.h @@ -41,7 +41,7 @@ struct SampleMultinomialParam : public dmlc::Parameter { int dtype; DMLC_DECLARE_PARAMETER(SampleMultinomialParam) { DMLC_DECLARE_FIELD(shape) - .set_default(mxnet::TShape()) + .set_default(mxnet::TShape(0, 1)) .describe("Shape to be sampled from each random distribution."); DMLC_DECLARE_FIELD(get_prob) .set_default(false) @@ -68,7 +68,7 @@ inline bool SampleMultinomialOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), param.get_prob ? 2U : 1U); const mxnet::TShape& ishape = (*in_attrs)[0]; - if (!ishape.ndim()) return false; + if (!shape_is_known(ishape)) return false; MSHADOW_TYPE_SWITCH(param.dtype, DType, { CHECK_LE(ishape[ishape.ndim() - 1], mxnet::common::MaxIntegerValue()) @@ -76,26 +76,26 @@ inline bool SampleMultinomialOpShape(const nnvm::NodeAttrs& attrs, }); if (ishape.ndim() == 1) { - if (param.shape.ndim()) { + if (param.shape.ndim() > 0) { SHAPE_ASSIGN_CHECK(*out_attrs, 0, param.shape); if (param.get_prob) SHAPE_ASSIGN_CHECK(*out_attrs, 1, param.shape); } else { - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1)); - if (param.get_prob) SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape(1)); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); + if (param.get_prob) SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape(1, 1)); } return true; } - mxnet::TShape oshape(ishape.ndim() - 1 + param.shape.ndim()); - for (size_t i = 0; i < ishape.ndim() - 1; ++i) { + mxnet::TShape oshape(ishape.ndim() - 1 + param.shape.ndim(), -1); + for (int i = 0; i < ishape.ndim() - 1; ++i) { oshape[i] = ishape[i]; } - for (size_t i = 0; i < param.shape.ndim(); ++i) { + for (int i = 0; i < param.shape.ndim(); ++i) { oshape[i + ishape.ndim() - 1] = param.shape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); if (param.get_prob) SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape); - return true; + return shape_is_known(out_attrs->at(0)) && shape_is_known(out_attrs->at(1)); } diff --git a/src/operator/random/unique_sample_op.h b/src/operator/random/unique_sample_op.h index 87998c8f46b1..e88b95a8bdd6 100644 --- a/src/operator/random/unique_sample_op.h +++ b/src/operator/random/unique_sample_op.h @@ -60,7 +60,7 @@ inline bool SampleUniqueShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 0U); CHECK_EQ(out_attrs->size(), 2U); // output shape is known - if ((*out_attrs)[0].ndim() == 2 && param.shape.ndim() == 0) { + if ((*out_attrs)[0].ndim() == 2 && !mxnet::ndim_is_known(param.shape)) { SHAPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::Shape1((*out_attrs)[0][0])); return true; } diff --git a/src/operator/regression_output-inl.h b/src/operator/regression_output-inl.h index 8b63a8a2cff6..d8f102de1675 100644 --- a/src/operator/regression_output-inl.h +++ b/src/operator/regression_output-inl.h @@ -57,7 +57,7 @@ inline bool RegressionOpShape(const nnvm::NodeAttrs& attrs, using namespace mshadow; CHECK_EQ(in_attrs->size(), 2U) << "Input:[data, label]"; const mxnet::TShape &dshape = in_attrs->at(0); - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; auto &lshape = (*in_attrs)[1]; if (lshape.ndim() == 0) { // special treatment for 1D output, to allow 1D label by default. 
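The shape-inference edits in the hunks above all serve one convention change: an unknown rank or dimension size is now encoded as -1 rather than 0, so `ndim() == 0` becomes a legitimate rank-0 shape, `TShape(ndim, value)` builds an ndim-rank shape filled with value, and the helpers ndim_is_known / dim_size_is_known / shape_is_known replace raw zero tests. A small illustrative summary, assuming the semantics implied by these hunks (not code from this patch):

// Illustrative only; mirrors the convention the hunks above rely on.
mxnet::TShape s0;          // default-constructed: rank itself unknown (ndim() == -1)
mxnet::TShape s1(0, 1);    // known rank-0 shape (the new SampleMultinomialParam default)
mxnet::TShape s2(1, 1);    // rank 1 with dim 0 == 1; replaces TShape{1} / TShape(1)
mxnet::TShape s3(4, -1);   // rank 4 with every dim still unknown (quantized_pooling's oshape)

// ndim_is_known: is the rank known?  shape_is_known: are all dim sizes known too?
CHECK(!mxnet::ndim_is_known(s0));
CHECK(mxnet::ndim_is_known(s3) && !mxnet::shape_is_known(s3));
CHECK(mxnet::shape_is_known(s2));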
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h index 71ad331786ae..37f21ce6d126 100644 --- a/src/operator/rnn-inl.h +++ b/src/operator/rnn-inl.h @@ -26,6 +26,9 @@ #ifndef MXNET_OPERATOR_RNN_INL_H_ #define MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_USE_CUDNN_RNN MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 +#define USE_CUDNN_LSTM_PROJ MXNET_USE_CUDNN == 1 && CUDNN_VERSION >= 7200 + #include #include #include @@ -35,6 +38,7 @@ #include #include #include +#include #include "./math.h" #include "./math_functions-inl.h" #include "./operator_common.h" @@ -47,7 +51,7 @@ namespace rnn_enum { enum RNNOpInputs {kData, kParams, kState, kStateCell}; enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; - enum RNNOpResource {kTempSpace}; + enum RNNOpResource {kCuDNNDropoutDescSpace}; } inline int GetRnnParamSize(int num_layer, @@ -160,9 +164,8 @@ struct RNNParam : public dmlc::Parameter { uint32_t num_layers; bool bidirectional, state_outputs; int mode; - float p, pkeep_; + float p; int seq_length_, batch_size_, input_size_; - bool lstm_q_; // whether type is lstm dmlc::optional projection_size; dmlc::optional lstm_state_clip_min, lstm_state_clip_max; bool lstm_state_clip_nan; @@ -212,7 +215,6 @@ struct RNNParam : public dmlc::Parameter { } }; - /** * @params: ws: Temp workspace for gemm's output storage. * rs: Reserve space of forward intermediate data used for training. @@ -236,6 +238,7 @@ struct RNNParam : public dmlc::Parameter { * hy's shape is [num_layers, batch_size, state_size] * cy_ptr: Only used in lstm mode. pointer of tensor cy containing the cell state * for t=seq_length. cy' shape is [num_layers, batch_size, state_size] + * dropout: should be 0 <= dropout < 1 * mode: Specifies the type of RNN to compute. */ template @@ -376,58 +379,189 @@ void RNNBackward(DType* ws, } } -template -class RNNOp : public Operator{ +template +class RNNOp { public: - explicit RNNOp(RNNParam p) - :param_(p), init_space_(false), reserve_space_size_(0) { + RNNParam param_; + Context ctx_; + explicit RNNOp(RNNParam param, Context ctx) { + this->param_ = param; + this->ctx_ = ctx; + #if MXNET_USE_CUDNN_RNN + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. + // No tests in place for fp16 RNNs, so leave TensorCore disabled for now. 
+ cudnn_tensor_core_ = false; + // When fp16 RNN tests are introduced, we can enable TensorCore as follows: +// cudnn_tensor_core = +// mshadow::DataType::kFlag == mshadow::kFloat16 && GetEnvAllowTensorCore(); + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implemented"; + } +#if USE_CUDNN_LSTM_PROJ if (param_.projection_size.has_value()) { - LOG(FATAL) << "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "Projection is only supported for LSTM."; + CHECK_GE(param_.state_size, param_.projection_size.value()) + << "State size must be larger than projection size."; } +#else + CHECK(!param_.projection_size.has_value()) + << "Projection is only supported for LSTM with CuDNN version later than 7.1.1."; +#endif +#if USE_CUDNN_LSTM_PROJ if (param_.lstm_state_clip_min.has_value() || param_.lstm_state_clip_max.has_value()) { - LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + CHECK_EQ(param_.mode, rnn_enum::kLstm) + << "State clipping is only supported for LSTM."; + CHECK(param_.lstm_state_clip_min.has_value() && param_.lstm_state_clip_max.has_value()) + << "lstm_state_clip_min and lstm_state_clip_max must be specified together."; + CHECK_GE(param_.lstm_state_clip_max.value(), param_.lstm_state_clip_min.value()) + << "lstm_state_clip_max must be greater than or equal to lstm_state_clip_min"; + } +#else + CHECK(!param_.lstm_state_clip_min.has_value() + && !param_.lstm_state_clip_max.has_value()) + << "State clipping is only supported for LSTM with CuDNN version later than 7.2.1."; +#endif + // RNN Direction + direction_ = param_.bidirectional ?
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Create descriptors + CUDNN_CALL(cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_CALL(cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_)); + CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); + + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&y_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dx_data_desc_)); + CUDNN_CALL(cudnnCreateRNNDataDescriptor(&dy_data_desc_)); + #endif + #else + if (ctx_.dev_type == kGPU) { + LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; + } + #endif + + if (ctx_.dev_type == kCPU) { + this->init_space_ = false; + this->temp_init_space_ = false; + this->reserve_cpu_space_size_ = 0; + this->temp_cpu_space_size_ = 0; + + if (param_.projection_size.has_value()) { + LOG(FATAL) << + "hidden layer projection is only supported for GPU with CuDNN later than 7.1.1"; + } + if (param_.lstm_state_clip_min.has_value() + || param_.lstm_state_clip_max.has_value()) { + LOG(FATAL) << "LSTM state clipping is only supported for GPU with CuDNN later than 7.2.1"; + } } } ~RNNOp() { - if (init_space_) { + #if MXNET_USE_CUDNN_RNN + CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_CALL(cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_)); + CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_)); + CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); + + if (init_cudnn_) { + for (size_t i = 0; i < x_desc_vec_.size(); ++i) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_vec_[i])); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_vec_[i])); + } + init_cudnn_ = false; + Storage::Get()->Free(temp_space_); Storage::Get()->Free(reserve_space_); - init_space_ = false; + } + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(x_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(y_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dx_data_desc_)); + CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_)); + #endif + #endif + + if (ctx_.dev_type == kCPU) { + if (init_space_) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (temp_init_space_) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; + } } } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const std::vector &in_data, + const 
std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - Stream *s = ctx.get_stream(); - // get input + output tensor - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; - const int direction = param_.bidirectional ? 2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); DType* b_ptr = w.dptr_ + w.shape_[0] - bsize; @@ -438,124 +572,308 @@ class RNNOp : public Operator{ } DType* cx_ptr = NULL; DType* cy_ptr = NULL; - - if (param_.mode == rnn_enum::kLstm) { - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - cy_ptr = out_data[rnn_enum::kStateCellOut].dptr(); - } + if (param_.mode == rnn_enum::kLstm) + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + if (param_.mode == rnn_enum::kLstm && param_.state_outputs) + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) + if (!init_cudnn_) { + Init(ctx, s, in_data, out_data); } - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); + #if USE_CUDNN_LSTM_PROJ + std::vector seqLengthArray(param_.batch_size_, param_.seq_length_); + CUDNN_CALL(cudnnSetRNNDataDescriptor(x_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + int out_size = + (param_.projection_size.has_value()) ? param_.projection_size.value() : param_.state_size; + out_size = (param_.bidirectional) ? 
(out_size * 2) : out_size; + CUDNN_CALL(cudnnSetRNNDataDescriptor(y_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + if (ctx.is_train) { + CUDNN_CALL(cudnnSetRNNDataDescriptor(dx_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + seqLengthArray.data(), + nullptr)); + CUDNN_CALL(cudnnSetRNNDataDescriptor(dy_data_desc_, + dtype_, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + param_.seq_length_, + param_.batch_size_, + out_size, + seqLengthArray.data(), + nullptr)); + } + #endif + + #if USE_CUDNN_LSTM_PROJ + bool clip_state = param_.lstm_state_clip_min.has_value(); + bool clip_nan = param_.lstm_state_clip_nan; + CUDNN_CALL(cudnnRNNSetClip(s->dnn_handle_, + rnn_desc_, + clip_state ? CUDNN_RNN_CLIP_MINMAX : CUDNN_RNN_CLIP_NONE, + clip_nan ? CUDNN_NOT_PROPAGATE_NAN : CUDNN_PROPAGATE_NAN, + clip_state ? param_.lstm_state_clip_min.value() : 0.0, + clip_state ? param_.lstm_state_clip_max.value() : 0.0)); + #endif if (ctx.is_train) { - const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (init_space_ && reserve_space_size_ < r_size) { - Storage::Get()->Free(reserve_space_); - init_space_ = false; + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardTrainingEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space_.dptr, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space_.dptr, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + } else { + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNForwardInferenceEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_data_desc_, + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + temp_space_.dptr, + workspace_byte_)); + #else + CUDNN_CALL(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space_.dptr, + workspace_byte_)); + #endif + } + #endif + + if (ctx_.dev_type == kCPU) { + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + if (temp_init_space_ && temp_cpu_space_size_ < work_cpu_space_size) { + Storage::Get()->Free(temp_cpu_space_); + temp_init_space_ = false; } - - if (!init_space_) { - reserve_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); - reserve_space_size_ = r_size; - init_space_ = true; + if (!temp_init_space_) { + temp_cpu_space_ = Storage::Get()->Alloc + (work_cpu_space_size * 
sizeof(DType), Context::CPU()); + temp_cpu_space_size_ = work_cpu_space_size; + temp_init_space_ = true; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); + if (ctx.is_train) { + const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); + if (init_space_ && reserve_cpu_space_size_ < r_size) { + Storage::Get()->Free(reserve_cpu_space_); + init_space_ = false; + } + if (!init_space_) { + reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU()); + reserve_cpu_space_size_ = r_size; + init_space_ = true; + } + + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + + RNNForwardTraining(work_cpu_space, + reserve_space_ptr, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.p, + param_.mode); + } else { + RNNForwardInference(work_cpu_space, + param_.state_outputs, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + b_ptr, + y.dptr_, + hy_ptr, + cy_ptr, + param_.mode); } - - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - RNNForwardTraining(workspace.dptr_, - reserve_space_ptr, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.p, - param_.mode); - } else { - RNNForwardInference(workspace.dptr_, - param_.state_outputs, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - b_ptr, - y.dptr_, - hy_ptr, - cy_ptr, - param_.mode); } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK(param_.p >= 0.0f && param_.p < 1.0f) << "unsupported dropout value, should be 0 <= dropout < 1"; - size_t in_expected = (param_.mode == rnn_enum::kLstm) ? 4 : 3; - size_t out_expected = (param_.mode == rnn_enum::kLstm) ? 3 : 2; - if (!param_.state_outputs) { - out_expected = 1; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; } - CHECK_EQ(in_data.size(), in_expected); - CHECK_EQ(out_data.size(), out_expected); - CHECK_EQ(in_grad.size(), in_expected); - CHECK_EQ(out_grad.size(), out_expected); - CHECK_EQ(req.size(), in_expected); + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + CHECK_EQ(in_grad.size(), num_inputs); + CHECK_EQ(out_grad.size(), num_outputs); + CHECK_EQ(req.size(), num_inputs); CHECK_NE(req[rnn_enum::kData], kAddTo) << "AddTo is not supported for data"; CHECK_NE(req[rnn_enum::kState], kAddTo) << "AddTo is not supported for state"; - mshadow::Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); // get input + output tensors - Tensor x = in_data[rnn_enum::kData].get(s); - Tensor w = in_data[rnn_enum::kParams].get(s); - Tensor hx = in_data[rnn_enum::kState].get(s); - Tensor y = out_data[rnn_enum::kOut].get(s); - Tensor dx = in_grad[rnn_enum::kData].get(s); - Tensor dw = in_grad[rnn_enum::kParams].get(s); - Tensor dhx = in_grad[rnn_enum::kState].get(s); - Tensor dy = out_grad[rnn_enum::kOut].get(s); - CHECK(x.CheckContiguous()); - CHECK(w.CheckContiguous()); - CHECK(hx.CheckContiguous()); - CHECK(y.CheckContiguous()); - CHECK(dx.CheckContiguous()); - CHECK(dw.CheckContiguous()); - CHECK(dhx.CheckContiguous()); - CHECK(dy.CheckContiguous()); + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(dw.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(dhx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + CHECK_EQ(dy.CheckContiguous(), true); + CHECK_EQ(dx.CheckContiguous(), true); + + if (req[rnn_enum::kParams] != kAddTo) { + dw = mshadow::expr::ScalarExp(0.0f); + } + param_.seq_length_ = x.shape_[0]; param_.batch_size_ = x.shape_[1]; param_.input_size_ = x.shape_[2]; const int direction = param_.bidirectional ? 
2 : 1; const int bsize = GetRnnBiasSize(param_.num_layers, param_.state_size, direction, param_.mode); + DType* db_ptr = dw.dptr_ + w.shape_[0] - bsize; DType * dhy_ptr = NULL; @@ -563,260 +881,582 @@ class RNNOp : public Operator{ dhy_ptr = out_grad[rnn_enum::kStateOut].dptr(); } - DType * cx_ptr = NULL; - DType * dcx_ptr = NULL; - DType * dcy_ptr = NULL; + DType* dcx_ptr = NULL; + DType* dcy_ptr = NULL; + DType* cx_ptr = NULL; if (param_.mode == rnn_enum::kLstm) { CHECK_NE(req[rnn_enum::kStateCell], kAddTo) << "AddTo is not supported for state cell"; - cx_ptr = in_data[rnn_enum::kStateCell].dptr(); - dcx_ptr = in_grad[rnn_enum::kStateCell].dptr(); - if (param_.state_outputs) { - dcy_ptr = out_grad[rnn_enum::kStateCellOut].dptr(); - } + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; } + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; - // allocate temp space - const size_t workspace_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, - param_.state_size, direction, param_.mode); - Tensor workspace = ctx.requested[rnn_enum::kTempSpace] - .get_space_typed(Shape1(workspace_size), s); - - size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, - param_.seq_length_, param_.batch_size_, - param_.state_size, param_.mode); - if (!init_space_ || reserve_space_size_ != r_size) { - LOG(FATAL) << "Check forward init error"; + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) + if (!init_cudnn_) { + Init(ctx, s, in_data, out_data); } - DType* reserve_space_ptr = static_cast(reserve_space_.dptr); - RNNBackward(workspace.dptr_, - reserve_space_ptr, - param_.num_layers, - direction, - param_.seq_length_, - param_.batch_size_, - param_.input_size_, - param_.state_size, - x.dptr_, - hx.dptr_, - cx_ptr, - w.dptr_, - y.dptr_, - dy.dptr_, - dhy_ptr, - dcy_ptr, - dx.dptr_, - dhx.dptr_, - dcx_ptr, - dw.dptr_, - db_ptr, - req[rnn_enum::kData], - req[rnn_enum::kParams], - req[rnn_enum::kState], - // State cell should be present for LSTMs, but is absent for other RNNs. - param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, - param_.p, - param_.mode); - } - - private: - RNNParam param_; - bool init_space_; - size_t reserve_space_size_; - Storage::Handle reserve_space_; -}; // class RNNOp + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnRNNBackwardDataEx(s->dnn_handle_, + rnn_desc_, + y_data_desc_, + y.dptr_, + dy_data_desc_, + dy.dptr_, + nullptr, + nullptr, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_data_desc_, + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + nullptr, + nullptr, + temp_space_.dptr, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_, + rnn_desc_, + x_data_desc_, + x.dptr_, + hx_desc_, + hx.dptr_, + y_data_desc_, + y.dptr_, + temp_space_.dptr, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #else + CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space_.dptr, + workspace_byte_, + reserve_space_.dptr, + reserve_space_byte_)); + CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space_.dptr, + workspace_byte_, + dw_desc_, + dw.dptr_, + reserve_space_.dptr, + reserve_space_byte_)); + #endif + #endif + + if (ctx_.dev_type == kCPU) { + // allocate temp space + const size_t work_cpu_space_size = + GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_, + param_.state_size, direction, param_.mode); + if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) { + LOG(FATAL) << "Check temp init error"; + } + DType* work_cpu_space = static_cast(temp_cpu_space_.dptr); + size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction, + param_.seq_length_, param_.batch_size_, + param_.state_size, param_.mode); -template -Operator* CreateOp(RNNParam param, int dtype); + if (!init_space_ || reserve_cpu_space_size_ != r_size) { + LOG(FATAL) << "Check forward init error"; + } -#if DMLC_USE_CXX11 -class RNNProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (param_.mode == rnn_enum::kLstm) { - return {"data", "parameters", "state", "state_cell"}; - } else { - return {"data", "parameters", "state"}; + DType* reserve_space_ptr = static_cast(reserve_cpu_space_.dptr); + RNNBackward(work_cpu_space, + reserve_space_ptr, + param_.num_layers, + direction, + param_.seq_length_, + param_.batch_size_, + param_.input_size_, + param_.state_size, + x.dptr_, + hx.dptr_, + cx_ptr, + w.dptr_, + y.dptr_, + dy.dptr_, + dhy_ptr, + dcy_ptr, + dx.dptr_, + dhx.dptr_, + dcx_ptr, + dw.dptr_, + db_ptr, + req[rnn_enum::kData], + req[rnn_enum::kParams], + req[rnn_enum::kState], + // State cell should be present for LSTMs, but is absent for other RNNs. + param_.mode == rnn_enum::kLstm ? 
req[rnn_enum::kStateCell] : kNullOp, + param_.p, + param_.mode); } } - std::vector ListOutputs() const override { - std::vector outputs = {"output"}; - if (!param_.state_outputs) - return outputs; - else - outputs.emplace_back("state"); - if (param_.mode == rnn_enum::kLstm) - outputs.emplace_back("state_cell"); - return outputs; - } - - int NumOutputs() const override { - int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; - int num_outputs = param_.state_outputs ? (mode_num + 1) : 1; - return num_outputs; - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(mxnet::ShapeVector *in_shape, - mxnet::ShapeVector *out_shape, - mxnet::ShapeVector *aux_shape) const override { + private: + inline void Init(const OpContext &ctx, + mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { using namespace mshadow; - if (param_.mode == rnn_enum::kLstm) { - CHECK_EQ(in_shape->size(), 4U) << "Input:[data, parameters, state, cell_state]"; - } else { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, parameters, state]"; - } - const mxnet::TShape &dshape = (*in_shape)[rnn_enum::kData]; - if (dshape.ndim() == 0) return false; - CHECK_EQ(dshape.ndim(), 3U) \ - << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; - // data: [sequence len, batch, input dimension] - int batch_size = dshape[1]; - int input_size = dshape[2]; - int numDirections = param_.bidirectional ? 2 : 1; - int total_layers = numDirections * param_.num_layers; // double for bidirectional - int layer_size = (param_.projection_size.has_value()) ? - param_.projection_size.value() : param_.state_size; - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kState, - Shape3(total_layers, batch_size, layer_size)); - if (param_.mode == rnn_enum::kLstm) - SHAPE_ASSIGN_CHECK(*in_shape, - rnn_enum::kStateCell, - Shape3(total_layers, batch_size, param_.state_size)); - - // calculate parameter vector length - int param_size = GetRnnParamSize(param_.num_layers, - input_size, - param_.state_size, - numDirections, - param_.mode, - param_.projection_size); - SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); - - out_shape->clear(); - // output: [sequence len, batch, output size] - mxnet::TShape oshape = dshape; - if (param_.projection_size.has_value()) { - oshape[2] = numDirections * param_.projection_size.value(); - } else { - oshape[2] = numDirections * param_.state_size; + size_t num_inputs = (param_.mode == rnn_enum::kLstm) ? 4 : 3; + // kOut + size_t num_outputs = 1; + if (param_.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (param_.mode == rnn_enum::kLstm) ? 
3 : 2; } - out_shape->push_back(oshape); - if (!param_.state_outputs) { - return true; - } else { - // outStateShape: [layer_num, batch, state size] - mxnet::TShape outStateShape = dshape; - outStateShape[0] = total_layers; - outStateShape[1] = batch_size; - if (param_.projection_size.has_value()) { - outStateShape[2] = param_.projection_size.value(); - } else { - outStateShape[2] = param_.state_size; + + CHECK_EQ(in_data.size(), num_inputs); + CHECK_EQ(out_data.size(), num_outputs); + + #if MXNET_USE_CUDNN_RNN && defined(__CUDACC__) + #if CUDNN_MAJOR >= 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + + if (!init_cudnn_) { + init_cudnn_ = true; + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CUDNN_CALL(cudnnCreateTensorDescriptor(&x_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&y_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_vec[i])); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_vec[i])); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA)); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CUDNN_CALL(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA)); } - out_shape->push_back(outStateShape); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) { - mxnet::TShape cellStateShape = dshape; - cellStateShape[0] = total_layers; - cellStateShape[1] = batch_size; - cellStateShape[2] = param_.state_size; - out_shape->push_back(cellStateShape); + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; + + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + #if USE_CUDNN_LSTM_PROJ + int dimB[3]; + int strideB[3]; + dimB[0] = param_.num_layers * (param_.bidirectional ? 2 : 1); + dimB[1] = param_.batch_size_; + dimB[2] = param_.projection_size.has_value() ? 
+ param_.projection_size.value() : param_.state_size; + strideB[0] = dimB[2] * dimB[1]; + strideB[1] = dimB[2]; + strideB[2] = 1; + #endif + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA)); + #if USE_CUDNN_LSTM_PROJ + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimB, + strideB)); + #else + CUDNN_CALL(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA)); + #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA)); + + // Create Dropout descriptors + DType* dropout_states_ = NULL; + if (param_.p > 0) { + ctx.requested[rnn_enum::kCuDNNDropoutDescSpace].get_cudnn_dropout_desc + (&dropout_desc_, s, 1.0f - param_.p, seed_); + } else { + dropout_byte_ = 0; } - return true; - } - } - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (size_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + CUDNN_CALL(cudnnSetDropoutDescriptor(dropout_desc_, s->dnn_handle_, + param_.p, // discard probability + dropout_states_, dropout_byte_, + seed_)); + + // RNN descriptors + #if CUDNN_MAJOR >= 6 + cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD; + CUDNN_CALL(cudnnSetRNNDescriptor_v6(s->dnn_handle_, + rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + rnn_algo, + dtype_)); + #else + CUDNN_CALL(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_)); + #endif + #if CUDNN_MAJOR >= 7 + cudnnMathType_t math_type = CUDNN_DEFAULT_MATH; + if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) { + math_type = CUDNN_TENSOR_OP_MATH; + } + #if CUDNN_VERSION >= 7200 + if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() && + (DataType::kFlag != kFloat16)) + math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION; + #endif + CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type)); + #endif + #if USE_CUDNN_LSTM_PROJ + if (param_.projection_size.has_value()) { + CUDNN_CALL(cudnnSetRNNProjectionLayers(s->dnn_handle_, + rnn_desc_, + param_.projection_size.value(), + 0)); } + #endif + // Get temp space sizes + CUDNN_CALL(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_)); + CUDNN_CALL(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + 
&reserve_space_byte_)); + workspace_size_ = workspace_byte_ / sizeof(DType); + // Allocate the reserve space + reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); + // Allocate the temp space + temp_space_ = Storage::Get()->Alloc(workspace_byte_, Context::GPU(s->dev_id)); + // Check that the number of params is correct + size_t cudnn_param_size; + CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], + &cudnn_param_size, + dtype_)); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); + // Set param descriptors + int dim_w[3] = {1, 1, 1}; + dim_w[0] = w.shape_[0]; + CUDNN_CALL(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w)); + CUDNN_CALL(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w)); + + // Query weight layout + // cudnnFilterDescriptor_t m_desc; + // CHECK_EQ(cudnnCreateFilterDescriptor(&m_desc), CUDNN_STATUS_SUCCESS); + // DType *p; + // int n = 2; + // int64_t last = 0; + // if (param_.mode == rnn_enum::kLstm) n = 8; + // else if (param_.mode == rnn_enum::kGru) n = 6; + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerMatrixParams(s->dnn_handle_, rnn_desc_, + // i, x_desc_vec_[0], w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // cudnnDataType_t t; + // cudnnTensorFormat_t f; + // int ndim = 5; + // int dims[5] = {0, 0, 0, 0, 0}; + // CHECK_EQ(cudnnGetFilterNdDescriptor(m_desc, ndim, &t, &f, &ndim, &dims[0]), + // CUDNN_STATUS_SUCCESS); + // LOG(INFO) << "w: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // for (int i = 0; i < ndim; ++i) LOG(INFO) << dims[i]; + // } + // } + + // for (int i = 0; i < param_.num_layers*(param_.bidirectional?2:1); ++i) { + // for (int j = 0; j < n; ++j) { + // CHECK_EQ(cudnnGetRNNLinLayerBiasParams(s->dnn_handle_, rnn_desc_, i, x_desc_vec_[0], + // w_desc_, 0, j, m_desc, (void**)&p), CUDNN_STATUS_SUCCESS); + // LOG(INFO) << ((int64_t)(p - NULL))/sizeof(DType) - last; + // last = ((int64_t)(p - NULL))/sizeof(DType); + // LOG(INFO) << "b: " << i << " " << j << " " << ((int64_t)(p - NULL))/sizeof(DType); + // } + // } } - out_type->clear(); - out_type->push_back(dtype); - if (!param_.state_outputs) { - return true; + #endif + } + #if MXNET_USE_CUDNN_RNN + cudnnDataType_t dtype_; + bool init_cudnn_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + Storage::Handle reserve_space_, temp_space_; + uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_; + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + #if USE_CUDNN_LSTM_PROJ + cudnnRNNDataDescriptor_t x_data_desc_, y_data_desc_, dx_data_desc_, dy_data_desc_; + #endif + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; + + cudnnFilterDescriptor_t w_desc_, dw_desc_; + // Allow TensorCore algo policy + bool cudnn_tensor_core_; + + #if CUDNN_MAJOR >= 5 + cudnnTensorFormat_t format_; + #endif + #endif + bool init_space_, temp_init_space_; + size_t reserve_cpu_space_size_, temp_cpu_space_size_; + 
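// CPU-path counterparts of the cuDNN reserve/temp buffers above: reserve_cpu_space_ caches forward intermediates that Backward re-reads (sized by GetRNNReserveSpaceSize), while temp_cpu_space_ is per-call scratch sized by GetRNNWorkspaceSize; both are validated by the size checks in Forward/Backward.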
Storage::Handle reserve_cpu_space_, temp_cpu_space_; +}; // class RNNOp + +static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const mxnet::ShapeVector &in_shapes, + const std::vector &in_types) { + const RNNParam& param = nnvm::get(attrs.parsed); + OpStatePtr state = OpStatePtr(); + MSHADOW_REAL_TYPE_SWITCH(in_types[rnn_enum::kData], DType, { + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(param, ctx); } else { - out_type->push_back(dtype); - // Deal with lstm cell state - if (param_.mode == rnn_enum::kLstm) - out_type->push_back(dtype); - return true; + state = OpStatePtr::Create>(param, ctx); } - } - - OperatorProperty* Copy() const override { - auto ptr = new RNNProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "RNN"; - } + }); + return state; +} - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], - in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; +template +void RNNStatefulCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + op.Forward(ctx, inputs, req, outputs); + }); +} - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateOut]); - dep.push_back(out_grad[rnn_enum::kStateOut]); +/* +index description +0: x +1: w +2: hx +3: y +4: dy +5: hy +6: dhy +7: cx +8: cy +9: dcy +*/ +template +void RNNStatefulGradCompute(const OpStatePtr& state, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector out_data{inputs[3]}; + std::vector out_grad{inputs[4]}; + const std::vector &in_grad = outputs; + + int dtype = inputs[rnn_enum::kData].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + RNNOp& op = state.get_state>(); + const RNNParam& param = op.param_; + int index = 5; + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index++]); } - if (param_.mode == rnn_enum::kLstm) { - dep.push_back(in_data[rnn_enum::kStateCell]); - if (param_.state_outputs) { - dep.push_back(out_data[rnn_enum::kStateCellOut]); - dep.push_back(out_grad[rnn_enum::kStateCellOut]); + if (param.mode == rnn_enum::kLstm) { + in_data.push_back(inputs[index++]); + if (param.state_outputs) { + out_data.push_back(inputs[index++]); + out_grad.push_back(inputs[index]); } } - return dep; - } - - std::vector ForwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const mxnet::ShapeVector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape, - std::vector *in_type) const override; + op.Backward(ctx, out_grad, in_data, out_data, req, in_grad); + }); +} - private: - RNNParam param_; -}; // class RNNProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet + #endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc 
b/src/operator/rnn.cc index 621b9eb110e7..7012a3c22f50 100644 --- a/src/operator/rnn.cc +++ b/src/operator/rnn.cc @@ -27,24 +27,142 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(RNNParam param, int dtype) { - Operator *op = nullptr; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new RNNOp(param); - }); - return op; + +DMLC_REGISTER_PARAMETER(RNNParam); +static inline std::vector ListArguments(const RNNParam& param_) { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "parameters", "state", "state_cell"}; + } else { + return {"data", "parameters", "state"}; + } } -Operator *RNNProp::CreateOperatorEx(Context ctx, - mxnet::ShapeVector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +static bool RNNShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4U) << "Needed input:[data, parameters, state, cell_state]," + << " got in_shape->size(): " << in_shape->size(); + } else { + CHECK_EQ(in_shape->size(), 3U) << + "Needed input:[data, parameters, state], got in_shape->size(): " << in_shape->size(); + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (!mxnet::ndim_is_known(dshape)) return false; + CHECK_EQ(dshape.ndim(), 3U) \ + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // data: [sequence len, batch, input dimension] + int batch_size = dshape[1]; + int input_size = dshape[2]; + int numDirections = param_.bidirectional ? 2 : 1; + int total_layers = numDirections * param_.num_layers; // double for bidirectional + int layer_size = (param_.projection_size.has_value()) ? 
+ param_.projection_size.value() : param_.state_size; + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kState, + Shape3(total_layers, batch_size, layer_size)); + if (param_.mode == rnn_enum::kLstm) { + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateCell, + Shape3(total_layers, batch_size, param_.state_size)); + } + + // calculate parameter vector length + int param_size = GetRnnParamSize(param_.num_layers, + input_size, + param_.state_size, + numDirections, + param_.mode, + param_.projection_size); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + out_shape->clear(); + // output: [sequence len, batch, output size] + TShape oshape = dshape; + if (param_.projection_size.has_value()) { + oshape[2] = numDirections * param_.projection_size.value(); + } else { + oshape[2] = numDirections * param_.state_size; + } + out_shape->push_back(oshape); + if (param_.state_outputs) { + // outStateShape: [layer_num, batch, state size] + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batch_size; + if (param_.projection_size.has_value()) { + outStateShape[2] = param_.projection_size.value(); + } else { + outStateShape[2] = param_.state_size; + } + out_shape->push_back(outStateShape); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) { + TShape cellStateShape = dshape; + cellStateShape[0] = total_layers; + cellStateShape[1] = batch_size; + cellStateShape[2] = param_.state_size; + out_shape->push_back(cellStateShape); + } + } + return true; } -DMLC_REGISTER_PARAMETER(RNNParam); +static bool RNNType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const RNNParam& param_ = nnvm::get(attrs.parsed); + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_type->size(), 4U); + } else { + CHECK_EQ(in_type->size(), 3U); + } + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (size_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + TYPE_ASSIGN_CHECK(*in_type, i, dtype); + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + if (param_.state_outputs) { + out_type->push_back(dtype); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + } + return true; +} -MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp) +struct RNNGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr &n, + const std::vector &ograd) const { + const RNNParam& params = nnvm::get(n->attrs.parsed); + std::vector heads{ n->inputs[rnn_enum::kData], + n->inputs[rnn_enum::kParams], n->inputs[rnn_enum::kState] }; + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kOut, 0}); + heads.push_back(ograd[rnn_enum::kOut]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateOut, 0}); + heads.push_back(ograd[rnn_enum::kStateOut]); + } + if (params.mode == rnn_enum::kLstm) { + heads.push_back(n->inputs[rnn_enum::kStateCell]); + if (params.state_outputs) { + heads.emplace_back(nnvm::NodeEntry{n, rnn_enum::kStateCellOut, 0}); + heads.push_back(ograd[rnn_enum::kStateCellOut]); + } + } + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +NNVM_REGISTER_OP(RNN) .describe(R"code(Applies recurrent layers to input data. Currently, vanilla RNN, LSTM and GRU are implemented, with both multi-layer and bidirectional support. 
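To make the inferred shapes concrete: for data of shape [seq_length, batch_size, input_size], D directions (2 if bidirectional) and an optional projection size P, RNNShape above produces an output of [seq_length, batch_size, D * (P if set, else state_size)] and states of [D * num_layers, batch_size, layer_size]. The self-contained C++ sketch below reproduces that arithmetic; it is illustrative only, and the struct and field names are invented for the example rather than taken from the patch.

#include <cassert>

// Standalone sketch (not MXNet code) of the arithmetic in RNNShape.
struct RnnShapeSketch {
  int seq_len, batch, input_size, state_size, num_layers;
  int projection_size;  // <= 0 stands in for "no projection" here
  bool bidirectional;

  int dirs() const { return bidirectional ? 2 : 1; }
  int layer_size() const { return projection_size > 0 ? projection_size : state_size; }

  // output: [seq_len, batch, dirs * layer_size]
  void output_shape(int s[3]) const { s[0] = seq_len; s[1] = batch; s[2] = dirs() * layer_size(); }
  // hidden state: [dirs * num_layers, batch, layer_size]; the LSTM cell state uses state_size instead
  void state_shape(int s[3]) const { s[0] = dirs() * num_layers; s[1] = batch; s[2] = layer_size(); }
};

int main() {
  // 2-layer bidirectional LSTM, state_size = 200, no projection, data [35, 32, 100]
  RnnShapeSketch r{35, 32, 100, 200, 2, 0, true};
  int o[3], h[3];
  r.output_shape(o);  // {35, 32, 400}
  r.state_shape(h);   // {4, 32, 200}
  assert(o[2] == 400 && h[0] == 4 && h[2] == 200);
  return 0;
}

For this configuration the sketch yields an output of [35, 32, 400] and hidden/cell states of [4, 32, 200], matching the Shape3(total_layers, batch_size, layer_size) assignments in RNNShape.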
@@ -97,7 +215,49 @@ The definition of GRU here is slightly different from paper but compatible with z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ - \end{array})code") + \end{array} +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 4 : 3; +}) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + // kOut + int num_outputs = 1; + if (params.state_outputs) { + // kOut, kStateOut, kStateCellOut + num_outputs = (params.mode == rnn_enum::kLstm) ? 3 : 2; + } + + return num_outputs; +}) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return ListArguments(params); +}) +.set_attr("FInferShape", RNNShape) +.set_attr("FInferType", RNNType) +.set_attr("FCreateOpState", CreateRNNState) +.set_attr("FStatefulCompute", RNNStatefulCompute) +.set_attr("FGradient", RNNGrad{"_backward_RNN"}) +.set_attr("FResourceRequestEx", + [](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { + std::vector request; + const RNNParam& param = nnvm::get(attrs.parsed); + if (param.p == 0) return request; + if (dev_mask == kGPU) { +#if MXNET_USE_CUDNN_RNN + if (1.0f - param.p > 0) { + request.emplace_back(ResourceRequest::kCuDNNDropoutDesc); + return request; + } +#endif + } + return request; +}) .add_argument("data", "NDArray-or-Symbol", "Input data to RNN") .add_argument("parameters", "NDArray-or-Symbol", "Vector of all RNN trainable parameters concatenated") @@ -105,5 +265,15 @@ The definition of GRU here is slightly different from paper but compatible with .add_argument("state_cell", "NDArray-or-Symbol", "initial cell state for LSTM networks (only for LSTM)") .add_arguments(RNNParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_RNN) +.set_num_outputs([](const NodeAttrs& attrs) { + const RNNParam& params = nnvm::get(attrs.parsed); + return params.mode == rnn_enum::kLstm ? 
4 : 3; +}) +.set_attr_parser(ParamParser) +.set_attr("TIsLayerOpBackward", true) +.set_attr("TIsBackward", true) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu index 402a8cf5f503..77bb95522711 100644 --- a/src/operator/rnn.cu +++ b/src/operator/rnn.cu @@ -26,24 +26,14 @@ #include "./rnn-inl.h" #include -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 -#include "./cudnn_rnn-inl.h" -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR namespace mxnet { namespace op { -template<> -Operator* CreateOp(RNNParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNRNNOp(param); - }) -#else - LOG(FATAL) << "RNN on GPU is only available for cuDNN at the moment."; -#endif // MXNET_USE_CUDNN && CUDNN_MAJOR - return op; -} +NNVM_REGISTER_OP(RNN) +.set_attr("FStatefulCompute", RNNStatefulCompute); + +NNVM_REGISTER_OP(_backward_RNN) +.set_attr("FStatefulCompute", RNNStatefulGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h index b4db80bdd721..4c42934f1618 100644 --- a/src/operator/sequence_last-inl.h +++ b/src/operator/sequence_last-inl.h @@ -263,7 +263,7 @@ class SequenceLastProp : public OperatorProperty { SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength, Shape1(sbatch)); // calculate output size - mxnet::TShape shape_o(dshape.ndim() - 1); + mxnet::TShape shape_o(dshape.ndim() - 1, -1); shape_o[0] = sbatch; for (index_t i = 1; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1]; diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h index 6125782d525b..e37ffdcf1b91 100644 --- a/src/operator/slice_channel-inl.h +++ b/src/operator/slice_channel-inl.h @@ -195,9 +195,9 @@ class SliceChannelProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 1U); mxnet::TShape dshape = in_shape->at(slice_enum::kData); mxnet::TShape ishape = in_shape->at(slice_enum::kData); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; if (param_.axis >= 0) { - CHECK_LT(static_cast(param_.axis), dshape.ndim()); + CHECK_LT(param_.axis, dshape.ndim()); } else { CHECK_LT(param_.axis + dshape.ndim(), dshape.ndim()); } @@ -212,15 +212,18 @@ class SliceChannelProp : public OperatorProperty { << " evenly sized chunks, but this is not possible because " << param_.num_outputs << " does not evenly divide " << dshape[real_axis]; - if (param_.squeeze_axis && ishape[real_axis] != 0) { - CHECK_EQ(ishape[real_axis], static_cast(param_.num_outputs)) + if (param_.squeeze_axis && ishape[real_axis] != -1) { + CHECK_EQ(ishape[real_axis], param_.num_outputs) << "If squeeze axis is True, the size of the sliced axis must be the same as num_outputs." 
<< " Input shape=" << ishape << ", axis=" << real_axis << ", num_outputs=" << param_.num_outputs << "."; } - dshape[real_axis] /= param_.num_outputs; - if (param_.squeeze_axis && (dshape[real_axis] == 1 || ishape[real_axis] == 0)) { - for (int d = real_axis; d < static_cast(dshape.ndim()) - 1; ++d) { + if (dshape[real_axis] >= 0) { + dshape[real_axis] /= param_.num_outputs; + } + if (param_.squeeze_axis && (dshape[real_axis] == 1 + || !mxnet::dim_size_is_known(ishape, real_axis))) { + for (int d = real_axis; d < dshape.ndim() - 1; ++d) { dshape[d] = dshape[d+1]; } dshape = mxnet::TShape(&dshape[0], &dshape[dshape.ndim()-1]); diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 5dca8bac14a3..80ab40ef6c50 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -337,19 +337,19 @@ class SoftmaxOutputProp : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; // label.shape == data.shape: use probability as label if (dshape != (*in_shape)[softmaxout_enum::kLabel]) { if (param_.multi_output) { mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]); - mxnet::TShape lshape2(dshape.ndim() - 1); + mxnet::TShape lshape2(dshape.ndim() - 1, -1); lshape2[0] = dshape[0]; - for (index_t i = 2; i < dshape.ndim(); ++i) + for (int i = 2; i < dshape.ndim(); ++i) lshape2[i-1] = dshape[i]; mxnet::TShape lshape3 = dshape; lshape3[1] = 1; - if (in_shape->at(softmaxout_enum::kLabel).ndim() == 0) { + if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) { in_shape->at(softmaxout_enum::kLabel) = lshape1; } else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) { } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) { @@ -361,8 +361,8 @@ class SoftmaxOutputProp : public OperatorProperty { throw InferShapeError(os.str(), softmaxout_enum::kLabel); } } else { - mxnet::TShape label_shape(dshape.ndim() - 1); - for (index_t i = 0; i + 1 < dshape.ndim(); ++i) + mxnet::TShape label_shape(dshape.ndim() - 1, -1); + for (int i = 0; i + 1 < dshape.ndim(); ++i) label_shape[i] = dshape[i]; SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape); } diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc index b17ef3527297..548225f0496b 100644 --- a/src/operator/softmax_output.cc +++ b/src/operator/softmax_output.cc @@ -85,19 +85,19 @@ static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs, const SoftmaxOutputParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; // label.shape == data.shape: use probability as label if (dshape != (*in_shape)[softmaxout_enum::kLabel]) { if (param.multi_output) { mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]); - mxnet::TShape lshape2(dshape.ndim() - 1); + mxnet::TShape lshape2(dshape.ndim() - 1, -1); lshape2[0] = dshape[0]; - for (index_t i = 2; i < dshape.ndim(); ++i) + for (int i = 2; i < dshape.ndim(); ++i) lshape2[i-1] = dshape[i]; mxnet::TShape lshape3 = dshape; lshape3[1] = 1; - if (in_shape->at(softmaxout_enum::kLabel).ndim() == 0) { + if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) { in_shape->at(softmaxout_enum::kLabel) = lshape1; } 
else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) { } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) { @@ -109,8 +109,8 @@ static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs, throw InferShapeError(os.str(), softmaxout_enum::kLabel); } } else { - mxnet::TShape label_shape(dshape.ndim() - 1); - for (index_t i = 0; i + 1 < dshape.ndim(); ++i) + mxnet::TShape label_shape(dshape.ndim() - 1, -1); + for (int i = 0; i + 1 < dshape.ndim(); ++i) label_shape[i] = dshape[i]; SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape); } diff --git a/src/operator/spatial_transformer-inl.h b/src/operator/spatial_transformer-inl.h index 9e5dee842d0d..660d57d55bab 100644 --- a/src/operator/spatial_transformer-inl.h +++ b/src/operator/spatial_transformer-inl.h @@ -190,10 +190,10 @@ class SpatialTransformerProp : public OperatorProperty { CHECK_EQ(param_.sampler_type, st::kBilinear) << "only supports bilinear sampling currently"; const mxnet::TShape &dshape = (*in_shape)[st::kData]; const mxnet::TShape &lshape = (*in_shape)[st::kLoc]; - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; CHECK_EQ(dshape.ndim(), 4U) \ << "input data should be 4D in batch-num_filter-y-x"; - if (lshape.ndim() == 0) return false; + if (!shape_is_known(lshape)) return false; CHECK_EQ(lshape.ndim(), 2U) \ << "localisation parameter should be 2D in batch-num_hidden"; if (param_.transform_type == st::kAffine) { diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc index d61b4613602a..2c05fda9a879 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc @@ -31,6 +31,9 @@ namespace mxnet { namespace op { +using red::limits::MaxValue; +using red::limits::MinValue; + template static void UpdateConvWeightBias(NDArray *weight, NDArray *bias, bool no_bias, const NDArray &gamma, const NDArray &beta, @@ -78,8 +81,6 @@ static inline size_t GetInSumIndex(const MKLDNNConvFusionParam &param) { template static std::vector GetWeightScales(const NDArray &weight, bool weight_channelwise_scale) { - using red::limits::MaxValue; - using red::limits::MinValue; std::vector weight_scales; const DType *weight_ptr = weight.data().dptr(); size_t channel = weight.shape()[0]; @@ -111,9 +112,11 @@ static std::vector GetWeightScales(const NDArray &weight, bool weight_cha if (total_min > weight_c_min[c]) total_min = weight_c_min[c]; if (total_max < weight_c_max[c]) total_max = weight_c_max[c]; } - weight_scales.resize(1); + weight_scales.resize(3); DType weight_range = MaxAbs(total_min, total_max); weight_scales[0] = kInt8Range / weight_range; + weight_scales[1] = total_min; + weight_scales[2] = total_max; } return weight_scales; } @@ -175,7 +178,7 @@ class SgMKLDNNConvOperator { const std::vector &outputs); private: - bool initalized_{false}; + bool initialized_{false}; bool inplace_{false}; bool post_requantize_{false}; nnvm::Symbol subgraph_sym_; @@ -235,7 +238,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization failed. if (mkldnn_param.with_sum) { - if (!initalized_) { + if (!initialized_) { // TODO(zhennan): Currently, mkldnn fallback mechanism will break inplace option, // which makes the check (req[kOut] == kWriteInplace) useless. 
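// Sketch of the copy-fallback below: when inplace optimization failed (inplace_ == false), the sum input is copied into outputs[kOut]'s buffer through a temporary mkldnn::memory so the fused convolution can still accumulate in place; an int32 output first reorders through a memory descriptor of the matching MKL-DNN data type.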
auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); @@ -247,33 +250,46 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, if (!inplace_) { auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); + if (outputs[kOut].dtype() == mshadow::kInt32) { + auto mem_desc = in_mkl_mem->get_primitive_desc().desc(); + auto this_dtype = get_mkldnn_type(mshadow::kInt32); + mkldnn::memory::desc omd( + mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims), + this_dtype, static_cast(mem_desc.data.format)); + mkldnn::memory::primitive_desc opd(omd, CpuEngine::Get()->get_engine()); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(opd, out_mkl_mem->get_data_handle())); + MKLDNNStream::Get()->RegisterMem(tmp_mem); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*in_mkl_mem, *tmp_mem)); + output = NDArray(tmp_mem); + } else { mkldnn_mem_ptr tmp_mem( new mkldnn::memory(in_mkl_mem->get_primitive_desc(), out_mkl_mem->get_data_handle())); MKLDNNStream::Get()->RegisterMem(tmp_mem); mxnet::MKLDNNCopy(*in_mkl_mem, tmp_mem.get()); output = NDArray(tmp_mem); + } } } // Check input change // TODO(zhennan): Only update cached_* changed. - if (initalized_) { + if (initialized_) { if (mkldnn_param.with_bn) { if (weight_ver_ != inputs[in_weight].version() || ((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) { - initalized_ = false; + initialized_ = false; } } - if (initalized_ && mkldnn_param.quantized) { + if (initialized_ && mkldnn_param.quantized) { if (cached_data_min_ != data_min || cached_data_max_ != data_max || cached_sum_min_ != sum_min || cached_sum_max_ != sum_max || weight_ver_ != inputs[in_weight].version() || ((!conv_param.no_bias) && bias_ver_ != inputs[in_bias].version())) { - initalized_ = false; + initialized_ = false; } } } - if (!initalized_) { + if (!initialized_) { cached_data_min_ = data_min; cached_data_max_ = data_max; cached_sum_min_ = sum_min; @@ -327,7 +343,8 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, float quantized_out_range; float output_scale; if (mkldnn_param.with_sum) { - auto quantized_sum_range = cached_sum_min_ < 0 ? kInt8Range : kUint8Range; + auto quantized_sum_range = + (inputs[in_sum].dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range; sum_in_scale = quantized_sum_range / MaxAbs(cached_sum_min_, cached_sum_max_); } if (post_requantize_) { @@ -339,11 +356,23 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, full_conv_param.requantize_scales[c] = output_scale / data_scale_ / weight_scales_[c]; } } else { + Stream *s = ctx.get_stream(); + if (data.dtype() == mshadow::kInt8) { + mxnet_op::Kernel::Launch( + s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1], + &weight_scales_[2], &cached_data_min_, &cached_data_max_); + } else { + mxnet_op::Kernel::Launch( + s, 1, &cached_output_min_, &cached_output_max_, &weight_scales_[1], + &weight_scales_[2], &cached_data_min_, &cached_data_max_); + } + weight_scales_.resize(1); output_scale = data_scale_ * weight_scales_[0]; full_conv_param.requantize_scales.resize(0); } - if (mkldnn_param.with_sum) + if (mkldnn_param.with_sum) { full_conv_param.sum_scale = output_scale / sum_in_scale; + } } fwd_.reset(new MKLDNNConvForward( full_conv_param, ctx.is_train, data, cached_weight_, @@ -353,7 +382,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, fwd_->SetNewMem(*data.GetMKLDNNData(), *cached_weight_.GetMKLDNNData(), has_bias ? 
cached_bias_.GetMKLDNNData() : nullptr, *output.GetMKLDNNData()); - initalized_ = true; + initialized_ = true; } if (mkldnn_param.quantized) { @@ -375,11 +404,10 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, MKLDNNConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), new_inputs, new_req, {output}); } - if (post_requantize_) { - float *out_min_ptr = outputs[kMin].data().dptr(); - float *out_max_ptr = outputs[kMax].data().dptr(); - *out_min_ptr = cached_output_min_; - *out_max_ptr = cached_output_max_; + + if (mkldnn_param.quantized) { + *outputs[kMin].data().dptr() = cached_output_min_; + *outputs[kMax].data().dptr() = cached_output_max_; } if (mkldnn_param.with_sum) { auto out = const_cast(outputs[kOut]); diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc index 0ec05a2af087..f345a18c18a6 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc @@ -156,7 +156,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, int32_t *quantized_bias_ptr = cached_bias_.data().dptr(); size_t bias_size = bias.shape().Size(); #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (size_t i = 0; i < bias_size; ++i) { + for (index_t i = 0; i < static_cast(bias_size); ++i) { quantized_bias_ptr[i] = bias_ptr[i] * bias_int32_rescale; } } @@ -174,7 +174,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx, MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale; } else { Stream *s = ctx.get_stream(); - mxnet_op::Kernel::Launch( + mxnet_op::Kernel::Launch( s, 1, &cached_min_output_, &cached_max_output_, &min_data, &max_data, &min_weight, &max_weight); } diff --git a/src/operator/subgraph_op_common.cc b/src/operator/subgraph_op_common.cc index 8934438d428a..e53d911614a0 100644 --- a/src/operator/subgraph_op_common.cc +++ b/src/operator/subgraph_op_common.cc @@ -178,7 +178,7 @@ bool as_bool_scalar(const NDArray &a) { } bool is_shape_udf(const mxnet::TShape &x) { - return x.ndim() == 0 || x.Size() == 0; + return !shape_is_known(x); } bool is_stype_udf(const int &x) { @@ -225,7 +225,7 @@ void LoopState::Forward(int iter_no, if (!out_bufs[i].IsSame(coutputs[i])) { // The line below checks whether dynamic shape exists. // If so, re-initialize the shape. 
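// (Shape-semantics note for this change: an unknown shape is now encoded as ndim == -1 and an unknown dimension as -1, which is why shape_is_known()/ndim_is_known() tests replace the old ndim() == 0 and dim == 0 comparisons throughout this patch.)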
- if (coutputs[i].shape().ndim() == 0) { + if (!shape_is_known(coutputs[i].shape())) { const_cast(coutputs[i]).Init(out_bufs[i].shape()); } CopyFromTo(out_bufs[i], coutputs[i]); diff --git a/src/operator/subgraph_op_common.h b/src/operator/subgraph_op_common.h index 91adf576dc07..19528349c0c7 100644 --- a/src/operator/subgraph_op_common.h +++ b/src/operator/subgraph_op_common.h @@ -67,7 +67,7 @@ bool is_type_udf(const int &x); template void extract_by_loc(const std::vector &array, - const nnvm::Tuple input_locs, + const mxnet::Tuple input_locs, std::vector *out) { out->clear(); out->reserve(input_locs.ndim()); @@ -94,11 +94,11 @@ bool fill_value(T *x, T *y, bool x_empty, bool y_empty) { } template -bool sync_in_in(const nnvm::Tuple &input_locs, - std::vector *in, - std::vector *subg_in, - std::function is_empty) { - for (size_t i = 0; i < input_locs.ndim(); ++i) { +bool sync_in_in(const mxnet::Tuple &input_locs, + std::vector *in, + std::vector *subg_in, + std::function is_empty) { + for (int i = 0; i < input_locs.ndim(); ++i) { T &x = in->at(input_locs[i]); T &y = subg_in->at(i); fill_value(&x, &y, is_empty(x), is_empty(y)); diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h index 1609764f0ebe..dfe9fa606e95 100644 --- a/src/operator/svm_output-inl.h +++ b/src/operator/svm_output-inl.h @@ -143,9 +143,9 @@ class SVMOutputProp : public OperatorProperty { using namespace mshadow; CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]"; const mxnet::TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - mxnet::TShape label_shape(dshape.ndim() - 1); - for (index_t i = 0; i + 1 < dshape.ndim(); ++i) + if (!mxnet::ndim_is_known(dshape)) return false; + mxnet::TShape label_shape(dshape.ndim() - 1, -1); + for (int i = 0; i + 1 < dshape.ndim(); ++i) label_shape[i] = dshape[i]; SHAPE_ASSIGN_CHECK(*in_shape, svm_enum::kLabel, label_shape); out_shape->clear(); diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h index ce835084ab32..b17a81f75bc6 100644 --- a/src/operator/swapaxis-inl.h +++ b/src/operator/swapaxis-inl.h @@ -69,11 +69,11 @@ class SwapAxisOp : public Operator { void Reshape2Five(mshadow::Shape<5> *inter_shape, const mxnet::TShape &shape, - uint32_t dim1, uint32_t dim2) { + int dim1, int dim2) { using namespace mshadow; using namespace mshadow::expr; - index_t ndim_in = shape.ndim(); - index_t si; + int ndim_in = shape.ndim(); + int si; if (dim1 > dim2) { std::swap(dim1, dim2); @@ -106,8 +106,8 @@ class SwapAxisOp : public Operator { const std::vector &req) { using namespace mshadow; using namespace mshadow::expr; - uint32_t dim1 = param_.dim1; - uint32_t dim2 = param_.dim2; + int dim1 = param_.dim1; + int dim2 = param_.dim2; TBlob data_in = in_data[swapaxisenum::kData]; TBlob data_out = out_data[swapaxisenum::kData]; diff --git a/src/operator/tensor/broadcast_reduce-inl.cuh b/src/operator/tensor/broadcast_reduce-inl.cuh index 5d6c49ff8882..1b0127a8f322 100644 --- a/src/operator/tensor/broadcast_reduce-inl.cuh +++ b/src/operator/tensor/broadcast_reduce-inl.cuh @@ -72,15 +72,15 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, } const int nthread_reduce = kMaxThreadsPerBlock; -template +template __launch_bounds__(nthread_reduce) __global__ void reduce_kernel(const int N, const int M, const bool addto, - const DType* __restrict big, DType *small, + const DType* __restrict big, OType *small, const Shape big_shape0, const Shape small_shape, const Shape big_shape, const Shape big_stride, const int Mnext, const bool 
do_transpose) { extern __shared__ char shTileChar[]; - DType* shTile = (DType*)(shTileChar); + AType* shTile = (AType*)(shTileChar); const int tid = threadIdx.x + threadIdx.y*blockDim.x; const int bx = (do_transpose) ? blockDim.y : blockDim.x; const int by = (do_transpose) ? blockDim.x : blockDim.y; @@ -95,7 +95,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, Shape coord = unravel(idx, small_shape); int idx_big0 = ravel(coord, big_shape0); - DType val, residual; + AType val, residual; Reducer::SetInitValue(val, residual); if (idx < N) { for (int k = tidy + Mstart; k < Mend; k += by*unroll) { @@ -113,7 +113,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, } #pragma unroll for (int u=0;u < unroll;u++) { - if (k + u*by < Mend) Reducer::Reduce(val, tmp[u], residual); + if (k + u*by < Mend) Reducer::Reduce(val, AType(tmp[u]), residual); } } } @@ -127,7 +127,7 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, shTile[it0 * 2 + 1] = residual; __syncthreads(); for (int t=1;t < by;t <<= 1) { - DType tmp, tmp_residual; + AType tmp, tmp_residual; Reducer::SetInitValue(tmp, tmp_residual); if (tidy + t < by) { tmp = shTile[(it0 + t*fbx) * 2]; @@ -139,12 +139,12 @@ __global__ void reduce_kernel(const int N, const int M, const bool addto, } if (idx < N && tidy == 0) { Reducer::Finalize(shTile[tidx * 2], shTile[tidx * 2 + 1]); - assign(&small[idx + m0*N], addto, shTile[tidx * 2]); + assign(&small[idx + m0*N], addto, OType(shTile[tidx * 2])); } } else { if (idx < N) { Reducer::Finalize(val, residual); - assign(&small[idx + m0*N], addto, val); + assign(&small[idx + m0*N], addto, OType(val)); } } } @@ -261,18 +261,18 @@ __global__ void reduce_lines_kernel(const int N, const int M, const bool addto, } } -template +template __global__ void reduce_kernel_M1(const int N, const bool addto, - const DType* __restrict big, DType *small, const Shape bshape, + const DType* __restrict big, OType *small, const Shape bshape, const Shape sshape) { for (int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x) { Shape coord = unravel(idx, sshape); int j = ravel(coord, bshape); - DType val, residual; + AType val, residual; Reducer::SetInitValue(val, residual); - Reducer::Reduce(val, OP::Map(big[j]), residual); + Reducer::Reduce(val, AType(OP::Map(big[j])), residual); Reducer::Finalize(val, residual); - assign(&small[idx], addto, val); + assign(&small[idx], addto, OType(val)); } } @@ -491,7 +491,7 @@ ReduceImplConfig ConfigureReduceImpl(const mxnet::TShape& small, const mxn if (config.Mnext > 1) { // small_dptr[] is N*Mnext*sizeof(DType) bytes - config.workspace_size += config.N*config.Mnext*sizeof(DType); + config.workspace_size += config.N*config.Mnext*sizeof(double); // Set gridDim.y to Mnext config.kernel_1.gridDim.y = std::min(kBaseGridNum, config.Mnext); } @@ -516,23 +516,22 @@ ReduceImplConfig ConfigureReduceImpl(const mxnet::TShape& small, const mxn {__VA_ARGS__} \ } -template +template void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, const TBlob& big, const Tensor& workspace, const ReduceImplConfig& config) { if (config.M == 1) { - reduce_kernel_M1 + reduce_kernel_M1 <<< config.kernel_1.gridDim, config.kernel_1.blockDim, 0, stream >>>( - config.N, req == kAddTo, big.dptr(), small.dptr(), big.shape_.get(), + config.N, req == kAddTo, big.dptr(), small.dptr(), big.shape_.get(), small.shape_.get()); MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel_M1); } else { - - DType* small_dptr 
= small.dptr(); + OType* small_dptr = small.dptr(); bool addto = (req == kAddTo); if (config.Mnext > 1) { // small_dptr[] is N*Mnext*sizeof(DType) bytes - small_dptr = reinterpret_cast(workspace.dptr_); + small_dptr = reinterpret_cast(workspace.dptr_); addto = false; // Check that the workspace is contiguous CHECK_EQ(workspace.CheckContiguous(), true); @@ -544,7 +543,7 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, config.kernel_1.blockDim.x : config.kernel_1.blockDim.y; const bool do_unroll = ( config.M / (by*config.Mnext) >= config.unroll_reduce ); KERNEL_UNROLL_SWITCH(do_unroll, ReduceImplConfig::unroll_reduce, UNROLL, { - reduce_kernel + reduce_kernel <<< config.kernel_1.gridDim, config.kernel_1.blockDim, config.kernel_1.shMemSize, stream>>>( config.N, config.M, addto, big.dptr(), small_dptr, big.shape_.get(), small.shape_.get(), config.rshape, config.rstride, config.Mnext, @@ -553,9 +552,9 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const OpReqType req, MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_kernel); if (config.Mnext > 1) { - reduce_lines_kernel + reduce_lines_kernel <<< config.kernel_2.gridSize, config.kernel_2.blockSize, 0, stream >>> - (config.N, config.Mnext, req == kAddTo, config.N, small_dptr, small.dptr()); + (config.N, config.Mnext, req == kAddTo, config.N, small_dptr, small.dptr()); MSHADOW_CUDA_POST_KERNEL_CHECK(reduce_lines_kernel); } } @@ -610,14 +609,39 @@ void ReduceImpl(cudaStream_t stream, const TBlob& small, const TBlob& lhs, const #undef KERNEL_UNROLL_SWITCH -template +template void Reduce(Stream *s, const TBlob& small, const OpReqType req, const Tensor& workspace, const TBlob& big) { if (req == kNullOp) return; cudaStream_t stream = Stream::GetStream(s); ReduceImplConfig config = ConfigureReduceImpl(small.shape_, big.shape_, NULL, NULL); - ReduceImpl(stream, small, req, big, workspace, config); + if (safe_acc) { + // TODO(haojin2): Use real-only type switch for windows temporarily due to CI issues. 
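// Summary of the safe_acc path below: AType is a wider accumulation type picked per input type by MXNET_ACC_TYPE_SWITCH (worst case double, which is what the sizeof(double) workspace sizing above budgets for); values are promoted to AType before Reducer::Reduce and narrowed to the output type OType only on the final assign.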
+#ifndef _WIN32 + MXNET_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { + typedef typename std::conditional::type AccType; + MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { + typedef typename std::conditional::type OutType; + config = ConfigureReduceImpl(small.shape_, big.shape_, NULL, NULL); + ReduceImpl( + stream, small, req, big, workspace, config); + }); + }); +#else + MXNET_REAL_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { + typedef typename std::conditional::type AccType; + MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { + typedef typename std::conditional::type OutType; + config = ConfigureReduceImpl(small.shape_, big.shape_, NULL, NULL); + ReduceImpl( + stream, small, req, big, workspace, config); + }); + }); +#endif + } else { + ReduceImpl(stream, small, req, big, workspace, config); + } } template diff --git a/src/operator/tensor/broadcast_reduce-inl.h b/src/operator/tensor/broadcast_reduce-inl.h index 0f6913e6e9df..d107e89806aa 100644 --- a/src/operator/tensor/broadcast_reduce-inl.h +++ b/src/operator/tensor/broadcast_reduce-inl.h @@ -153,21 +153,21 @@ MSHADOW_XINLINE void binary_broadcast_assign(const index_t idx, const bool addto assign(&out[idx], addto, OP::Map(lhs[j], rhs[k])); } -template +template MSHADOW_XINLINE void seq_reduce_assign(const index_t idx, const size_t M, const bool addto, - const DType* __restrict big, DType *small, + const DType* __restrict big, OType *small, const Shape& bshape, const Shape& sshape, const Shape& rshape, const Shape& rstride) { Shape coord = unravel(idx, sshape); index_t j = ravel(coord, bshape); - DType val, residual; + AType val, residual; Reducer::SetInitValue(val, residual); for (size_t k = 0; k < M; ++k) { coord = unravel(k, rshape); - Reducer::Reduce(val, OP::Map(big[j + dot(coord, rstride)]), residual); + Reducer::Reduce(val, AType(OP::Map(big[j + dot(coord, rstride)])), residual); } Reducer::Finalize(val, residual); - assign(&small[idx], addto, val); + assign(&small[idx], addto, OType(val)); } #ifdef __CUDACC__ @@ -194,15 +194,15 @@ void BinaryBroadcastComputeImpl(Stream *s, const OpReqType req, out.shape_.get()); } -template +template void seq_reduce_compute(const size_t N, const size_t M, const bool addto, - const DType *big, DType *small, const Shape bshape, + const DType *big, OType *small, const Shape bshape, const Shape sshape, const Shape rshape, const Shape rstride) { #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (index_t idx = 0; idx < static_cast(N); ++idx) { - seq_reduce_assign(idx, M, addto, big, small, bshape, sshape, rshape, - rstride); + seq_reduce_assign(idx, M, addto, big, small, + bshape, sshape, rshape, rstride); } } @@ -227,16 +227,41 @@ void seq_reduce_compute_extra_mem(const size_t N, const size_t M, const bool add } } -template +template void Reduce(Stream* s, const TBlob& small, const OpReqType req, const Tensor& workspace, const TBlob& big) { if (req == kNullOp) return; Shape rshape, rstride; diff(small.shape_.get(), big.shape_.get(), &rshape, &rstride); size_t N = small.shape_.Size(), M = rshape.Size(); - seq_reduce_compute( - N, M, req == kAddTo, big.dptr(), small.dptr(), - big.shape_.get(), small.shape_.get(), rshape, rstride); + if (!safe_acc) { + seq_reduce_compute( + N, M, req == kAddTo, big.dptr(), small.dptr(), + big.shape_.get(), small.shape_.get(), rshape, rstride); + } else { + // TODO(haojin2): Use real-only type switch for windows temporarily due to CI issues. 
+#ifndef _WIN32 + MXNET_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { + typedef typename std::conditional::type AccType; + MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { + typedef typename std::conditional::type OutType; + seq_reduce_compute( + N, M, req == kAddTo, big.dptr(), small.dptr(), + big.shape_.get(), small.shape_.get(), rshape, rstride); + }); + }); +#else + MXNET_REAL_ACC_TYPE_SWITCH(mshadow::DataType::kFlag, DataType, AType, { + typedef typename std::conditional::type AccType; + MSHADOW_TYPE_SWITCH(small.type_flag_, OType, { + typedef typename std::conditional::type OutType; + seq_reduce_compute( + N, M, req == kAddTo, big.dptr(), small.dptr(), + big.shape_.get(), small.shape_.get(), rshape, rstride); + }); + }); +#endif + } } template diff --git a/src/operator/tensor/broadcast_reduce_op.h b/src/operator/tensor/broadcast_reduce_op.h index b13906af6624..9fec6cd1255a 100644 --- a/src/operator/tensor/broadcast_reduce_op.h +++ b/src/operator/tensor/broadcast_reduce_op.h @@ -67,6 +67,7 @@ struct ReduceAxesParam : public dmlc::Parameter { struct NormParam : public dmlc::Parameter { int ord; dmlc::optional axis; + dmlc::optional out_dtype; bool keepdims; DMLC_DECLARE_PARAMETER(NormParam) { DMLC_DECLARE_FIELD(ord).set_default(2) @@ -78,6 +79,15 @@ struct NormParam : public dmlc::Parameter { If `axis` is int, a reduction is performed on a particular axis. If `axis` is a 2-tuple, it specifies the axes that hold 2-D matrices, and the matrix norms of these matrices are computed.)code"); + DMLC_DECLARE_FIELD(out_dtype) + .add_enum("float16", mshadow::kFloat16) + .add_enum("float32", mshadow::kFloat32) + .add_enum("float64", mshadow::kFloat64) + .add_enum("int64", mshadow::kInt64) + .add_enum("int32", mshadow::kInt32) + .add_enum("int8", mshadow::kInt8) + .set_default(dmlc::optional()) + .describe(R"code(The data type of the output.)code"); DMLC_DECLARE_FIELD(keepdims).set_default(false) .describe("If this is set to `True`, the reduced axis is left " "in the result as dimension with size one."); @@ -129,9 +139,9 @@ struct BroadcastAxesParam : public dmlc::Parameter { mxnet::TShape axis; mxnet::TShape size; DMLC_DECLARE_PARAMETER(BroadcastAxesParam) { - DMLC_DECLARE_FIELD(axis).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(axis).set_default(mxnet::TShape(0, -1)) .describe("The axes to perform the broadcasting."); - DMLC_DECLARE_FIELD(size).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(size).set_default(mxnet::TShape(0, -1)) .describe("Target sizes of the broadcasting axes."); } }; @@ -139,7 +149,7 @@ struct BroadcastAxesParam : public dmlc::Parameter { struct BroadcastToParam : public dmlc::Parameter { mxnet::TShape shape; DMLC_DECLARE_PARAMETER(BroadcastToParam) { - DMLC_DECLARE_FIELD(shape).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(shape).set_default(mxnet::TShape(0, -1)) .describe("The shape of the desired array." " We can set the dim to zero if it's same as the original." 
" E.g `A = broadcast_to(B, shape=(10, 0, 0))` " @@ -165,7 +175,7 @@ inline int CheckAxis(int axis, int ndim) { } inline mxnet::TShape AxisShapeCompact(mxnet::TShape shape, int *axis, bool allow_2d) { - int ndim = static_cast(shape.ndim()); + int ndim = shape.ndim(); index_t leading = 1, trailing = 1, M = shape[*axis]; for (int i = 0; i < *axis; ++i) leading *= shape[i]; for (int i = *axis + 1; i < ndim; ++i) trailing *= shape[i]; @@ -186,7 +196,7 @@ inline mxnet::TShape ReduceAxisShapeImpl(const mxnet::TShape& ishape, bool keepdims) { if (!axis || ishape.ndim() == 1) { if (keepdims) { - return mxnet::TShape(ishape.ndim()); + return mxnet::TShape(ishape.ndim(), 1); } return mshadow::Shape1(1); } @@ -198,7 +208,7 @@ inline mxnet::TShape ReduceAxisShapeImpl(const mxnet::TShape& ishape, return oshape; } - mxnet::TShape oshape(ishape.ndim() - 1); + mxnet::TShape oshape(ishape.ndim() - 1, 1); for (int i = 0; i < new_axis; ++i) oshape[i] = ishape[i]; for (int i = new_axis+1; i < static_cast(ishape.ndim()); ++i) { oshape[i-1] = ishape[i]; @@ -212,7 +222,7 @@ inline bool ReduceAxisShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& ishape = (*in_attrs)[0]; - if (ishape.ndim() == 0) return false; + if (!shape_is_known(ishape)) return false; const ReduceAxisParam& param = nnvm::get(attrs.parsed); SHAPE_ASSIGN_CHECK(*out_attrs, 0, @@ -223,12 +233,12 @@ inline bool ReduceAxisShape(const nnvm::NodeAttrs& attrs, inline mxnet::TShape ReduceAxesShapeImpl(const mxnet::TShape& ishape, const dmlc::optional& axis, bool keepdims, bool exclude) { - // if axis doesn't have value, treat it same mxnet::TShape(). + // if axis doesn't have value, treat it same mxnet::TShape(0). if (!axis.has_value() || axis.value().ndim() == 0) { if (keepdims) { - return mxnet::TShape(ishape.ndim()); + return mxnet::TShape(ishape.ndim(), 1); } else { - return mxnet::TShape(1); + return mxnet::TShape(1, 1); } } // axis has value @@ -256,9 +266,9 @@ inline mxnet::TShape ReduceAxesShapeImpl(const mxnet::TShape& ishape, if (keepdims) { oshape = mxnet::TShape(ishape); } else if (exclude) { - oshape = mxnet::TShape(axes.ndim()); + oshape = mxnet::TShape(axes.ndim(), 1); } else { - oshape = mxnet::TShape(std::max(1, ishape.ndim() - axes.ndim())); + oshape = mxnet::TShape(std::max(1, ishape.ndim() - axes.ndim()), 1); } if (keepdims && exclude) { @@ -294,7 +304,23 @@ inline bool ReduceAxesShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - if ((*in_attrs)[0].ndim() == 0) return false; + if (!shape_is_known((*in_attrs)[0])) return false; + const ReduceAxesParam& param = nnvm::get(attrs.parsed); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, + ReduceAxesShapeImpl((*in_attrs)[0], param.axis, + param.keepdims, param.exclude)); + return true; +} + +inline bool ReduceMinMaxAxesShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + if (!shape_is_known((*in_attrs)[0])) return false; + CHECK_GT((*in_attrs)[0].Size(), 0U) + << "Reduction input's size should > 0 " + << (*in_attrs)[0]; const ReduceAxesParam& param = nnvm::get(attrs.parsed); SHAPE_ASSIGN_CHECK(*out_attrs, 0, ReduceAxesShapeImpl((*in_attrs)[0], param.axis, @@ -302,12 +328,30 @@ inline bool ReduceAxesShape(const nnvm::NodeAttrs& attrs, return true; } + +inline bool NormType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + 
std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + const NormParam& param = nnvm::get(attrs.parsed); + if (param.out_dtype.has_value()) { + CHECK_NE(in_attrs->at(0), -1) + << "input data type should be specified when out_dtype is not null"; + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_dtype.value()); + } else { + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[0]); + TYPE_ASSIGN_CHECK(*in_attrs, 0, (*out_attrs)[0]); + } + return (*out_attrs)[0] != -1; +} + inline bool NormShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - if ((*in_attrs)[0].ndim() == 0) return false; + if (!shape_is_known((*in_attrs)[0])) return false; const NormParam& param = nnvm::get(attrs.parsed); SHAPE_ASSIGN_CHECK(*out_attrs, 0, ReduceAxesShapeImpl((*in_attrs)[0], param.axis, @@ -320,12 +364,12 @@ inline bool BroadcastAxesShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - if ((*in_attrs)[0].ndim() == 0) return false; + if (!shape_is_known((*in_attrs)[0])) return false; const BroadcastAxesParam& param = nnvm::get(attrs.parsed); CHECK_EQ(param.axis.ndim() , param.size.ndim()); mxnet::TShape &ishape = (*in_attrs)[0]; mxnet::TShape oshape = ishape; - for (index_t i = 0; i < param.axis.ndim(); ++i) { + for (int i = 0; i < param.axis.ndim(); ++i) { CHECK_EQ(oshape[param.axis[i]], 1U) << "Broadcasting axis must have size 1"; oshape[param.axis[i]] = param.size[i]; } @@ -339,13 +383,13 @@ inline bool BroadcastToShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& ishape = (*in_attrs)[0]; - if (ishape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(ishape)) return false; const BroadcastToParam& param = nnvm::get(attrs.parsed); CHECK_EQ(ishape.ndim(), param.shape.ndim()) << "Operand of shape " << ishape << " cannot be broadcasted to " << param.shape; mxnet::TShape oshape = param.shape; - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (oshape[i] != 0) { + for (int i = 0; i < ishape.ndim(); ++i) { + if (oshape[i] != -1) { CHECK(ishape[i] == oshape[i] || ishape[i] == 1) << "Array cannot be broadcasted from " << ishape << " to " << param.shape; } else { @@ -364,7 +408,7 @@ inline bool BroadcastLikeShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& lhs_shape = (*in_attrs)[0]; mxnet::TShape& rhs_shape = (*in_attrs)[1]; - if ((lhs_shape.ndim() == 0) || (lhs_shape.ndim() == 0)) { + if (!mxnet::ndim_is_known(lhs_shape) || !mxnet::ndim_is_known(rhs_shape)) { return false; } @@ -377,8 +421,8 @@ inline bool BroadcastLikeShape(const nnvm::NodeAttrs& attrs, << "Operand of shape " << lhs_shape << " cannot be broadcasted to " << rhs_shape; oshape = mxnet::TShape(rhs_shape); - for (index_t i = 0; i < lhs_shape.ndim(); ++i) { - if (rhs_shape[i] != 0) { + for (int i = 0; i < lhs_shape.ndim(); ++i) { + if (rhs_shape[i] != -1) { CHECK(lhs_shape[i] == rhs_shape[i] || lhs_shape[i] == 1) << "Array cannot be broadcasted from " << lhs_shape << " to " << rhs_shape; } else { @@ -396,7 +440,7 @@ inline bool BroadcastLikeShape(const nnvm::NodeAttrs& attrs, << "Empty axes tuple is not allowed"; oshape = mxnet::TShape(lhs_shape); - for (index_t i = 0; i < lhs_axes.ndim(); ++i) { + for (int i = 0; i < lhs_axes.ndim(); ++i) { auto copyfrom = lhs_axes[i]; if (copyfrom < 0) { copyfrom = lhs_shape.ndim() + copyfrom; @@ -423,9 +467,9 @@ inline bool 
BroadcastLikeShape(const nnvm::NodeAttrs& attrs, inline void BroadcastReduceShapeCompact(const mxnet::TShape& big, const mxnet::TShape& small, mxnet::TShape *new_big, mxnet::TShape *new_small) { - index_t idim = std::max(big.ndim(), MXNET_SPECIAL_MAX_NDIM); - *new_big = mxnet::TShape(idim); - *new_small = mxnet::TShape(idim); + const int idim = std::max(big.ndim(), MXNET_SPECIAL_MAX_NDIM); + *new_big = mxnet::TShape(idim, 1); + *new_small = mxnet::TShape(idim, 1); index_t j = 0; if (small.Size() == 1) { (*new_big)[j++] = big.Size(); @@ -451,12 +495,10 @@ inline void BroadcastReduceShapeCompact(const mxnet::TShape& big, const mxnet::T ++j; } } - if (j <= 2) { - new_small->assign(&(*new_small)[0], &(*new_small)[2]); - new_big->assign(&(*new_big)[0], &(*new_big)[2]); - } else if (j <= MXNET_SPECIAL_MAX_NDIM) { - new_small->assign(&(*new_small)[0], &(*new_small)[MXNET_SPECIAL_MAX_NDIM]); - new_big->assign(&(*new_big)[0], &(*new_big)[MXNET_SPECIAL_MAX_NDIM]); + if (j <= MXNET_SPECIAL_MAX_NDIM) { + const int ndim = (j <= 2? 2 : MXNET_SPECIAL_MAX_NDIM); + new_small->assign(new_small->begin(), new_small->begin() + ndim); + new_big->assign(new_big->begin(), new_big->begin() + ndim); } else { LOG(FATAL) << "Too many reduction axes from " << big << " to " << small; } @@ -525,7 +567,7 @@ void SearchAxisCompute(const nnvm::NodeAttrs& attrs, }); } -template void ReduceAxesComputeImpl(const OpContext& ctx, const std::vector& inputs, @@ -538,20 +580,22 @@ void ReduceAxesComputeImpl(const OpContext& ctx, mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(inputs[0].shape_, small, &src_shape, &dst_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - const TBlob in_data = inputs[0].reshape(src_shape); - const TBlob out_data = outputs[0].reshape(dst_shape); - BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { - size_t workspace_size = broadcast::ReduceWorkspaceSize( - s, out_data.shape_, req[0], in_data.shape_); - Tensor workspace = - ctx.requested[0].get_space_typed(Shape1(workspace_size), s); - broadcast::Reduce( - s, out_data, req[0], workspace, in_data); - if (normalize) { - auto out = out_data.FlatTo2D(s); - out /= scalar(src_shape.Size()/dst_shape.Size()); - } + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + const TBlob in_data = inputs[0].reshape(src_shape); + const TBlob out_data = outputs[0].reshape(dst_shape); + BROADCAST_NDIM_SWITCH(dst_shape.ndim(), NDim, { + size_t workspace_size = broadcast::ReduceWorkspaceSize( + s, out_data.shape_, req[0], in_data.shape_); + Tensor workspace = + ctx.requested[0].get_space_typed(Shape1(workspace_size), s); + broadcast::Reduce( + s, out_data, req[0], workspace, in_data); + if (normalize) { + auto out = out_data.FlatTo2D(s); + out /= scalar(src_shape.Size()/dst_shape.Size()); + } + }); }); }); } @@ -571,7 +615,7 @@ void ReduceAxesCompute(const nnvm::NodeAttrs& attrs, small = ReduceAxesShapeImpl(inputs[0].shape_, param.axis, true, param.exclude); } - ReduceAxesComputeImpl(ctx, inputs, req, outputs, small); + ReduceAxesComputeImpl(ctx, inputs, req, outputs, small); } template @@ -813,6 +857,35 @@ void ReduceAxesOpForwardEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, } } +template +struct reduce_axes_backward_broadcast { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *data, + OType *out, + DType *igrad, + OType *ograd, + mshadow::Shape<5> in_shape, + mshadow::Shape<5> out_shape, + const uint32_t ndim) { + size_t in_stride = 1; + 
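
BroadcastReduceShapeCompact above collapses runs of axes that are all kept or all reduced, so a general reduction can dispatch to a kernel with a small fixed ndim (2 or MXNET_SPECIAL_MAX_NDIM). A minimal sketch of that merging step, assuming `keep[i]` marks axes that survive the reduction; `CompactShape` is an illustrative name, not an MXNet helper:

#include <cstddef>
#include <vector>

// Merge consecutive axes that share kept/reduced status into one axis.
std::vector<size_t> CompactShape(const std::vector<size_t>& big,
                                 const std::vector<bool>& keep) {
  std::vector<size_t> out;
  for (size_t i = 0; i < big.size(); ++i) {
    if (i > 0 && keep[i] == keep[i - 1]) {
      out.back() *= big[i];   // same status as the previous axis: merge
    } else {
      out.push_back(big[i]);  // status flips: start a new run
    }
  }
  return out;  // e.g. big=(2,3,4) with the last two axes reduced -> (2,12)
}
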
size_t out_stride = 1; + index_t idx = i; + index_t out_idx = i; + for (int iter = ndim - 1; iter >= 0; --iter) { + size_t dim_idx = idx % in_shape[iter]; + out_idx -= dim_idx * in_stride; + if (out_shape[iter] != 1) { + out_idx += dim_idx * out_stride; + } + idx /= in_shape[iter]; + in_stride *= in_shape[iter]; + out_stride *= out_shape[iter]; + } + KERNEL_ASSIGN(igrad[i], req, DType(ograd[out_idx]) * OP::Map(data[i], DType(out[out_idx]))); + } +}; + template void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, const mxnet::TShape &small, @@ -821,37 +894,58 @@ void ReduceAxesBackwardUseInOutImpl(const OpContext& ctx, const std::vector& outputs) { using namespace mshadow; using namespace mshadow::expr; + using namespace mxnet_op; mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &src_shape, &dst_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (dst_shape.ndim() == 2) { - Tensor igrad = - outputs[0].get_with_shape(src_shape.get<2>(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get<2>(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get<2>(), s); - ASSIGN_DISPATCH(igrad, req[0], - broadcast_to(ograd, src_shape)*F(data, broadcast_to(out, src_shape))); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); - } else { - const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor igrad = - outputs[0].get_with_shape(src_shape.get(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get(), s); - Tensor out = - inputs[2].get_with_shape(dst_shape.get(), s); - ASSIGN_DISPATCH(igrad, req[0], - broadcast_to(ograd, src_shape)*F(data, broadcast_to(out, src_shape))); - if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); - } + + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, OType, { + mshadow::Shape<5> in_shape; + mshadow::Shape<5> out_shape; + for (int i = 0; i < 5; ++i) { + if (i < dst_shape.ndim()) { + in_shape[i] = src_shape[i]; + out_shape[i] = dst_shape[i]; + } else { + in_shape[i] = 1; + out_shape[i] = 1; + } + } + if (dst_shape.ndim() == 2) { + Tensor igrad = + outputs[0].get_with_shape(src_shape.get<2>(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get<2>(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get<2>(), s); + MXNET_REQ_TYPE_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, + in_shape, out_shape, src_shape.ndim()); + }); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + } else { + const int ndim = MXNET_SPECIAL_MAX_NDIM; + Tensor igrad = + outputs[0].get_with_shape(src_shape.get(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get(), s); + Tensor out = + inputs[2].get_with_shape(dst_shape.get(), s); + MXNET_REQ_TYPE_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, outputs[0].shape_.Size(), data.dptr_, out.dptr_, igrad.dptr_, ograd.dptr_, + in_shape, out_shape, src_shape.ndim()); + }); + if (normalize) igrad /= scalar(src_shape.Size()/dst_shape.Size()); + } + }); }); } @@ -1090,14 +1184,42 @@ void LpNormCompute(const nnvm::NodeAttrs& attrs, small = ReduceAxesShapeImpl(inputs[0].shape_, param.axis, true, 
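
The Map() body above converts a flat offset in the full input shape into the offset of the matching element in the reduced output, by stripping the coordinate contribution of every reduced axis (those with output size 1). The same mapping as a standalone function; `BroadcastIndex` is an illustrative name, not an MXNet helper:

#include <cstddef>
#include <vector>

size_t BroadcastIndex(size_t i, const std::vector<size_t>& in_shape,
                      const std::vector<size_t>& out_shape) {
  size_t in_stride = 1, out_stride = 1, idx = i, out_idx = i;
  for (int ax = static_cast<int>(in_shape.size()) - 1; ax >= 0; --ax) {
    const size_t d = idx % in_shape[ax];  // coordinate along axis ax
    out_idx -= d * in_stride;             // strip its input contribution
    if (out_shape[ax] != 1) {
      out_idx += d * out_stride;          // re-add it only if the axis survives
    }
    idx /= in_shape[ax];
    in_stride *= in_shape[ax];
    out_stride *= out_shape[ax];
  }
  return out_idx;
}
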
false); } if (param.ord == 1) { - ReduceAxesComputeImpl( - ctx, inputs, req, outputs, small); + ReduceAxesComputeImpl( + ctx, inputs, req, outputs, small); } else if (param.ord == 2) { - ReduceAxesComputeImpl( + ReduceAxesComputeImpl( ctx, inputs, req, outputs, small); } } +template +struct norm_backward_broadcast { + template + MSHADOW_XINLINE static void Map(index_t i, + DType *igrad, + OType *ograd, + DType *data, + mshadow::Shape<5> in_shape, + mshadow::Shape<5> out_shape, + const uint32_t ndim) { + size_t in_stride = 1; + size_t out_stride = 1; + index_t idx = i; + index_t out_idx = i; + for (int iter = ndim - 1; iter >= 0; --iter) { + size_t dim_idx = idx % in_shape[iter]; + out_idx -= dim_idx * in_stride; + if (out_shape[iter] != 1) { + out_idx += dim_idx * out_stride; + } + idx /= in_shape[iter]; + in_stride *= in_shape[iter]; + out_stride *= out_shape[iter]; + } + KERNEL_ASSIGN(igrad[i], req, ograd[out_idx] * mshadow_op::sign::Map(data[i])); + } +}; + template void LpNormGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -1106,6 +1228,7 @@ void LpNormGradCompute(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { using namespace mshadow; using namespace mshadow::expr; + using namespace mxnet_op; if (req[0] == kNullOp) return; const NormParam& param = nnvm::get(attrs.parsed); @@ -1119,27 +1242,46 @@ void LpNormGradCompute(const nnvm::NodeAttrs& attrs, mxnet::TShape src_shape, dst_shape; BroadcastReduceShapeCompact(outputs[0].shape_, small, &src_shape, &dst_shape); Stream *s = ctx.get_stream(); - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - if (dst_shape.ndim() == 2) { - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get<2>(), s); - Tensor igrad = - outputs[0].get_with_shape(src_shape.get<2>(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get<2>(), s); - ASSIGN_DISPATCH(igrad, req[0], - broadcast_to(ograd, src_shape)*F(data)); + mshadow::Shape<5> in_shape; + mshadow::Shape<5> out_shape; + for (int i = 0; i < 5; ++i) { + if (i < dst_shape.ndim()) { + in_shape[i] = src_shape[i]; + out_shape[i] = dst_shape[i]; } else { - const int ndim = MXNET_SPECIAL_MAX_NDIM; - Tensor igrad = - outputs[0].get_with_shape(src_shape.get(), s); - Tensor ograd = - inputs[0].get_with_shape(dst_shape.get(), s); - Tensor data = - inputs[1].get_with_shape(src_shape.get(), s); - ASSIGN_DISPATCH(igrad, req[0], - broadcast_to(ograd, src_shape)*F(data)); + in_shape[i] = 1; + out_shape[i] = 1; } + } + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, OType, { + if (dst_shape.ndim() == 2) { + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get<2>(), s); + Tensor igrad = + outputs[0].get_with_shape(src_shape.get<2>(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get<2>(), s); + MXNET_REQ_TYPE_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, igrad.shape_.Size(), igrad.dptr_, ograd.dptr_, data.dptr_, + in_shape, out_shape, src_shape.ndim()); + }); + } else { + const int ndim = MXNET_SPECIAL_MAX_NDIM; + Tensor igrad = + outputs[0].get_with_shape(src_shape.get(), s); + Tensor ograd = + inputs[0].get_with_shape(dst_shape.get(), s); + Tensor data = + inputs[1].get_with_shape(src_shape.get(), s); + MXNET_REQ_TYPE_SWITCH(req[0], Req, { + Kernel, xpu>::Launch( + s, igrad.shape_.Size(), igrad.dptr_, ograd.dptr_, data.dptr_, + in_shape, out_shape, src_shape.ndim()); + }); + } + }); }); } else if (param.ord == 2) { ReduceAxesBackwardUseInOutImpl(ctx, small, inputs, @@ -1363,6 +1505,16 @@ void PickOpBackward(const 
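
For reference, the two gradient branches above implement the usual norm derivatives: for ord=1 the gradient is sign(x) times the output gradient (hence mshadow_op::sign in norm_backward_broadcast), and for ord=2 it is x divided by the forward norm, broadcast back. A scalar sketch of both:

// Reference semantics, elementwise:
//   ord=1: d||x||_1 / dx_i = sign(x_i)
//   ord=2: d||x||_2 / dx_i = x_i / ||x||_2
float L1NormGradElem(float ograd, float x) {
  const float sign = (x > 0.f) - (x < 0.f);  // -1, 0, or +1
  return ograd * sign;
}
float L2NormGradElem(float ograd, float x, float norm) {
  return ograd * x / norm;  // `norm` is the forward output, broadcast back
}
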
nnvm::NodeAttrs& attrs, .add_argument("data", "NDArray-or-Symbol", "The input") \ .add_arguments(ReduceAxesParam::__FIELDS__()) +#define MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(name) \ + NNVM_REGISTER_OP(name) \ + .set_num_inputs(1) \ + .set_num_outputs(1) \ + .set_attr_parser(AxesParamParser) \ + .set_attr("FInferShape", ReduceMinMaxAxesShape) \ + .set_attr("FInferType", ElemwiseType<1, 1>) \ + .add_argument("data", "NDArray-or-Symbol", "The input") \ + .add_arguments(ReduceAxesParam::__FIELDS__()) + #define MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(name) \ NNVM_REGISTER_OP(name) \ .set_num_outputs(1) \ diff --git a/src/operator/tensor/broadcast_reduce_op_value.cc b/src/operator/tensor/broadcast_reduce_op_value.cc index 52fd61aa110e..f890963c2cf1 100644 --- a/src/operator/tensor/broadcast_reduce_op_value.cc +++ b/src/operator/tensor/broadcast_reduce_op_value.cc @@ -186,7 +186,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_nanprod) .set_num_inputs(3) .set_attr("FCompute", ReduceAxesBackwardUseInOut); -MXNET_OPERATOR_REGISTER_REDUCE(max) +MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(max) .add_alias("max_axis") .describe(get_reduce_axes_description("max", __LINE__)) .set_attr("FCompute", ReduceAxesCompute) @@ -200,7 +200,7 @@ MXNET_OPERATOR_REGISTER_REDUCE_BACKWARD(_backward_max) .set_num_inputs(3) .set_attr("FCompute", ReduceAxesBackwardUseInOut); -MXNET_OPERATOR_REGISTER_REDUCE(min) +MXNET_OPERATOR_REGISTER_MINMAX_REDUCE(min) .add_alias("min_axis") .describe(get_reduce_axes_description("min", __LINE__)) .set_attr("FCompute", ReduceAxesCompute) @@ -352,7 +352,7 @@ Examples:: .set_num_outputs(1) .set_attr_parser(ParamParser) .set_attr("FInferShape", NormShape) -.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferType", NormType) .set_attr("FInferStorageType", LpNormStorageType) .set_attr("FGradient", ReduceGrad{ "_backward_norm" }) .set_attr("FResourceRequest", diff --git a/src/operator/tensor/diag_op-inl.h b/src/operator/tensor/diag_op-inl.h index 1e3c1c9701d4..c95c1ce414f2 100644 --- a/src/operator/tensor/diag_op-inl.h +++ b/src/operator/tensor/diag_op-inl.h @@ -84,19 +84,19 @@ inline mxnet::TShape DiagShapeImpl(const mxnet::TShape& ishape, const int k, auto s = std::min(h, w); if (s < 0) { - s = 0; + s = -1; } if (x1 > x2) { std::swap(x1, x2); } - int32_t n_dim = static_cast(ishape.ndim()) - 1; - mxnet::TShape oshape(n_dim); + int32_t n_dim = ishape.ndim() - 1; + mxnet::TShape oshape(n_dim, -1); // remove axis1 and axis2 and append the new axis to the end uint32_t idx = 0; - for (int32_t i = 0; i <= n_dim; ++i) { + for (int i = 0; i <= n_dim; ++i) { if (i != x1 && i != x2) { oshape[idx++] = ishape[i]; } @@ -114,7 +114,7 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape& ishape = (*in_attrs)[0]; - if (ishape.ndim() == 0) { + if (!mxnet::ndim_is_known(ishape)) { return false; } @@ -129,7 +129,7 @@ inline bool DiagOpShape(const nnvm::NodeAttrs& attrs, } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return out_attrs->at(0).ndim() != 0U; + return shape_is_known(out_attrs->at(0)); } inline bool DiagOpType(const nnvm::NodeAttrs& attrs, diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h index 163b4426cb2b..318254b26b9f 100644 --- a/src/operator/tensor/dot-inl.h +++ b/src/operator/tensor/dot-inl.h @@ -38,7 +38,9 @@ #ifdef __CUDACC__ #include "./dot-inl.cuh" #endif // __CUDACC__ - +#if (MSHADOW_USE_MKL == 1) +#include "sparse_matrix.h" +#endif namespace mxnet { namespace op { @@ -775,13 +777,35 @@ inline void 
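
The new MXNET_OPERATOR_REGISTER_MINMAX_REDUCE macro exists because max and min, unlike sum and prod, have no identity element, so reducing a zero-size input is undefined and ReduceMinMaxAxesShape rejects it up front. A small illustration in plain STL, not the MXNet code path:

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

float SumReduce(const std::vector<float>& v) {
  return std::accumulate(v.begin(), v.end(), 0.f);  // empty input -> 0, well defined
}
float MaxReduce(const std::vector<float>& v) {
  assert(!v.empty());  // mirrors the new CHECK_GT on input size; no identity exists
  return *std::max_element(v.begin(), v.end());
}
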
DotCsrDnsDnsImpl(const OpContext& ctx, } using nnvm::dim_t; - +#if (MSHADOW_USE_MKL == 1) + TShape lhs_shape = lhs.shape(); + TShape rhs_shape = rhs.shape_; +#endif const TBlob data_l = lhs.data(); const TBlob indptr_l = lhs.aux_data(csr::kIndPtr); const TBlob col_idx_l = lhs.aux_data(csr::kIdx); const TBlob& data_r = rhs; const TBlob data_out = *ret; +#if (MSHADOW_USE_MKL == 1) + if (data_l.type_flag_ == mshadow::kFloat32 + && indptr_l.type_flag_ == mshadow::kInt64 + && col_idx_l.type_flag_ == mshadow::kInt64 + && !trans_lhs) { + bool ret = mkl_DotCsrDnsDns(static_cast(indptr_l.dptr_), + static_cast(col_idx_l.dptr_), + data_l.dptr(), + data_r.dptr(), + data_out.dptr(), + lhs_shape[0], + lhs_shape[1], + rhs_shape[1]); + if (ret) { + return; + } + } +#endif + MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type @@ -1217,20 +1241,20 @@ inline bool DotShape(const nnvm::NodeAttrs& attrs, if (Ta) { L[0] = mshadow::Shape1(lshape[0]); L[1] = lshape.ndim() > 1 ? - mxnet::TShape(&lshape[1], &lshape[lshape.ndim()]) : mxnet::TShape(1); + mxnet::TShape(&lshape[1], lshape.end()) : mxnet::TShape(1, 1); } else { L[0] = lshape.ndim() > 1 ? - mxnet::TShape(&lshape[0], &lshape[lshape.ndim()-1]) : mxnet::TShape(1); + mxnet::TShape(&lshape[0], &lshape[lshape.ndim()-1]) : mxnet::TShape(1, 1); L[1] = mshadow::Shape1(lshape[lshape.ndim()-1]); } if (Tb) { R[0] = rshape.ndim() > 1 ? - mxnet::TShape(&rshape[0], &rshape[rshape.ndim()-1]) : mxnet::TShape(1); + mxnet::TShape(&rshape[0], &rshape[rshape.ndim()-1]) : mxnet::TShape(1, 1); R[1] = mshadow::Shape1(rshape[rshape.ndim()-1]); } else { R[0] = mshadow::Shape1(rshape[0]); R[1] = rshape.ndim() > 1 ? - mxnet::TShape(&rshape[1], &rshape[rshape.ndim()]) : mxnet::TShape(1); + mxnet::TShape(&rshape[1], rshape.end()) : mxnet::TShape(1, 1); } if (L[!Ta].Size() != 0 && R[Tb].Size() != 0) { @@ -1238,8 +1262,8 @@ inline bool DotShape(const nnvm::NodeAttrs& attrs, << "dot shape error: " << lshape << " X " << rshape; } std::vector buf; - if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], &L[Ta][L[Ta].ndim()]); - if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], &R[!Tb][R[!Tb].ndim()]); + if (lshape.ndim() > 1) buf.insert(buf.end(), &L[Ta][0], L[Ta].end()); + if (rshape.ndim() > 1) buf.insert(buf.end(), &R[!Tb][0], R[!Tb].end()); mxnet::TShape oshape(buf.begin(), buf.end()); SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); } diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h index 1d2b7c9c1163..73019fa8389b 100644 --- a/src/operator/tensor/elemwise_binary_broadcast_op.h +++ b/src/operator/tensor/elemwise_binary_broadcast_op.h @@ -48,33 +48,32 @@ inline bool BinaryBroadcastShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& rhs = (*in_attrs)[1]; // avoid pre-mature shape inference. 
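
The MSHADOW_USE_MKL block above forwards the no-transpose float32/int64 CSR-times-dense case to mkl_DotCsrDnsDns and falls back to the generic kernels otherwise. For reference, a plain loop computing the same product (zero-based CSR, row-major dense operands; a sketch of the math, not the MKL call):

#include <cstdint>

// y = A * X, with A in CSR form (indptr, col_idx, values), X and y dense.
void CsrDnsDnsRef(const int64_t* indptr, const int64_t* col_idx,
                  const float* values, const float* X, float* y,
                  int rows, int X_columns) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < X_columns; ++c) y[r * X_columns + c] = 0.f;
    for (int64_t j = indptr[r]; j < indptr[r + 1]; ++j) {
      const float v = values[j];
      const float* xrow = X + col_idx[j] * X_columns;  // row of X picked by the column index
      for (int c = 0; c < X_columns; ++c) {
        y[r * X_columns + c] += v * xrow[c];
      }
    }
  }
}
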
- if (lhs.ndim() == 0 || rhs.ndim() == 0) return false; + if (!mxnet::ndim_is_known(lhs) || !mxnet::ndim_is_known(rhs)) return false; if (lhs == rhs) { SHAPE_ASSIGN_CHECK(*out_attrs, 0, lhs); - return true; + return shape_is_known(lhs); } - mxnet::TShape out(std::max(lhs.ndim(), rhs.ndim())); - index_t bl = out.ndim() - lhs.ndim(); - index_t br = out.ndim() - rhs.ndim(); - for (index_t i = 0; i < out.ndim(); ++i) { - index_t l = 1, r = 1; + mxnet::TShape out(std::max(lhs.ndim(), rhs.ndim()), -1); + const int bl = out.ndim() - lhs.ndim(); + const int br = out.ndim() - rhs.ndim(); + for (int i = 0; i < out.ndim(); ++i) { + int l = 1, r = 1; if (i >= bl) l = lhs[i-bl]; if (i >= br) r = rhs[i-br]; + if (!mxnet::dim_size_is_known(l) || !mxnet::dim_size_is_known(r)) continue; if (l != r) { - if (l == 0 || r == 0) { - out[i] = 0; - } else { - CHECK(l == 1 || r == 1) - << "operands could not be broadcast together with shapes " << lhs << " " << rhs; - out[i] = std::max(l, r); - } + // Make it compatible with NumPy. + // For example, (2, 3) cannot broadcast to (2, 0, 3), but (1, 3) can broadcast to (2, 0, 3). + CHECK(l == 1 || r == 1) + << "operands could not be broadcast together with shapes " << lhs << " " << rhs; + out[i] = (l == 1 ? r : l); } else { out[i] = l; } } SHAPE_ASSIGN_CHECK(*out_attrs, 0, out); - return true; + return shape_is_known(lhs) && shape_is_known(rhs) && shape_is_known(out); } inline bool BinaryBroadcastMulStorageType(const nnvm::NodeAttrs& attrs, @@ -146,15 +145,15 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: const mxnet::TShape& oshape, mxnet::TShape *new_lshape, mxnet::TShape *new_rshape, mxnet::TShape *new_oshape) { if (lshape == rshape) return 0; - index_t odim = std::max(oshape.ndim(), broadcast::MAX_DIM); - *new_lshape = mxnet::TShape(odim); - *new_rshape = mxnet::TShape(odim); - *new_oshape = mxnet::TShape(odim); - index_t bl = oshape.ndim() - lshape.ndim(); - index_t br = oshape.ndim() - rshape.ndim(); - index_t j = 0, lprod = 1, rprod = 1, oprod = 1; - for (index_t i = 0; i < oshape.ndim(); ++i) { - index_t l = 1, r = 1, o = oshape[i]; + const int odim = std::max(oshape.ndim(), broadcast::MAX_DIM); + *new_lshape = mxnet::TShape(odim, 1); + *new_rshape = mxnet::TShape(odim, 1); + *new_oshape = mxnet::TShape(odim, 1); + int bl = oshape.ndim() - lshape.ndim(); + int br = oshape.ndim() - rshape.ndim(); + int j = 0, lprod = 1, rprod = 1, oprod = 1; + for (int i = 0; i < oshape.ndim(); ++i) { + int l = 1, r = 1, o = oshape[i]; if (i >= bl) l = lshape[i-bl]; if (i >= br) r = rshape[i-br]; if ((lprod != rprod || l != r) && @@ -176,9 +175,9 @@ inline int BinaryBroadcastShapeCompact(const mxnet::TShape& lshape, const mxnet: } if (j <= broadcast::MAX_DIM) { BROADCAST_NDIM_SWITCH(j, NDim, { - new_lshape->assign(&(*new_lshape)[0], &(*new_lshape)[NDim]); - new_rshape->assign(&(*new_rshape)[0], &(*new_rshape)[NDim]); - new_oshape->assign(&(*new_oshape)[0], &(*new_oshape)[NDim]); + new_lshape->assign(new_lshape->begin(), new_lshape->begin() + NDim); + new_rshape->assign(new_rshape->begin(), new_rshape->begin() + NDim); + new_oshape->assign(new_oshape->begin(), new_oshape->begin() + NDim); }); } else { LOG(FATAL) << "Too many broadcast dimensions with operands " << lshape << " " << rshape; diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 19a9ac8359eb..5114a5d0dbe3 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ 
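
The rewritten check above adopts the NumPy pairwise rule: two dims are compatible only when they are equal or one of them is exactly 1, and unknown (-1) dims are deferred to a later inference pass. Sketch of the per-dimension rule:

#include <cassert>

inline int BroadcastDim(int l, int r) {
  if (l == -1 || r == -1) return -1;  // not enough information yet
  if (l == r) return l;
  assert(l == 1 || r == 1);  // e.g. (2,3) vs (2,0,3) fails; (1,3) vs (2,0,3) is fine
  return (l == 1) ? r : l;
}
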
-413,9 +413,9 @@ bool ReshapeLikeShapeCompute(const nnvm::NodeAttrs &attrs, GetReshapeLikeParams(param, lshape, rshape, &lhs_begin, &lhs_end, &rhs_begin, &rhs_end); - int lhsrank = static_cast(lshape.ndim()); + int lhsrank = lshape.ndim(); int orank = lhsrank + (rhs_end - rhs_begin) - (lhs_end - lhs_begin); - mxnet::TShape oshape(orank); + mxnet::TShape oshape(orank, -1); for (int i = 0; i < lhs_begin; ++i) oshape[i] = lshape[i]; @@ -436,7 +436,7 @@ bool ReshapeLikeShapeCompute(const nnvm::NodeAttrs &attrs, << "shape " << oshape << " because they have different " << "size."; SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } DMLC_REGISTER_PARAMETER(ReshapeLikeParam); @@ -537,7 +537,7 @@ Example:: mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - mxnet::TShape target_shape(1); + mxnet::TShape target_shape(1, -1); target_shape[0] = in_attrs->at(0).ndim(); SHAPE_ASSIGN_CHECK(*out_attrs, 0, target_shape); return !shape_is_none(out_attrs->at(0)); @@ -589,7 +589,7 @@ Example:: mxnet::ShapeVector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, 1U); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); return !shape_is_none(out_attrs->at(0)); }) .set_attr("FInferType", diff --git a/src/operator/tensor/histogram-inl.h b/src/operator/tensor/histogram-inl.h index 51d0bdb6c2b6..7194445d7b52 100644 --- a/src/operator/tensor/histogram-inl.h +++ b/src/operator/tensor/histogram-inl.h @@ -46,13 +46,13 @@ namespace op { struct HistogramParam : public dmlc::Parameter { dmlc::optional bin_cnt; - dmlc::optional> range; + dmlc::optional> range; DMLC_DECLARE_PARAMETER(HistogramParam) { DMLC_DECLARE_FIELD(bin_cnt) .set_default(dmlc::optional()) .describe("Number of bins for uniform case"); DMLC_DECLARE_FIELD(range) - .set_default(dmlc::optional>()) + .set_default(dmlc::optional>()) .describe("The lower and upper range of the bins. if not provided, " "range is simply (a.min(), a.max()). values outside the " "range are ignored. 
the first element of the range must be " @@ -86,9 +86,9 @@ inline bool HistogramOpShape(const nnvm::NodeAttrs& attrs, if (has_cnt) { // if cnt is specified, the output histogram has shape (cnt,) // while output bins has shape (cnt+1,) - const int bin_cnt = param.bin_cnt.value(); - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape({bin_cnt})); - SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape({bin_cnt + 1})); + const dim_t bin_cnt = param.bin_cnt.value(); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, bin_cnt)); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, mxnet::TShape(1, bin_cnt + 1)); } else { // if cnt is not specified, the output histogram has shape (bins.Size() - 1) // while output bins has same shape as input bins @@ -97,11 +97,11 @@ inline bool HistogramOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(oshape.ndim(), 1U) << "bins argument should be an 1D vector"; CHECK_GE(oshape.Size(), 2U) << "number of bounds should be >= 2"; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape({(oshape[0] - 1)})); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, oshape[0] - 1)); SHAPE_ASSIGN_CHECK(*out_attrs, 1, in_attrs->at(1)); } - return !shape_is_none(out_attrs->at(0)) && !shape_is_none(out_attrs->at(1)) && + return shape_is_known(out_attrs->at(0)) && shape_is_known(out_attrs->at(1)) && out_attrs->at(0).Size() == out_attrs->at(1).Size() - 1; } diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 8979531fef4e..e8c5e884588b 100644 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -145,20 +145,20 @@ inline bool EmbeddingOpShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *out_attrs) { using namespace mshadow; const mxnet::TShape &dshape = (*in_attrs)[embedding::kData]; - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; const ParamType& param = nnvm::get(attrs.parsed); SHAPE_ASSIGN_CHECK(*in_attrs, embedding::kWeight, Shape2(param.input_dim, param.output_dim)); out_attrs->clear(); - mxnet::TShape oshape(dshape.ndim()+1); - for (size_t i = 0; i < dshape.ndim(); ++i) { + mxnet::TShape oshape(dshape.ndim()+1, -1); + for (int i = 0; i < dshape.ndim(); ++i) { oshape[i] = dshape[i]; } oshape[dshape.ndim()] = param.output_dim; out_attrs->push_back(oshape); - return true; + return shape_is_known(oshape); } template @@ -682,18 +682,18 @@ inline bool TakeOpShape(const nnvm::NodeAttrs& attrs, using namespace mshadow; const mxnet::TShape &arrshape = (*in_attrs)[take_::kArr]; const mxnet::TShape &idxshape = (*in_attrs)[take_::kIdx]; - if (idxshape.ndim() == 0U || idxshape.Size() == 0U) return false; + if (!shape_is_known(idxshape)) return false; const TakeParam& param = nnvm::get(attrs.parsed); if (param.mode == take_::kRaise) { LOG(FATAL) << "Raise is not supported for the time being..."; } - CHECK(param.axis >= -1 * (int)arrshape.ndim() && param.axis < (int)arrshape.ndim()) + CHECK(param.axis >= -1 * arrshape.ndim() && param.axis < arrshape.ndim()) << "Axis should be in the range of [-r, r-1] where r is the rank of input tensor"; out_attrs->clear(); const index_t actual_axis = param.axis + ((param.axis < 0) ? 
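
EmbeddingOpShape above appends output_dim to the data (index) shape, while TakeOpShape splices the index shape into the array shape at the chosen axis. A sketch of the embedding rule, with a hypothetical helper name:

#include <vector>

// (d0, ..., dk) indices into a (input_dim, output_dim) weight table
// produce a (d0, ..., dk, output_dim) result.
std::vector<int> EmbeddingOutShape(std::vector<int> data_shape, int output_dim) {
  data_shape.push_back(output_dim);
  return data_shape;  // e.g. (32, 20) indices, output_dim=50 -> (32, 20, 50)
}
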
arrshape.ndim() : 0); - mxnet::TShape oshape(idxshape.ndim() + arrshape.ndim() - 1); + mxnet::TShape oshape(idxshape.ndim() + arrshape.ndim() - 1, -1); for (index_t i = 0; i < idxshape.ndim(); ++i) { oshape[i + actual_axis] = idxshape[i]; } @@ -705,7 +705,7 @@ inline bool TakeOpShape(const nnvm::NodeAttrs& attrs, } } out_attrs->push_back(oshape); - return true; + return shape_is_known(oshape); } inline bool TakeOpType(const nnvm::NodeAttrs& attrs, @@ -1170,6 +1170,7 @@ inline bool OneHotOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); // The shape of indices const mxnet::TShape& ishape = (*in_attrs)[0]; + if (!shape_is_known(ishape)) return false; int depth = 0; double on_value = 1.0; @@ -1177,13 +1178,13 @@ inline bool OneHotOpShape(const nnvm::NodeAttrs& attrs, int dtype = mshadow::kFloat32; GetOneHotParams(param, &depth, &on_value, &off_value, &dtype); - mxnet::TShape oshape(ishape.ndim() + 1); + mxnet::TShape oshape(ishape.ndim() + 1, -1); for (index_t i = 0; i < ishape.ndim(); ++i) { oshape[i] = ishape[i]; } oshape[oshape.ndim()-1] = depth; SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } inline bool OneHotOpType(const nnvm::NodeAttrs& attrs, @@ -1270,15 +1271,17 @@ inline bool GatherNDShape(const nnvm::NodeAttrs& attrs, CHECK_LE(ishape[0], 10) << "gather_nd supports indexing along at most 10 dimensions."; - mxnet::TShape oshape(ishape.ndim() - 1 + dshape.ndim() - ishape[0]); + mxnet::TShape oshape(ishape.ndim() - 1 + dshape.ndim() - ishape[0], -1); - for (size_t i = 0; i < ishape.ndim() - 1; ++i) oshape[i] = ishape[i+1]; + for (int i = 0; i < ishape.ndim() - 1; ++i) { + oshape[i] = ishape[i+1]; + } for (int i = 0; i < dshape.ndim() - ishape[0]; ++i) { oshape[ishape.ndim()-1+i] = dshape[ishape[0] + i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } inline bool GatherNDType(const nnvm::NodeAttrs& attrs, @@ -1370,7 +1373,7 @@ inline bool ScatterNDShape(const nnvm::NodeAttrs& attrs, bool valid = dshape.ndim() == ishape.ndim() - 1 + oshape.ndim() - ishape[0]; - for (size_t i = 0; i < ishape.ndim() - 1; ++i) { + for (int i = 0; i < ishape.ndim() - 1; ++i) { valid = valid && dshape[i] == ishape[i+1]; } for (int i = 0; i < oshape.ndim() - ishape[0]; ++i) { diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index fe1a1f62954a..b2e3830064ae 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ struct InitOpParam : public dmlc::Parameter { int dtype; DMLC_DECLARE_PARAMETER(InitOpParam) { DMLC_DECLARE_FIELD(shape) - .set_default(mxnet::TShape()) + .set_default(mxnet::TShape(0, 1)) .describe("The shape of the output"); DMLC_DECLARE_FIELD(ctx) .set_default("") @@ -213,14 +214,13 @@ inline bool InitShape(const nnvm::NodeAttrs& attrs, const ParamType& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 0U); CHECK_EQ(out_attrs->size(), 1U); - if ((*out_attrs)[0].ndim() != 0 && param.shape.ndim() == 0) return true; - for (unsigned int i=0 ; i < param.shape.ndim() ; ++i) { - if (param.shape[i] < 0U) { - LOG(FATAL) << "Shape cannot contain negative values " << param.shape; - } + mxnet::TShape param_shape = param.shape; + if (!Imperative::Get()->is_np_comp()) { + common::ConvertToNumpyShape(¶m_shape); } - SHAPE_ASSIGN_CHECK(*out_attrs, 0, param.shape); - return true; + if (shape_is_known((*out_attrs)[0]) && !shape_is_known(param_shape)) 
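
GatherNDShape above drops the leading index axis, keeps the remaining index axes, and appends the data axes not consumed by indexing. The same rule as a sketch; `GatherNDOutShape` is an illustrative name:

#include <vector>

// indices (M, i1, ..., ik) into data (d0, ..., d_{n-1})
// produce (i1, ..., ik, dM, ..., d_{n-1}).
std::vector<int> GatherNDOutShape(const std::vector<int>& ishape,
                                  const std::vector<int>& dshape) {
  const int M = ishape[0];  // leading data axes consumed by indexing
  std::vector<int> oshape(ishape.begin() + 1, ishape.end());
  oshape.insert(oshape.end(), dshape.begin() + M, dshape.end());
  return oshape;  // e.g. indices (2, 6) into data (4, 5, 6) -> (6, 6)
}
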
return true; + SHAPE_ASSIGN_CHECK(*out_attrs, 0, param_shape); + return shape_is_known(out_attrs->at(0)); } template @@ -278,6 +278,8 @@ inline bool InitStorageType(const nnvm::NodeAttrs& attrs, */ template void Fill(mshadow::Stream *s, const TBlob& b, const OpReqType req, ValueType val) { + // If b is a zero-size tensor, do nothing. + if (b.Size() == 0) return; if (req != kNullOp) { const size_t size = b.Size(); if (val == 0) { diff --git a/src/operator/tensor/la_op.h b/src/operator/tensor/la_op.h index 5e18e0ef5a25..db4607fe9262 100644 --- a/src/operator/tensor/la_op.h +++ b/src/operator/tensor/la_op.h @@ -384,7 +384,7 @@ mshadow::Tensor LaOpFlatten(const TBlob& blob, } // Collapse ranges [0,axis-1] and [axis+1,ndim-2]. CHECK_EQ(dim, 4); - mxnet::TShape shape(dim); + mxnet::TShape shape(dim, -1); shape[0] = 1; for (int i = 0; i < axis; ++i) { shape[0] *= blob.shape_[i]; diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index fa108158b5c9..e99741b70bb6 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -49,17 +49,17 @@ namespace op { struct ReshapeParam : public dmlc::Parameter { mxnet::TShape target_shape; bool keep_highest; - nnvm::Tuple shape; + mxnet::Tuple shape; bool reverse; DMLC_DECLARE_PARAMETER(ReshapeParam) { DMLC_DECLARE_FIELD(shape) - .set_default(nnvm::Tuple()) + .set_default(mxnet::Tuple()) .describe("The target shape"); DMLC_DECLARE_FIELD(reverse) .set_default(false) .describe("If true then the special values are inferred from right to left"); DMLC_DECLARE_FIELD(target_shape) - .set_default(mxnet::TShape()) + .set_default(mxnet::TShape(0, -1)) .describe("(Deprecated! Use ``shape`` instead.) " "Target new shape. One and only one dim can be 0, " "in which case it will be inferred from the rest of dims"); @@ -71,11 +71,11 @@ struct ReshapeParam : public dmlc::Parameter { }; template -inline mxnet::TShape InferReshapeShape(const nnvm::Tuple& shape, - const mxnet::TShape& dshape, bool reverse) { +inline mxnet::TShape InferReshapeShape(const mxnet::Tuple& shape, + const mxnet::TShape& dshape, bool reverse) { std::vector dshape_vec; std::vector param_shape_vec(shape.begin(), shape.end()); - for (index_t i = 0; i < dshape.ndim(); ++i) { + for (int i = 0; i < dshape.ndim(); ++i) { dshape_vec.push_back(dshape[i]); } std::vector tmp; @@ -102,28 +102,31 @@ inline mxnet::TShape InferReshapeShape(const nnvm::Tuple& shape, } else if (proposed_dim == -2) { // copy all remaining dims from source while (src_idx < dshape_len) { - size_t dn = dshape_vec[src_idx++]; + const int dn = dshape_vec[src_idx++]; tmp.push_back(dn); } } else if (proposed_dim == -3) { // merge two dims from source CHECK_LT(src_idx, dshape_len-1); - size_t d1 = dshape_vec[src_idx++]; - size_t d2 = dshape_vec[src_idx++]; - size_t dn = d1 * d2; - tmp.push_back(dn); + const int d1 = dshape_vec[src_idx++]; + const int d2 = dshape_vec[src_idx++]; + if (!mxnet::dim_size_is_known(d1) || !mxnet::dim_size_is_known(d2)) { + tmp.push_back(-1); + } else { + tmp.push_back(d1 * d2); + } } else if (proposed_dim == -4) { // split the source dim s into two dims // read the left dim and then the right dim (either can be -1) CHECK_LT(i + 2, params_len); CHECK_LT(src_idx, dshape_len); - size_t d0 = dshape_vec[src_idx++]; + const int d0 = dshape_vec[src_idx++]; IType d1 = param_shape_vec[++i]; IType d2 = param_shape_vec[++i]; CHECK(d1 != -1 || d2 != -1) << "Split dims cannot both be -1."; - if (d1 == -1) d1 = d0 / d2; - if (d2 == -1) d2 = d0 / d1; - CHECK(d1 * d2 
== static_cast(d0) || static_cast(d0) == IType(0)) << + if (d1 == -1 && d0 >= 0) d1 = d0 / d2; // d0 must be known to do this + if (d2 == -1 && d0 >= 0) d2 = d0 / d1; // d0 must be known to do this + CHECK(d1 * d2 == static_cast(d0) || static_cast(d0) == IType(-1)) << "Split dims " << d1 << ", " << d2 << " do not divide original dim " << d0; tmp.push_back(d1); tmp.push_back(d2); @@ -135,12 +138,12 @@ inline mxnet::TShape InferReshapeShape(const nnvm::Tuple& shape, } if (inf_idx >= 0) { - if (dshape.Size() > 0) { + if (shape_is_known(dshape)) { IType new_size = 1; for (IType x : tmp) new_size *= x; tmp[inf_idx] = dshape.Size() / new_size; } else { - tmp[inf_idx] = 0; + tmp[inf_idx] = -1; } } if (reverse) { @@ -153,24 +156,24 @@ inline mxnet::TShape InferReshapeShape(const nnvm::Tuple& shape, } inline bool ReverseReshapeInferShape(mxnet::TShape *in, const mxnet::TShape& out) { - if (in->Size() && out.Size()) { + if (shape_is_known(*in) && shape_is_known(out)) { return true; - } else if (!out.Size()) { + } else if (!shape_is_known(out)) { return false; } else { int zero_axis = -1; - int non_zero_prod = 1; - for (index_t i = 0; i < in->ndim(); i++) { - if ((*in)[i] == 0) { + int known_dim_size_prod = 1; + for (int i = 0; i < in->ndim(); i++) { + if (!mxnet::dim_size_is_known(*in, i)) { if (zero_axis != -1) return false; // more than 1 zero found. else zero_axis = i; } else { - non_zero_prod *= (*in)[i]; + known_dim_size_prod *= (*in)[i]; } } - (*in)[zero_axis] = out.Size() / non_zero_prod; + (*in)[zero_axis] = out.Size() / known_dim_size_prod; return true; } } @@ -182,11 +185,11 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape &dshape = (*in_attrs)[0]; - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; mxnet::TShape oshape; if (param_.shape.ndim() != 0) { oshape = InferReshapeShape(param_.shape, dshape, param_.reverse); - } else if (param_.target_shape.ndim()) { + } else if (param_.target_shape.ndim() != -1) { LOG(INFO) << "Using target_shape will be deprecated."; oshape = param_.target_shape; int neg_count = 0; @@ -195,7 +198,7 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, if (param_.keep_highest) { oshape[0] = dshape[0]; } - for (index_t i = start_idx; i < oshape.ndim(); ++i) { + for (int i = start_idx; i < oshape.ndim(); ++i) { if (oshape[i] == 0) { neg_count++; inf_idx = i; @@ -206,13 +209,16 @@ inline bool ReshapeShape(const nnvm::NodeAttrs& attrs, oshape[inf_idx] = dshape.Size() / oshape.Size(); } } else { - return (*out_attrs)[0].ndim() && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); + return shape_is_known((*out_attrs)[0]) + && ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } ReverseReshapeInferShape(&dshape, oshape); +#if 0 CHECK_EQ(oshape.Size(), dshape.Size()) << "Target shape size is different to source. 
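
ReverseReshapeInferShape above fills in at most one unknown dimension from the known output size, which is also how the -1 code in the shape parameter is resolved. A standalone sketch of that rule (hypothetical helper; -1 marks the unknown entry):

#include <cstdint>
#include <vector>

bool InferUnknownDim(std::vector<int>* shape, int64_t total_size) {
  int unknown = -1;
  int64_t known_prod = 1;
  for (int i = 0; i < static_cast<int>(shape->size()); ++i) {
    if ((*shape)[i] == -1) {
      if (unknown != -1) return false;  // more than one unknown: give up
      unknown = i;
    } else {
      known_prod *= (*shape)[i];
    }
  }
  if (unknown != -1) (*shape)[unknown] = static_cast<int>(total_size / known_prod);
  return true;
}
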
" << "Target: " << oshape << "\nSource: " << dshape; +#endif SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); return ReverseReshapeInferShape(&(*in_attrs)[0], (*out_attrs)[0]); } @@ -223,9 +229,9 @@ inline bool FlattenShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U) << "Input: [data]"; CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape &dshape = (*in_attrs)[0]; - if (dshape.ndim() == 0) return false; - uint32_t target_dim = 1; - for (uint32_t i = 1; i < dshape.ndim(); ++i) { + if (!shape_is_known(dshape)) return false; + int target_dim = 1; + for (int i = 1; i < dshape.ndim(); ++i) { target_dim *= dshape[i]; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::Shape2(dshape[0], target_dim)); @@ -235,7 +241,7 @@ inline bool FlattenShape(const nnvm::NodeAttrs& attrs, struct TransposeParam : public dmlc::Parameter { mxnet::TShape axes; DMLC_DECLARE_PARAMETER(TransposeParam) { - DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape(0, -1)) .describe("Target axis order. By default the axes will be inverted."); } @@ -314,8 +320,8 @@ void Transpose(const nnvm::NodeAttrs& attrs, const TransposeParam& param = nnvm::get(attrs.parsed); CHECK_EQ(req[0], kWriteTo) << "Transpose does not support inplace"; if (param.axes.ndim() == 0) { - mxnet::TShape axes = mxnet::TShape(inputs[0].ndim()); - for (index_t i = 0; i < axes.ndim(); ++i) { + mxnet::TShape axes(inputs[0].ndim(), -1); + for (int i = 0; i < axes.ndim(); ++i) { axes[i] = axes.ndim() - 1 - i; } TransposeImpl(ctx.run_ctx, inputs[0], outputs[0], axes); @@ -332,20 +338,20 @@ inline bool TransposeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& shp = (*in_attrs)[0]; CHECK_LE(shp.ndim(), 6U) << "Transpose support at most 6 dimensions"; - mxnet::TShape ret(shp.ndim()); + mxnet::TShape ret(shp.ndim(), -1); if (param.axes.ndim() == 0) { - for (index_t i = 0; i < shp.ndim(); ++i) { + for (int i = 0; i < shp.ndim(); ++i) { ret[i] = shp[shp.ndim()-1-i]; } } else { CHECK_EQ(shp.ndim(), param.axes.ndim()); - for (size_t i = 0; i < shp.ndim(); ++i) { + for (int i = 0; i < shp.ndim(); ++i) { CHECK(param.axes[i] < static_cast(shp.ndim())); ret[i] = shp[param.axes[i]]; } } SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); - return true; + return shape_is_known(ret); } @@ -366,7 +372,7 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, const ExpandDimParam& param = nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); - if (in_attrs->at(0).ndim() == 0U && out_attrs->at(0).ndim() == 0U) { + if (!mxnet::ndim_is_known(in_attrs->at(0)) && !mxnet::ndim_is_known(out_attrs->at(0))) { return false; } @@ -374,7 +380,7 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, mxnet::TShape& oshape = (*out_attrs)[0]; int indim = ishape.ndim(); bool unknown_ishape = false; - if (0 == indim) { + if (-1 == indim) { indim = oshape.ndim() - 1; unknown_ishape = true; } @@ -386,27 +392,27 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, CHECK(axis >= 0 && axis <= indim) << "axis must be in the range [" << -indim << ", " << indim << "] (" << param.axis << " provided)"; - mxnet::TShape ret(indim + 1); + mxnet::TShape ret(indim + 1, -1); for (int i = 0; i < axis; ++i) { - ret[i] = (unknown_ishape? 0 : ishape[i]); + ret[i] = (unknown_ishape? -1 : ishape[i]); } ret[axis] = 1; for (int i = axis+1; i < indim+1; ++i) { - ret[i] = (unknown_ishape? 0 : ishape[i-1]); + ret[i] = (unknown_ishape? 
-1 : ishape[i-1]); } SHAPE_ASSIGN_CHECK(*out_attrs, 0, ret); - ret = mxnet::TShape(indim); + ret = mxnet::TShape(indim, -1); for (int i = 0; i < axis; ++i) ret[i] = oshape[i]; for (int i = axis+1; i < indim+1; ++i) ret[i-1] = oshape[i]; SHAPE_ASSIGN_CHECK(*in_attrs, 0, ret); - return true; + return shape_is_known(in_attrs->at(0)) && shape_is_known(out_attrs->at(0)); } // Currently MKLDNN only supports step = 1 or step has no value inline bool SupportMKLDNNSlice(const SliceParam& param) { if (param.step.ndim() == 0U) return true; - for (uint32_t i = 0; i < param.step.ndim(); ++i) { + for (int i = 0; i < param.step.ndim(); ++i) { if (param.step[i].has_value() && param.step[i].value() != 1) return false; } @@ -589,11 +595,11 @@ void SliceCsrImpl(const SliceParam ¶m, const OpContext& ctx, const mxnet::TShape ishape = in.shape(); const mxnet::TShape oshape = out.shape(); - uint32_t N = ishape.ndim(); - mxnet::TShape begin(N), end(N); - for (uint32_t i = 0; i < N; ++i) { + int N = ishape.ndim(); + mxnet::TShape begin(N, -1), end(N, -1); + for (int i = 0; i < N; ++i) { int s = 0; - if (param.begin[i]) { + if (i < param.begin.ndim() && param.begin[i]) { s = *param.begin[i]; if (s < 0) s += ishape[i]; } @@ -634,9 +640,9 @@ void SliceEx(const nnvm::NodeAttrs& attrs, template inline void GetIndexRange(const mxnet::TShape& dshape, - const nnvm::Tuple>& param_begin, - const nnvm::Tuple>& param_end, - const nnvm::Tuple>& param_step, + const mxnet::Tuple>& param_begin, + const mxnet::Tuple>& param_end, + const mxnet::Tuple>& param_step, common::StaticArray* begin, common::StaticArray* end, common::StaticArray* step) { @@ -651,12 +657,12 @@ inline void GetIndexRange(const mxnet::TShape& dshape, << "Static array size=" << ndim << " is not equal to data shape ndim=" << dshape.ndim(); - if (param_step.ndim() != 0U) { + if (param_step.ndim() != 0) { CHECK_EQ(param_step.ndim(), param_begin.ndim()) << "step and begin must have the same length"; } - for (index_t i = 0; i < param_begin.ndim(); ++i) { + for (int i = 0; i < param_begin.ndim(); ++i) { index_t s = param_step.ndim() != 0U && param_step[i].has_value() ? 
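
ExpandDimShape above now infers in both directions: the forward rule inserts a size-1 axis at `axis`, and the inverse drops it, so unknown (-1) dims can flow from output back to input. The forward rule as a sketch:

#include <vector>

std::vector<int> ExpandAt(const std::vector<int>& in, int axis) {
  std::vector<int> out(in.begin(), in.begin() + axis);
  out.push_back(1);                                 // the inserted axis
  out.insert(out.end(), in.begin() + axis, in.end());
  return out;  // ExpandAt({2, 3}, 1) -> {2, 1, 3}
}
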
param_step[i].value() : 1; CHECK_NE(s, 0) << "slice op step[" << i << "] cannot be 0"; @@ -723,21 +729,21 @@ inline bool SliceOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape& dshape = (*in_attrs)[0]; - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; const SliceParam& param = nnvm::get(attrs.parsed); mxnet::TShape oshape = dshape; MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (index_t i = 0; i < param.begin.ndim(); ++i) { + for (int i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &oshape); } - }); + }) SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return !shape_is_none(dshape) && !shape_is_none(oshape); + return shape_is_known(oshape); } template @@ -943,11 +949,11 @@ inline bool SliceAssignOpShape(const nnvm::NodeAttrs& attrs, MXNET_NDIM_SWITCH(dshape.ndim(), ndim, { common::StaticArray begin, end, step; GetIndexRange(dshape, param.begin, param.end, param.step, &begin, &end, &step); - for (index_t i = 0; i < param.begin.ndim(); ++i) { + for (int i = 0; i < param.begin.ndim(); ++i) { const int b = begin[i], e = end[i], s = step[i]; SetSliceOpOutputDimSize(i, b, e, s, &vshape); } - }); + }) SHAPE_ASSIGN_CHECK(*in_attrs, 1, vshape); SHAPE_ASSIGN_CHECK(*out_attrs, 0, dshape); return true; @@ -997,8 +1003,8 @@ void SliceAssignOpForward(const nnvm::NodeAttrs& attrs, struct SliceAssignScalarParam : public dmlc::Parameter { double scalar; - nnvm::Tuple> begin, end; - nnvm::Tuple> step; + mxnet::Tuple> begin, end; + mxnet::Tuple> step; DMLC_DECLARE_PARAMETER(SliceAssignScalarParam) { DMLC_DECLARE_FIELD(scalar) .set_default(0) @@ -1008,7 +1014,7 @@ struct SliceAssignScalarParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(end) .describe("ending indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(step) - .set_default(nnvm::Tuple>()) + .set_default(mxnet::Tuple>()) .describe("step for the slice operation, supports negative values."); } }; @@ -1019,7 +1025,7 @@ inline bool SliceAssignScalarOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape& dshape = (*in_attrs)[0]; - if (dshape.ndim() == 0U || dshape.Size() == 0U) return false; + if (!shape_is_known(dshape)) return false; SHAPE_ASSIGN_CHECK(*out_attrs, 0, dshape); return true; } @@ -1114,9 +1120,9 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const mxnet::TShape& int* axis, index_t* begin, index_t* end) { *axis = param.axis; if (*axis < 0) { - *axis += static_cast(ishape.ndim()); + *axis += ishape.ndim(); } - CHECK(*axis < static_cast(ishape.ndim()) && *axis >= 0) << + CHECK(*axis < ishape.ndim() && *axis >= 0) << "Transformed axis must be smaller than the source ndim and larger than zero! 
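
GetIndexRange above normalizes each axis the way Python slicing does: negative begin/end wrap once around the axis, step must be nonzero, and the resulting length rounds toward the step direction. Sketch of the two scalar rules (hypothetical helpers):

#include <cassert>

inline int NormalizeIndex(int idx, int len) {
  return idx < 0 ? idx + len : idx;  // e.g. -2 with len=5 -> 3
}
inline int SliceLength(int begin, int end, int step) {
  assert(step != 0);                 // mirrors CHECK_NE(s, 0) above
  const int adj = (step > 0) ? -1 : 1;
  const int n = (end - begin + step + adj) / step;
  return n > 0 ? n : 0;             // SliceLength(0, 5, 2) == 3; SliceLength(4, -1, -2) == 3
}
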
Recieved axis=" << param.axis << ", src_ndim=" << ishape.ndim() << ", transformed axis=" << *axis; index_t axis_size = static_cast(ishape[*axis]); @@ -1125,7 +1131,7 @@ inline void GetSliceAxisParams(const SliceAxisParam& param, const mxnet::TShape& if (*begin < 0) { *begin += axis_size; } - if (axis_size) { + if (axis_size > 0) { if (!static_cast(param.end)) { *end = axis_size; } else { @@ -1153,19 +1159,24 @@ inline bool SliceAxisShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); mxnet::TShape& ishape = (*in_attrs)[0]; + if (!mxnet::ndim_is_known(ishape)) return false; int axis; index_t begin, end; GetSliceAxisParams(param, ishape, &axis, &begin, &end); - mxnet::TShape shape(ishape.ndim()); - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) == axis) { + if (!mxnet::dim_size_is_known(ishape, axis)) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); + return false; + } + mxnet::TShape shape(ishape.ndim(), -1); + for (int i = 0; i < ishape.ndim(); ++i) { + if (i == axis) { shape[i] = static_cast(end - begin); } else { shape[i] = ishape[i]; } } SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); - return true; + return shape_is_known(shape); } @@ -1181,7 +1192,7 @@ void SliceAxis(const nnvm::NodeAttrs& attrs, int axis; index_t begin, end; GetSliceAxisParams(param, inputs[0].shape_, &axis, &begin, &end); - int ndim = static_cast(outputs[0].ndim()); + int ndim = outputs[0].ndim(); if (axis + 1 == ndim) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -1216,7 +1227,7 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, int axis; index_t begin, end; GetSliceAxisParams(param, outputs[0].shape_, &axis, &begin, &end); - int ndim = static_cast(outputs[0].shape_.ndim()); + int ndim = outputs[0].shape_.ndim(); if (axis + 1 == ndim) { MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { @@ -1252,9 +1263,9 @@ void SliceAxisGrad_(const nnvm::NodeAttrs& attrs, } struct SliceLikeParam : public dmlc::Parameter { - mxnet::TShape axes; + mxnet::Tuple axes; DMLC_DECLARE_PARAMETER(SliceLikeParam) { - DMLC_DECLARE_FIELD(axes).set_default(mxnet::TShape()) + DMLC_DECLARE_FIELD(axes).set_default(mxnet::Tuple()) .describe("List of axes on which input data will be sliced according to the " "corresponding size of the second input. By default will slice on " "all axes. Negative axes are supported."); @@ -1273,7 +1284,7 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(ishape.ndim(), from_shape.ndim()) << "By default slice_axis performs slice on all axes, but ndim mismatch " "for inputs: " << ishape.ndim() << " vs. 
" << from_shape.ndim(); - for (index_t i = 0; i < ishape.ndim(); ++i) { + for (int i = 0; i < ishape.ndim(); ++i) { CHECK_GE(ishape[i], from_shape[i]) << "Slice axis " << i << " with size " << from_shape[i] << "exceeds limit of input with size " << ishape[i]; @@ -1281,13 +1292,13 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, SHAPE_ASSIGN_CHECK(*out_attrs, 0, from_shape); } else { mxnet::TShape shape(ishape); - for (index_t i = 0; i < param.axes.ndim(); ++i) { - int axis = static_cast(param.axes[i]); + for (int i = 0; i < param.axes.ndim(); ++i) { + int axis = param.axes[i]; if (axis < 0) { - axis += static_cast(ishape.ndim()); + axis += ishape.ndim(); } CHECK_GE(axis, 0) - << "Slice axis: " << static_cast(param.axes[i]) << " too small"; + << "Slice axis: " << param.axes[i] << " too small"; CHECK_GT(ishape.ndim(), axis) << "Slice axis: " << axis << " exceeds first input: " << ishape.ndim(); CHECK_GT(from_shape.ndim(), axis) @@ -1304,39 +1315,39 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs, inline void SliceLikeInferRanges(const mxnet::TShape& dshape, const mxnet::TShape& fshape, - const mxnet::TShape& axes, - nnvm::Tuple>* param_begin, - nnvm::Tuple>* param_end, - nnvm::Tuple>* param_step) { + const mxnet::Tuple& axes, + mxnet::Tuple>* param_begin, + mxnet::Tuple>* param_end, + mxnet::Tuple>* param_step) { std::vector> pb(dshape.ndim()); std::vector> pe(dshape.ndim()); std::vector> ps(dshape.ndim()); if (axes.ndim() == 0) { - for (index_t i = 0; i < dshape.ndim(); ++i) { + for (int i = 0; i < dshape.ndim(); ++i) { pb[i] = 0; pe[i] = fshape[i]; ps[i] = 1; } } else { - for (index_t i = 0; i < axes.ndim(); ++i) { - int axis = static_cast(axes[i]); + for (int i = 0; i < axes.ndim(); ++i) { + int axis = axes[i]; if (axis < 0) { - axis += static_cast(dshape.ndim()); + axis += dshape.ndim(); } CHECK_GE(axis, 0) - << "Slice axis: " << static_cast(axes[i]) << " too small"; - CHECK_LT(axis, static_cast(dshape.ndim())) + << "Slice axis: " << axes[i] << " too small"; + CHECK_LT(axis, dshape.ndim()) << "Slice axis: " << axis << " exceeds first input: " << dshape.ndim(); - CHECK_LT(axis, static_cast(fshape.ndim())) + CHECK_LT(axis, fshape.ndim()) << "Slice axis: " << axis << " exceeds first input: " << fshape.ndim(); pb[axis] = 0; pe[axis] = fshape[axis]; ps[axis] = 1; } } - *param_begin = nnvm::Tuple>(pb.begin(), pb.end()); - *param_end = nnvm::Tuple>(pe.begin(), pe.end()); - *param_step = nnvm::Tuple>(ps.begin(), ps.end()); + *param_begin = mxnet::Tuple>(pb.begin(), pb.end()); + *param_end = mxnet::Tuple>(pe.begin(), pe.end()); + *param_step = mxnet::Tuple>(ps.begin(), ps.end()); } template @@ -1355,9 +1366,9 @@ void SliceLikeForward(const nnvm::NodeAttrs& attrs, const TBlob& out = outputs[0]; const mxnet::TShape& ishape = data.shape_; const mxnet::TShape& from_shape = inputs[1].shape_; - nnvm::Tuple> param_begin; - nnvm::Tuple> param_end; - nnvm::Tuple> param_step; + mxnet::Tuple> param_begin; + mxnet::Tuple> param_end; + mxnet::Tuple> param_step; SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, ¶m_step); MXNET_NDIM_SWITCH(data.ndim(), ndim, { @@ -1403,9 +1414,9 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs, const mxnet::TShape& ishape = ograd.shape_; const mxnet::TShape& from_shape = outputs[1].shape_; - nnvm::Tuple> param_begin; - nnvm::Tuple> param_end; - nnvm::Tuple> param_step; + mxnet::Tuple> param_begin; + mxnet::Tuple> param_end; + mxnet::Tuple> param_step; SliceLikeInferRanges(ishape, from_shape, param.axes, ¶m_begin, ¶m_end, 
¶m_step); MXNET_NDIM_SWITCH(ograd.ndim(), ndim, { @@ -1546,7 +1557,7 @@ inline void GetRepeatParams(const RepeatParam& param, const mxnet::TShape& ishap CHECK_GE(*repeats, 0) << "repeats cannot be a negative number"; *axisOpt = param.axis; if (static_cast(*axisOpt)) { - int ndims = static_cast(ishape.ndim()); + int ndims = ishape.ndim(); int axis = axisOpt->value(); if (axis < 0) { axis += ndims; @@ -1565,34 +1576,33 @@ inline bool RepeatOpShape(const nnvm::NodeAttrs& attrs, int repeats = 0; dmlc::optional axisOpt; GetRepeatParams(param, ishape, &repeats, &axisOpt); - // If 0 repeats, return an empty 0 dim array + // If 0 repeats, return an empty 1-dim, 0-size array if (0 == repeats) { - SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape()); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 0)); return true; } // If repeats > 0, multiply the size of the corresponding axis by repeats if (static_cast(axisOpt)) { - int ndims = static_cast(ishape.ndim()); + int ndims = ishape.ndim(); int axis = axisOpt.value(); if (axis < 0) { axis += ndims; } - mxnet::TShape shape(ishape.ndim()); - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) == axis) { - shape[i] = static_cast(repeats) * ishape[i]; + mxnet::TShape shape(ishape.ndim(), -1); + for (int i = 0; i < ishape.ndim(); ++i) { + if (i == axis) { + shape[i] = repeats * ishape[i]; } else { shape[i] = ishape[i]; } } SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); } else { // If axis is not input by user, return a flat 1D array of size = in.size*repeats - mxnet::TShape shape(1); - shape[0] = ishape.Size() * static_cast(repeats); + mxnet::TShape shape(1, ishape.Size() * repeats); SHAPE_ASSIGN_CHECK(*out_attrs, 0, shape); } - return true; + return shape_is_known(out_attrs->at(0)); } inline bool RepeatOpType(const nnvm::NodeAttrs& attrs, @@ -1620,16 +1630,16 @@ inline std::pair ReshapeInputOutputForRepeatOp( const int repeats) { if (static_cast(axisOpt)) { int axis = axisOpt.value(); - int ndim = static_cast(ishape.ndim()); + int ndim = ishape.ndim(); if (axis < 0) { axis += ndim; } - CHECK(axis >= 0 && axis < static_cast(ishape.ndim())) << "Invalid input of axis"; + CHECK(axis >= 0 && axis < ishape.ndim()) << "Invalid input of axis"; // reshape the input tensor by adding a dim at the (axis+1)-th dim - mxnet::TShape rshape(ishape.ndim()+1); + mxnet::TShape rshape(ishape.ndim()+1, 1); // the shape we want to broadcast to - mxnet::TShape bshape(rshape.ndim()); + mxnet::TShape bshape(rshape.ndim(), 1); int i = 0; while (i <= axis) { rshape[i] = bshape[i] = ishape[i]; @@ -1637,7 +1647,7 @@ inline std::pair ReshapeInputOutputForRepeatOp( } rshape[i] = 1; bshape[i] = repeats; - while (i < static_cast(ishape.ndim())) { + while (i < ishape.ndim()) { rshape[i+1] = ishape[i]; bshape[i+1] = ishape[i]; ++i; @@ -1648,11 +1658,11 @@ inline std::pair ReshapeInputOutputForRepeatOp( // reshape the tensor into shape (ishape.Size(), 1) // then add one dim at axis = 1 and broadcast to // shape (ishape.Size(), repeats) - mxnet::TShape rshape(2); + mxnet::TShape rshape(2, 1); rshape[0] = ishape.Size(); rshape[1] = 1; - mxnet::TShape bshape(2); + mxnet::TShape bshape(2, 1); bshape[0] = rshape[0]; bshape[1] = repeats; return std::make_pair(rshape, bshape); @@ -1667,7 +1677,7 @@ void RepeatOpForward(const nnvm::NodeAttrs& attrs, const std::vector& outputs) { const TBlob& iTBlob = inputs[0]; const mxnet::TShape& ishape = iTBlob.shape_; - if (ishape.ndim() == 0) return; + if (!shape_is_known(ishape)) return; int repeats = 0; dmlc::optional axisOpt; @@ -1711,7 +1721,7 @@ 
void RepeatOpBackward(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); const mxnet::TShape& oshape = outputs[0].shape_; - if (oshape.ndim() == 0) return; + if (!shape_is_known(oshape)) return; int repeats = 0; dmlc::optional axisOpt; @@ -1732,12 +1742,12 @@ void RepeatOpBackward(const nnvm::NodeAttrs& attrs, inputs[0].type_flag_, inputs[0].dev_id()); std::vector newInputs = {iblob}; - ReduceAxesComputeImpl( + ReduceAxesComputeImpl( ctx, newInputs, req, newOutputs, rshapes.first); } struct TileParam : public dmlc::Parameter { - mxnet::TShape reps; + mxnet::Tuple reps; DMLC_DECLARE_PARAMETER(TileParam) { DMLC_DECLARE_FIELD(reps) .describe("The number of times for repeating the tensor a. Each dim size of reps" @@ -1755,19 +1765,22 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const TileParam& param = nnvm::get(attrs.parsed); const mxnet::TShape& ishape = (*in_attrs)[0]; - const mxnet::TShape& reps = param.reps; + if (!shape_is_known(ishape)) { + return false; + } + const mxnet::Tuple& reps = param.reps; // If reps is empty, return a identical input array - if (reps.ndim() == 0 || ishape.ndim() == 0) { + if (reps.ndim() == 0) { SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape); return true; } - for (size_t i = 0; i < reps.ndim(); ++i) { + for (int i = 0; i < reps.ndim(); ++i) { CHECK_GT(reps[i], 0) << "invalid reps=" << i << ", dim size must be greater than zero"; } - mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim())); - int i1 = static_cast(ishape.ndim()) - 1; - int i2 = static_cast(reps.ndim()) - 1; - for (int i = static_cast(oshape.ndim()) - 1; i >= 0; --i) { + mxnet::TShape oshape(std::max(ishape.ndim(), reps.ndim()), -1); + int i1 = ishape.ndim() - 1; + int i2 = reps.ndim() - 1; + for (int i = oshape.ndim() - 1; i >= 0; --i) { if (i1 >= 0 && i2 >= 0) { oshape[i] = ishape[i1--] * reps[i2--]; } else if (i1 >= 0) { @@ -1777,7 +1790,7 @@ inline bool TileOpShape(const nnvm::NodeAttrs& attrs, } } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } inline bool TileOpType(const nnvm::NodeAttrs& attrs, @@ -1801,20 +1814,20 @@ inline bool TileOpType(const nnvm::NodeAttrs& attrs, */ inline std::pair ReshapeInputOutputForTileOp( const mxnet::TShape& ishape, - const mxnet::TShape& reps) { + const mxnet::Tuple& reps) { if (ishape.ndim() == 0 || reps.ndim() == 0) { return std::make_pair(ishape, ishape); } // The shape we want to broadcast to - mxnet::TShape bshape(std::max(ishape.ndim(), reps.ndim()) * 2); + mxnet::TShape bshape(std::max(ishape.ndim(), reps.ndim()) * 2, 1); // The shape of the input tensor after adding new axes before each dim - mxnet::TShape rshape(bshape.ndim()); + mxnet::TShape rshape(bshape.ndim(), 1); - int i1 = static_cast(ishape.ndim()) - 1; - int i2 = static_cast(reps.ndim()) - 1; - for (int i = static_cast(bshape.ndim()) - 1; i >= 0; --i) { + int i1 = ishape.ndim() - 1; + int i2 = reps.ndim() - 1; + for (int i = bshape.ndim() - 1; i >= 0; --i) { if (0 == (i & 1)) { bshape[i] = (i2 >= 0? 
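
TileOpShape above aligns `ishape` and `reps` on the right and multiplies elementwise, padding the shorter of the two with 1s. Sketch:

#include <vector>

std::vector<int> TileOutShape(std::vector<int> ishape, std::vector<int> reps) {
  while (ishape.size() < reps.size()) ishape.insert(ishape.begin(), 1);
  while (reps.size() < ishape.size()) reps.insert(reps.begin(), 1);
  std::vector<int> out(ishape.size());
  for (size_t i = 0; i < out.size(); ++i) out[i] = ishape[i] * reps[i];
  return out;  // tile((2, 3), reps=(2,)) -> (2, 6); reps=(2, 2) -> (4, 6)
}
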
reps[i2--] : 1); rshape[i] = 1; @@ -1854,10 +1867,10 @@ void TileOpForward(const nnvm::NodeAttrs& attrs, if (inputs[0].Size() == 0) return; const mxnet::TShape& ishape = inputs[0].shape_; - const mxnet::TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps; + const mxnet::Tuple<int>& reps = nnvm::get<TileParam>(attrs.parsed).reps; // If any one of the number in reps is zero, return immediately - for (index_t i = 0; i < reps.ndim(); ++i) { + for (int i = 0; i < reps.ndim(); ++i) { if (0 == reps[i]) return; } @@ -1896,10 +1909,10 @@ void TileOpBackward(const nnvm::NodeAttrs& attrs, if (inputs[0].Size() == 0) return; const mxnet::TShape& oshape = outputs[0].shape_; - const mxnet::TShape& reps = nnvm::get<TileParam>(attrs.parsed).reps; + const mxnet::Tuple<int>& reps = nnvm::get<TileParam>(attrs.parsed).reps; // If any one of the number in reps is zero, return immediately - for (index_t i = 0; i < reps.ndim(); ++i) { + for (int i = 0; i < reps.ndim(); ++i) { if (0 == reps[i]) return; } @@ -1914,12 +1927,12 @@ void TileOpBackward(const nnvm::NodeAttrs& attrs, inputs[0].type_flag_, inputs[0].dev_id()); std::vector<TBlob> newInputs = {iblob}; - ReduceAxesComputeImpl( + ReduceAxesComputeImpl( ctx, newInputs, req, newOutputs, rshapes.first); } struct ReverseParam : public dmlc::Parameter<ReverseParam> { - nnvm::Tuple<int> axis; + mxnet::Tuple<int> axis; DMLC_DECLARE_PARAMETER(ReverseParam) { DMLC_DECLARE_FIELD(axis) .describe("The axis which to reverse elements."); @@ -1990,10 +2003,10 @@ void ReverseOpForward(const nnvm::NodeAttrs& attrs, std::vector<index_t> trailing_(param.axis.ndim()); index_t reverse_index = 0; for (int axis : param.axis) { - CHECK_LT(axis, static_cast<int>(ishape.ndim())); + CHECK_LT(axis, ishape.ndim()); stride_[reverse_index] = ishape[axis]; trailing_[reverse_index] = 1; - for (index_t i2 = axis + 1; i2 < ishape.ndim(); ++i2) { + for (int i2 = axis + 1; i2 < ishape.ndim(); ++i2) { trailing_[reverse_index] *= ishape[i2]; } reverse_index++; @@ -2054,9 +2067,9 @@ inline bool StackOpShape(const nnvm::NodeAttrs& attrs, for (const mxnet::TShape& i : (*in_attrs)) { shape_assign(&dshape, i); } - if (dshape.ndim() == 0) return false; + if (!shape_is_known(dshape)) return false; - mxnet::TShape oshape(dshape.ndim() + 1); + mxnet::TShape oshape(dshape.ndim() + 1, -1); int axis = CheckAxis(param.axis, oshape.ndim()); for (int i = 0; i < axis; ++i) { oshape[i] = dshape[i]; @@ -2067,7 +2080,7 @@ } SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape); - return true; + return shape_is_known(oshape); } @@ -2140,10 +2153,10 @@ void StackOpBackward(const nnvm::NodeAttrs& attrs, } struct SqueezeParam : public dmlc::Parameter<SqueezeParam> { - dmlc::optional<mxnet::TShape> axis; + dmlc::optional<mxnet::Tuple<int>> axis; DMLC_DECLARE_PARAMETER(SqueezeParam) { DMLC_DECLARE_FIELD(axis) - .set_default(dmlc::optional<mxnet::TShape>()) + .set_default(dmlc::optional<mxnet::Tuple<int>>()) .describe("Selects a subset of the single-dimensional entries in the shape."
" If an axis is selected with shape entry greater than one, an error is raised."); } @@ -2156,7 +2169,7 @@ struct SqueezeParam : public dmlc::Parameter { inline size_t SqueezeShapeHelper(mxnet::TShape* shape) { CHECK(shape != nullptr); size_t count = 0; - for (size_t i = 0; i < shape->ndim(); ++i) { + for (int i = 0; i < shape->ndim(); ++i) { if ((*shape)[i] == 0) { ++count; } else { @@ -2174,12 +2187,12 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); const mxnet::TShape& dshape = in_attrs->at(0); const int dndim = dshape.ndim(); - if (shape_is_none(dshape)) return false; + if (!shape_is_known(dshape)) return false; mxnet::TShape oshape = dshape; if (param.axis.has_value()) { // preprocess axis - mxnet::TShape axes = param.axis.value(); - for (size_t i = 0; i < axes.ndim(); ++i) { + mxnet::Tuple axes = param.axis.value(); + for (int i = 0; i < axes.ndim(); ++i) { if (axes[i] < 0) { axes[i] += dndim; CHECK_GE(axes[i], 0) @@ -2194,7 +2207,7 @@ inline bool SqueezeShape(const nnvm::NodeAttrs& attrs, oshape[axes[i]] = 0; } } else { - for (size_t i = 0; i < oshape.ndim(); ++i) { + for (int i = 0; i < oshape.ndim(); ++i) { if (oshape[i] == 1) oshape[i] = 0; } } @@ -2223,7 +2236,7 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Depth To Space requires exactly 4D tensor"; - mxnet::TShape expected_out(4); + mxnet::TShape expected_out(4, -1); mxnet::TShape& in_shape = in_attrs->at(0); int block = param.block_size; @@ -2241,14 +2254,14 @@ inline bool DepthToSpaceOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] / (block * block); - size_t i = 2; + int i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] * block; ++i; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); - return true; + return shape_is_known(expected_out); } inline bool DepthToSpaceOpType(const nnvm::NodeAttrs& attrs, @@ -2387,7 +2400,7 @@ inline bool SpaceToDepthOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(out_attrs->size(), 1U); CHECK_EQ(in_attrs->at(0).ndim(), 4) << "Operation Space To Depth requires exactly 4D tensor"; - mxnet::TShape expected_out(in_attrs->at(0).ndim()); + mxnet::TShape expected_out(in_attrs->at(0).ndim(), -1); mxnet::TShape& in_shape = in_attrs->at(0); int block = param.block_size; @@ -2408,14 +2421,14 @@ inline bool SpaceToDepthOpShape(const nnvm::NodeAttrs& attrs, expected_out[0] = in_shape[0]; expected_out[1] = in_shape[1] * block * block; - uint32_t i = 2; + int i = 2; while (i < expected_out.ndim()) { expected_out[i] = in_shape[i] / block; ++i; } SHAPE_ASSIGN_CHECK(*out_attrs, 0, expected_out); - return true; + return shape_is_known(expected_out); } inline bool SpaceToDepthOpType(const nnvm::NodeAttrs& attrs, @@ -2556,7 +2569,7 @@ struct SplitParam : public dmlc::Parameter { }; // struct SplitParam inline mxnet::TShape GetSplitIndices(const mxnet::TShape& ishape, int axis, int sections) { - mxnet::TShape indices(sections+1); + mxnet::TShape indices(sections+1, -1); indices[0] = 0; int64_t section_size = ishape[axis] / sections; for (int i = 0; i < sections; ++i) { @@ -2588,7 +2601,7 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, CHECK_EQ(in_attrs->size(), 1U); mxnet::TShape dshape = in_attrs->at(split_enum::kData); mxnet::TShape ishape = in_attrs->at(split_enum::kData); - if (dshape.ndim() == 0) return false; + if (!mxnet::ndim_is_known(dshape)) return false; if (param.axis >= 0) { 
CHECK_LT(static_cast(param.axis), dshape.ndim()); } else { @@ -2603,7 +2616,7 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, int num_outputs = (param.sections > 0) ? indices.ndim() - 1 : indices.ndim(); // Pre-compute squeezed output shape for future usage mxnet::TShape squeezed_dshape = dshape; - for (int d = real_axis; d < static_cast<int>(squeezed_dshape.ndim()) - 1; ++d) { + for (int d = real_axis; d < squeezed_dshape.ndim() - 1; ++d) { squeezed_dshape[d] = squeezed_dshape[d+1]; } squeezed_dshape = mxnet::TShape(&squeezed_dshape[0], &squeezed_dshape[squeezed_dshape.ndim()-1]); @@ -2635,7 +2648,7 @@ inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, back_calculate_dshape[real_axis] += (*out_attrs)[i][real_axis]; } } - for (int d = real_axis + 1; d < static_cast<int>(ishape.ndim()); ++d) { + for (int d = real_axis + 1; d < ishape.ndim(); ++d) { if (param.squeeze_axis) { back_calculate_dshape[d] = (*out_attrs)[0][d - 1]; } else { diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 1431fef13594..b80c9a54510f 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -410,8 +410,8 @@ Examples:: "transpose", n, ograds, {}, std::unordered_map<std::string, std::string>()); } else { - mxnet::TShape axes = mxnet::TShape(param.axes.ndim()); - for (index_t i = 0; i < axes.ndim(); ++i) { + mxnet::TShape axes = mxnet::TShape(param.axes.ndim(), -1); + for (int i = 0; i < axes.ndim(); ++i) { axes[param.axes[i]] = i; } std::ostringstream os; diff --git a/src/operator/tensor/ordering_op-inl.h b/src/operator/tensor/ordering_op-inl.h index 5a95e05ffb65..1dda90104205 100644 --- a/src/operator/tensor/ordering_op-inl.h +++ b/src/operator/tensor/ordering_op-inl.h @@ -149,7 +149,7 @@ inline void ParseTopKParam(const mxnet::TShape& src_shape, const TopKParam& param << src_shape.ndim() << ", found axis=" << *axis; *batch_size = src_shape.Size() / src_shape[*axis]; *element_num = src_shape[*axis]; - if (*axis != static_cast<int>(src_shape.ndim()) - 1) { + if (*axis != src_shape.ndim() - 1) { *do_transpose = true; } } diff --git a/src/operator/tensor/slice-inl.h b/src/operator/tensor/slice-inl.h index 4e94cbeda46c..78a2bd8c7b45 100644 --- a/src/operator/tensor/slice-inl.h +++ b/src/operator/tensor/slice-inl.h @@ -34,15 +34,15 @@ namespace mxnet { namespace op { struct SliceParam : public dmlc::Parameter<SliceParam> { - nnvm::Tuple<dmlc::optional<int>> begin, end; - nnvm::Tuple<dmlc::optional<int>> step; + mxnet::Tuple<dmlc::optional<int>> begin, end; + mxnet::Tuple<dmlc::optional<int>> step; DMLC_DECLARE_PARAMETER(SliceParam) { DMLC_DECLARE_FIELD(begin) .describe("starting indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(end) .describe("ending indices for the slice operation, supports negative indices."); DMLC_DECLARE_FIELD(step) - .set_default(nnvm::Tuple<dmlc::optional<int>>()) + .set_default(mxnet::Tuple<dmlc::optional<int>>()) .describe("step for the slice operation, supports negative values."); } bool operator==(const SliceParam& other) const { diff --git a/src/operator/tensor/sparse_retain-inl.h b/src/operator/tensor/sparse_retain-inl.h index 951bf80b81b8..04860e6f369f 100644 --- a/src/operator/tensor/sparse_retain-inl.h +++ b/src/operator/tensor/sparse_retain-inl.h @@ -290,7 +290,7 @@ void SparseRetainOpForwardRspImpl(mshadow::Stream<xpu> *s, Kernel<set_zero, xpu>::Launch(s, output_data.Size(), output_data.dptr<DType>()); MSHADOW_IDX_TYPE_SWITCH(output_idx.type_flag_, RType, { // row index data type MSHADOW_TYPE_SWITCH(idx_data.type_flag_, IType, { // index array data type - if (input_idx.Size() == input_nd.shape()[0]) { // input rsp is dense + if (input_idx.Size() ==
static_cast<size_t>(input_nd.shape()[0])) { // input rsp is dense using namespace mshadow; // copy indices Tensor<xpu, 1, RType> output_idx_tensor = output_idx.FlatTo1D<xpu, RType>(s); diff --git a/src/operator/tensor/square_sum-inl.h b/src/operator/tensor/square_sum-inl.h index 016b383117bc..c2e3182c6a1e 100644 --- a/src/operator/tensor/square_sum-inl.h +++ b/src/operator/tensor/square_sum-inl.h @@ -434,14 +434,16 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, " when ograd_stype = kRowSparseStorage"; CHECK_EQ(ograd.shape().ndim(), 2U); const TBlob ograd_row_idx = ograd.aux_data(rowsparse::kIdx); - CHECK(ograd_row_idx.Size() == in_row_idx.Size() || in_row_idx.Size() == in_data.shape_[0]); + CHECK(ograd_row_idx.Size() == in_row_idx.Size() || + in_row_idx.Size() == static_cast<size_t>(in_data.shape_[0])); igrad->CheckAndAlloc({ograd.aux_shape(rowsparse::kIdx)}); const TBlob& igrad_data = igrad->data(); const TBlob igrad_row_idx = igrad->aux_data(rowsparse::kIdx); MSHADOW_IDX_TYPE_SWITCH(igrad_row_idx.type_flag_, IType, { // when ograd_row_idx and in_row_idx have the same size and input is not a full rsp // ograd_row_idx and in_row_idx are expected to have the same elements - if (in_row_idx.Size() != input.shape()[0]) { // if input data is not a full rsp + if (in_row_idx.Size() != static_cast<size_t>(input.shape()[0])) { + // if input data is not a full rsp CHECK_EQ(ograd_row_idx.Size(), in_row_idx.Size()) << "SquareSumRspGradImpl only supports" " equal ograd_row_idx and" " input_row_idx when ograd and" @@ -452,7 +454,8 @@ void SquareSumRspGradImpl(const nnvm::NodeAttrs& attrs, } MSHADOW_TYPE_SWITCH(igrad_data.type_flag_, DType, { MXNET_ASSIGN_REQ_SWITCH(req, req_type, { - if (in_row_idx.Size() != input.shape()[0]) { // input data is not a full rsp + if (in_row_idx.Size() != static_cast<size_t>(input.shape()[0])) { + // input data is not a full rsp Kernel, xpu>::Launch( s, igrad_data.Size(), igrad_row_idx.dptr<IType>(), igrad_data.dptr<DType>(), ograd_row_idx.dptr<IType>(), diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h index adea941bda13..f1fac9ae8ddd 100644 --- a/src/profiler/profiler.h +++ b/src/profiler/profiler.h @@ -608,6 +608,12 @@ struct ProfileCounter : public ProfileObject { return IncrementValue(static_cast<uint64_t>(v)); } } + + inline bool operator >=(int64_t v) { + CHECK_GE(v, 0); + return value_ >= static_cast<uint64_t>(v); + } + /*! \brief operator: object = v */ inline ProfileCounter& operator = (uint64_t v) { SetValue(v); diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h index bcbe7e7e3ffd..5ab5983267eb 100644 --- a/src/profiler/storage_profiler.h +++ b/src/profiler/storage_profiler.h @@ -66,7 +66,11 @@ class DeviceStorageProfiler { Init(); // In case of bug which tries to free first const size_t idx = prof->DeviceIndex(handle.ctx.dev_type, handle.ctx.dev_id); CHECK_LT(idx, mem_counters_.size()) << "Invalid device index: " << idx; - *mem_counters_[idx] -= handle.size; + if (*mem_counters_[idx] >= handle.size) { + *mem_counters_[idx] -= handle.size; + } else { + *mem_counters_[idx] = 0; + } } } } diff --git a/tests/cpp/engine/engine_shutdown_test.cc b/tests/cpp/engine/engine_shutdown_test.cc new file mode 100644 index 000000000000..893d08502c3a --- /dev/null +++ b/tests/cpp/engine/engine_shutdown_test.cc @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file engine_shutdown_test.cc + * \brief Tests engine shutdown for possible crashes +*/ +#include <mxnet/ndarray.h> + +#include "../src/engine/engine_impl.h" +#include "../include/test_util.h" + +/** + * This test will help ensure we don't crash during engine shutdown. + * The crash happens during a static destructor call, so this test may pass and then cause a test-run process crash. + */ +TEST(EngineShutdown, stop_without_crashing) { + static std::unique_ptr<mxnet::NDArray> ndArray; + { + auto engine = mxnet::Engine::_GetSharedRef(); + ndArray = std::make_unique<mxnet::NDArray>(mxnet::Context::CPU()); + engine->Stop(); + } +} diff --git a/tests/cpp/engine/threaded_engine_test.cc b/tests/cpp/engine/threaded_engine_test.cc index 405f3b30a176..ef3aec18e529 100644 --- a/tests/cpp/engine/threaded_engine_test.cc +++ b/tests/cpp/engine/threaded_engine_test.cc @@ -42,7 +42,7 @@ * present the following workload * n = reads.size() * data[write] = (data[reads[0]] + ... data[reads[n]]) / n - * std::this_thread::sleep_for(std::chrono::microsecons(time)); + * std::this_thread::sleep_for(std::chrono::microseconds(time)); */ struct Workload { std::vector<int> reads; @@ -76,7 +76,7 @@ void GenerateWorkload(int num_workloads, int num_var, /** * evaluate a single workload */ -void EvaluateWorload(const Workload& wl, std::vector<double>* data) { +void EvaluateWorkload(const Workload& wl, std::vector<double>* data) { double tmp = 0; for (int i : wl.reads) tmp += data->at(i); data->at(wl.write) = tmp / (wl.reads.size() + 1); @@ -88,9 +88,9 @@ void EvaluateWorload(const Workload& wl, std::vector<double>* data) { /** * evaluate a list of workload, return the time used */ -double EvaluateWorloads(const std::vector<Workload>& workloads, - mxnet::Engine* engine, - std::vector<double>* data) { +double EvaluateWorkloads(const std::vector<Workload>& workloads, + mxnet::Engine* engine, + std::vector<double>* data) { using namespace mxnet; double t = dmlc::GetTime(); std::vector<Engine::VarHandle> vars; @@ -103,10 +103,10 @@ double EvaluateWorloads(const std::vector<Workload>& workloads, for (const auto& wl : workloads) { if (wl.reads.size() == 0) continue; if (engine == NULL) { - EvaluateWorload(wl, data); + EvaluateWorkload(wl, data); } else { auto func = [wl, data](RunContext ctx, Engine::CallbackOnComplete cb) { - EvaluateWorload(wl, data); cb(); + EvaluateWorkload(wl, data); cb(); }; std::vector<Engine::VarHandle> reads; for (auto i : wl.reads) { @@ -159,7 +159,7 @@ TEST(Engine, RandSumExpr) { std::vector<std::vector<double>> data(num_engine); for (int i = 0; i < num_engine; ++i) { data[i].resize(num_var, 1.0); - t[i] += EvaluateWorloads(workloads, engine[i], &data[i]); + t[i] += EvaluateWorkloads(workloads, engine[i], &data[i]); } for (int i = 1; i < num_engine; ++i) { diff --git a/tests/cpp/include/test_mkldnn.h b/tests/cpp/include/test_mkldnn.h index a379dab7bf90..f1682772a14a 100644 --- a/tests/cpp/include/test_mkldnn.h +++ b/tests/cpp/include/test_mkldnn.h @@ -49,7 +49,7 @@ inline static mkldnn::memory::primitive_desc GetMemPD(const mxnet::TShape s, int
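Editor's aside (not part of the patch): the test hunks below, like many in this patch, replace mxnet::TShape s(n) with mxnet::TShape s(n, -1). Once 0 becomes a legal dimension size, an explicit -1 is needed to mark a dimension as not yet inferred. A rough Python sketch of the convention; shape_is_known here is a hypothetical stand-in for the C++ helper of the same name:

UNKNOWN = -1  # placeholder for a not-yet-inferred dimension

def shape_is_known(shape):
    # a shape is known only when every dimension is a concrete, non-negative size
    return all(dim >= 0 for dim in shape)

assert not shape_is_known([UNKNOWN, 3])  # -1 dims are still unknown
assert shape_is_known([0, 3])            # zero-size dims are legal and known

The mkldnn test hunks resume below.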
inline static mkldnn::memory::primitive_desc GetExpandedMemPD( mkldnn::memory::primitive_desc pd, float scale, int dim = 0) { CHECK(dim < pd.desc().data.ndims) << "dimension cannot be larger than total dimensions of input"; - mxnet::TShape s(pd.desc().data.ndims); + mxnet::TShape s(pd.desc().data.ndims, -1); for (size_t i = 0; i < pd.desc().data.ndims; i++) s[i] = pd.desc().data.dims[i]; s[dim] = static_cast(s[dim] * scale); @@ -165,7 +165,7 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = false std::vector<mkldnn::memory::primitive_desc> pds; { // 1D - mxnet::TShape s(1); + mxnet::TShape s(1, -1); s[0] = 279936; shapes.push_back(s); pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::x)); @@ -175,7 +175,7 @@ } { // 2D - mxnet::TShape s(2); + mxnet::TShape s(2, -1); s[0] = 96; s[1] = 2916; shapes.push_back(s); @@ -187,12 +187,12 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = false } { // 4D - mxnet::TShape s1(4); + mxnet::TShape s1(4, -1); s1[0] = 10; s1[1] = 96; s1[2] = 54; s1[3] = 54; shapes.push_back(s1); pds.push_back(GetMemPD(s1, dtype, mkldnn::memory::format::nchw)); - mxnet::TShape s2(4); + mxnet::TShape s2(4, -1); s2[0] = 96; s2[1] = 3; s2[2] = 11; s2[3] = 11; shapes.push_back(s2); pds.push_back(GetMemPD(s2, dtype, mkldnn::memory::format::oihw)); @@ -204,7 +204,7 @@ inline static TestArrayShapes GetTestArrayShapes(bool spatial_data_format = false } { // 5D - mxnet::TShape s(5); + mxnet::TShape s(5, -1); s[0] = 96; s[1] = 1; s[2] = 3; s[3] = 11; s[4] = 11; shapes.push_back(s); pds.push_back(GetMemPD(s, dtype, mkldnn::memory::format::goihw)); @@ -259,7 +259,7 @@ enum ArrayTypes { inline NDArray CreateKernelNDArray(mxnet::TShape kernel, int num_filters, mxnet::TShape input, bool is_deconv = false) { CHECK_EQ(kernel.ndim(), 2) << "mkldnn only supports 2d filters on 4d inputs"; - mxnet::TShape target_shape(4); + mxnet::TShape target_shape(4, -1); target_shape[0] = is_deconv ? input[1] : num_filters; target_shape[1] = is_deconv ? num_filters : input[1]; target_shape[2] = kernel[0]; @@ -470,7 +470,7 @@ inline std::vector<NDArrayAttrs> GetTestOutputArrays( in_arrs.emplace_back(arr0.Slice(1, shape[0] + 1), "Reshaped NDArray"); } - mxnet::TShape s(1); + mxnet::TShape s(1, -1); if (types & ArrayTypes::NormalReused) { // Type 5. // Get a reused version. @@ -528,7 +528,7 @@ inline std::vector<NDArrayAttrs> GetTestOutputArrays( // Type 8, 9. // Get a reused version.
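Editor's aside (not part of the patch): the "reused version" hunks here flatten a backing buffer to one dimension (s[0] = shape.Size()) and reinterpret it with AsArray, so several test outputs can share one allocation. A rough numpy analogue of that pattern, with numpy standing in for NDArray:

import numpy as np

shape = (3, 4, 5)
buf = np.empty(int(np.prod(shape)), dtype=np.float32)  # 1-d backing buffer of shape.Size() elements
reused = buf.reshape(shape)                            # same memory, reinterpreted shape; no copy
assert reused.base is buf

The reused-version hunk itself resumes below.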
- mxnet::TShape s(1); + mxnet::TShape s(1, -1); s[0] = shape.Size(); NDArray arr = NDArray(s, Context()); arr = arr.AsArray(shape, arr.dtype()); diff --git a/tests/cpp/include/test_util.h b/tests/cpp/include/test_util.h index e0caddbcd027..b0114e1721ef 100644 --- a/tests/cpp/include/test_util.h +++ b/tests/cpp/include/test_util.h @@ -353,14 +353,14 @@ inline StreamType& print_blob_(const RunContext& ctx, if (dim == 1) { // probably a 1d tensor (mshadow::Tensor is deprecated) - TBlob changed(blob.dptr<DType>(), mxnet::TShape(3), blob.dev_mask(), blob.dev_id()); + TBlob changed(blob.dptr<DType>(), mxnet::TShape(3, -1), blob.dev_mask(), blob.dev_id()); changed.shape_[0] = 1; changed.shape_[1] = 1; changed.shape_[2] = blob.shape_[0]; return print_blob_(ctx, &os, changed, false, false, add_endl); } else if (dim == 2) { // probably a 2d tensor (mshadow::Tensor is deprecated) - TBlob changed(blob.dptr<DType>(), mxnet::TShape(4), blob.dev_mask(), blob.dev_id()); + TBlob changed(blob.dptr<DType>(), mxnet::TShape(4, -1), blob.dev_mask(), blob.dev_id()); changed.shape_[0] = 1; changed.shape_[1] = 1; changed.shape_[2] = blob.shape_[0]; diff --git a/tests/cpp/misc/serialization.cc b/tests/cpp/misc/serialization.cc index 77014238c2fa..2509a43c27ee 100644 --- a/tests/cpp/misc/serialization.cc +++ b/tests/cpp/misc/serialization.cc @@ -48,7 +48,7 @@ TEST(SerializerTest, OutputMapCorrect) { std::map<std::string, std::tuple<uint32_t, mxnet::TShape, int, int>> output_map; output_map.emplace("output_0", std::make_tuple(1, mxnet::TShape({23, 12, 63, 432}), 0, 1)); output_map.emplace("another_output", std::make_tuple(2, mxnet::TShape({23, 123}), 14, -23)); - output_map.emplace("last_output", std::make_tuple(0, mxnet::TShape({0}), -1, 0)); + output_map.emplace("last_output", std::make_tuple(0, mxnet::TShape(1, 0), -1, 0)); std::string serialized_data; common::Serialize(output_map, &serialized_data); std::map<std::string, std::tuple<uint32_t, mxnet::TShape, int, int>> deserialized_output_map; diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index d74493a0f7fb..ed0e70b831f1 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -1266,7 +1266,7 @@ static void testSaveAndLoad(const std::vector& dims, ChannelAxisTestData<DType> data; data.channel_data_ = inputChannelData; - mxnet::TShape shape(dims.size()); + mxnet::TShape shape(dims.size(), -1); for (size_t i = 0, n = dims.size(); i < n; ++i) { shape[i] = index_t(dims[i]); } @@ -1322,7 +1322,7 @@ static mxnet::TShape MakeShape(const std::vector& shape, } CHECK_LT(channelAxis, shape.size() + 1); const index_t dim = index_t(shape.size()) + 1; - mxnet::TShape newShape(dim); + mxnet::TShape newShape(dim, -1); for (size_t x = 0; x < static_cast<size_t>(channelAxis); ++x) { newShape[x] = index_t(shape[x]); } diff --git a/tests/cpp/operator/mkldnn_operator_test.cc b/tests/cpp/operator/mkldnn_operator_test.cc index 559ab5da0ccc..961785dcfc87 100644 --- a/tests/cpp/operator/mkldnn_operator_test.cc +++ b/tests/cpp/operator/mkldnn_operator_test.cc @@ -916,13 +916,13 @@ void TestFullyConnectedOp(const OpAttrs &forward_attrs, const OpAttrs &backwards if (in_shape.ndim() < 2) continue; - mxnet::TShape wt_shape(2); + mxnet::TShape wt_shape(2, -1); wt_shape[0] = num_hid; wt_shape[1] = GetFCWeightDim2(in_shape); NDArray weights(wt_shape, Context()); InitDefaultArray(&weights, false); - mxnet::TShape bias_shape(1); + mxnet::TShape bias_shape(1, -1); bias_shape[0] = num_hid; NDArray bias(bias_shape, Context()); InitDefaultArray(&bias, false); @@ -931,7 +931,7 @@ void TestFullyConnectedOp(const OpAttrs &forward_attrs, const OpAttrs &backwards inputs[1] =
&weights; inputs[2] = &bias; - mxnet::TShape out_shape(2); + mxnet::TShape out_shape(2, -1); out_shape[0] = in_shape[0]; out_shape[1] = num_hid; diff --git a/tests/nightly/JenkinsfileForBinaries b/tests/nightly/JenkinsfileForBinaries index 53e1c30e188f..6545d59f0e09 100755 --- a/tests/nightly/JenkinsfileForBinaries +++ b/tests/nightly/JenkinsfileForBinaries @@ -19,6 +19,7 @@ //This is a Jenkinsfile for nightly tests. The format and some functions have been picked up from the top-level Jenkinsfile mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' +mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so' node('utility') { // Loading the utilities requires a node context unfortunately @@ -39,6 +40,24 @@ core_logic: { utils.pack_lib('gpu', mx_lib) } } + }, + 'CPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_CPU) { + ws('workspace/build-cpu-int64') { + utils.init_git() + utils.docker_run('ubuntu_nightly_cpu', 'build_ubuntu_cpu_large_tensor', false) + utils.pack_lib('cpu_int64', mx_cmake_lib, true) + } + } + }, + 'GPU: USE_INT64_TENSOR_SIZE': { + node(NODE_LINUX_GPU) { + ws('workspace/build-gpu-int64') { + utils.init_git() + utils.docker_run('ubuntu_nightly_gpu', 'build_ubuntu_gpu_large_tensor', true) + utils.pack_lib('gpu_int64', mx_cmake_lib, true) + } + } } } @@ -59,6 +78,22 @@ core_logic: { } } }, + 'Test Large Tensor Size: CPU': { + node(NODE_LINUX_CPU) { + ws('workspace/large_tensor-cpu') { + utils.unpack_and_init('cpu_int64', mx_cmake_lib) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_large_tensor', false) + } + } + }, + 'Test Large Tensor Size: GPU': { + node(NODE_LINUX_GPU) { + ws('workspace/large_tensor-gpu') { + utils.unpack_and_init('gpu_int64', mx_cmake_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_test_large_tensor', true) + } + } + }, 'StraightDope: Python2 Single-GPU': { node(NODE_LINUX_GPU_P3) { ws('workspace/straight_dope-single_gpu') { diff --git a/tests/nightly/apache_rat_license_check/rat-excludes b/tests/nightly/apache_rat_license_check/rat-excludes index b3f64c246e6d..c08a2816045f 100755 --- a/tests/nightly/apache_rat_license_check/rat-excludes +++ b/tests/nightly/apache_rat_license_check/rat-excludes @@ -49,3 +49,4 @@ include/* .*.json.ref searchtools_custom.js theme.conf +LICENSE.binary.dependencies diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py index a627467cb959..1b7dad487a68 100644 --- a/tests/nightly/test_large_array.py +++ b/tests/nightly/test_large_array.py @@ -27,6 +27,7 @@ SMALL_Y = 50 LARGE_SIZE = LARGE_X * SMALL_Y + def test_gluon_embedding(): m = gluon.nn.Embedding(SMALL_Y, MEDIUM_X) m.initialize() @@ -35,22 +36,26 @@ def test_gluon_embedding(): assert b.shape == (MEDIUM_X, SMALL_Y, MEDIUM_X) assert b.asnumpy().size == LARGE_SIZE + def test_ndarray_zeros(): a = nd.zeros(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] == 0 assert a.shape == (LARGE_X, SMALL_Y) assert a.size == LARGE_SIZE + def test_ndarray_ones(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] == 1 assert nd.sum(a).asnumpy() == LARGE_SIZE + @with_seed() def test_ndarray_random_uniform(): a = nd.random.uniform(shape=(LARGE_X, SMALL_Y)) assert a[-1][0] != 0 + @with_seed() def test_ndarray_random_randint(): a = nd.random.randint(100, 10000, shape=(LARGE_X, SMALL_Y)) @@ -59,14 +64,16 @@ def test_ndarray_random_randint(): low_large_value = 2**32 high_large_value = 2**34 a = 
nd.random.randint(low_large_value,high_large_value) - low = mx.nd.array([low_large_value],dtype='int64') - high = mx.nd.array([high_large_value],dtype='int64') + low = mx.nd.array([low_large_value], dtype='int64') + high = mx.nd.array([high_large_value], dtype='int64') assert a.__gt__(low) & a.__lt__(high) + def test_ndarray_empty(): a = nd.empty((LARGE_X, SMALL_Y)) assert a.shape == (LARGE_X, SMALL_Y) + def test_elementwise(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(LARGE_X, SMALL_Y)) @@ -77,22 +84,26 @@ def test_elementwise(): res = nd.sqrt(a + 3) assert np.sum(res[-1].asnumpy() == 2) == a.shape[1] + def test_reduce(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) assert nd.sum(a).asnumpy() == a.shape[0] * a.shape[1] + def test_dot(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(SMALL_Y, SMALL_Y)) res = nd.dot(a, b) assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + def test_FullyConnected(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(SMALL_Y, SMALL_Y)) res = nd.FullyConnected(a, b, num_hidden=b.shape[1], no_bias=True) assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1] + def test_broadcast(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) @@ -101,45 +112,53 @@ def test_broadcast(): res = mx.nd.broadcast_like(b, a) assert np.sum(res[-1].asnumpy() == LARGE_X) == a.shape[1] + def test_clip(): a = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) b = nd.broadcast_to(a, shape=(a.shape[0], SMALL_Y)) res = nd.clip(b, a_min=100, a_max=1000) assert np.sum(res[-1].asnumpy() == 1000) == b.shape[1] + def test_take(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) idx = nd.arange(LARGE_X-1000, LARGE_X) res = nd.take(a, idx) assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + def test_slice(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) res = nd.slice(a, begin=(LARGE_X-1000, 1), end=(LARGE_X, SMALL_Y)) assert np.sum(res[-1].asnumpy() == 1) == res.shape[1] + def test_slice_assign(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) a[LARGE_X-1:LARGE_X] = 1000 assert np.sum(a[-1].asnumpy() == 1000) == a.shape[1] - + + def test_expand_dims(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) res = nd.expand_dims(a, axis=1) assert res.shape == (a.shape[0], 1, a.shape[1]) + def test_squeeze(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) data = nd.expand_dims(a, axis=1) res = nd.squeeze(data) assert res.shape == a.shape + def test_broadcast_div(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.ones(shape=(LARGE_X, 1)) * 2 res = a / b assert np.sum(res[-1].asnumpy() == 0.5) == a.shape[1] + def test_Dense(ctx=mx.cpu(0)): data = mx.nd.ones(shape=(50*1000*1000, 100)) linear = gluon.nn.Dense(100) @@ -148,6 +167,7 @@ def test_Dense(ctx=mx.cpu(0)): res.wait_to_read() assert res.shape == (50000000, 100) + def test_where(): a = nd.ones(shape=(LARGE_X, SMALL_Y)) b = nd.arange(0, LARGE_X).reshape(LARGE_X, 1) @@ -159,12 +179,14 @@ def test_where(): res = nd.sparse.where(csr_cond, a, b) assert np.sum(res[0].asnumpy() == 1) == b.shape[1] + def test_pick(): a = mx.nd.ones(shape=(256*35, 1024*1024)) b = mx.nd.ones(shape=(256*35,)) res = mx.nd.pick(a,b) assert res.shape == b.shape + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 3bf7bcb06751..ea6255465bb1 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -48,6 +48,7 @@ set_default_context(mx.gpu(0)) del test_support_vector_machine_l1_svm # noqa del test_support_vector_machine_l2_svm # 
noqa +del test_custom_op_fork #noqa def check_countsketch(in_dim,out_dim,n): @@ -1963,19 +1964,21 @@ def check_proposal_consistency(op, batch_size, with_nms=False): # The following 2 functions launch 0-thread kernels, an error that should be caught and signaled. def kernel_error_check_imperative(): os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine' - a = mx.nd.array([1,2,3],ctx=mx.gpu(0)) - b = mx.nd.array([],ctx=mx.gpu(0)) - c = (a / b).asnumpy() + with mx.np_compat(active=True): + a = mx.nd.array([1,2,3],ctx=mx.gpu(0)) + b = mx.nd.array([],ctx=mx.gpu(0)) + c = (a / b).asnumpy() def kernel_error_check_symbolic(): os.environ['MXNET_ENGINE_TYPE'] = 'NaiveEngine' - a = mx.sym.Variable('a') - b = mx.sym.Variable('b') - c = a / b - f = c.bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)), - 'b':mx.nd.array([],ctx=mx.gpu(0))}) - f.forward() - g = f.outputs[0].asnumpy() + with mx.np_compat(active=True): + a = mx.sym.Variable('a') + b = mx.sym.Variable('b') + c = a / b + f = c.bind(mx.gpu(0), { 'a':mx.nd.array([1,2,3],ctx=mx.gpu(0)), + 'b':mx.nd.array([],ctx=mx.gpu(0))}) + f.forward() + g = f.outputs[0].asnumpy() def test_kernel_error_checking(): # Running tests that may throw exceptions out of worker threads will stop CI testing diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 757df81e1607..a65a9e7180e2 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -63,19 +63,45 @@ def test_quantize_float32_to_int8(): @with_seed() def test_dequantize_int8_to_float32(): + + def get_test_data(real_range, qdata_np): + qdata = mx.nd.array(qdata_np, dtype=np.int8) + min_range = mx.nd.array([-real_range], dtype=np.float32) + max_range = mx.nd.array([real_range], dtype=np.float32) + return qdata, min_range, max_range + + def baseline_dequantization(qdata, real_range, qdata_np): + quantized_range = 127.0 + scale = real_range / quantized_range + data_np = qdata_np * scale + return data_np + + def test_nd_array_dequantization(qdata, min_range, max_range, expected_result): + data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32') + assert data.dtype == np.float32 + assert_almost_equal(data.asnumpy(), expected_result) + + def test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result): + sym_data = mx.sym.Variable('data') + sym_min_range = mx.sym.Variable('min_range') + sym_max_range = mx.sym.Variable('max_range') + dequant = mx.sym.contrib.dequantize(sym_data, sym_min_range, + sym_max_range, out_type='float32') + out = dequant.bind(ctx=mx.current_context(), + args={'data':qdata, 'min_range':min_range, 'max_range':max_range}) + data = out.forward()[0] + assert data.dtype == np.float32 + assert_almost_equal(data.asnumpy(), expected_result) + + real_range = 402.3347 shape = rand_shape_nd(4) qdata_np = np.random.uniform(low=-127, high=127, size=shape).astype(dtype=np.int8) - qdata = mx.nd.array(qdata_np, dtype=np.int8) - real_range = 402.3347 - min_range = mx.nd.array([-real_range], dtype=np.float32) - max_range = mx.nd.array([real_range], dtype=np.float32) - data = mx.nd.contrib.dequantize(qdata, min_range, max_range, out_type='float32') - quantized_range = 127.0 - scale = real_range / quantized_range - assert data.dtype == np.float32 - data_np = qdata_np * scale - assert_almost_equal(data.asnumpy(), data_np) - + qdata, min_range, max_range = get_test_data(real_range, qdata_np) + expected_result = baseline_dequantization(qdata, real_range, 
qdata_np) + # test nd array implementation. + test_nd_array_dequantization(qdata, min_range, max_range, expected_result) + # test symbolic api implementation. + test_symbolic_api_dequantization(qdata, min_range, max_range, expected_result) @with_seed() def test_requantize_int32_to_int8(): @@ -115,7 +141,8 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range) else: qdata_int8, min_output, max_output = mx.nd.contrib.requantize(qdata, min_range, max_range, - min_calib_range, max_calib_range) + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), max_range.asscalar(), @@ -125,6 +152,41 @@ def check_requantize(shape, min_calib_range=None, max_calib_range=None): assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) + def check_requantize_with_symbol(shape, min_calib_range=None, max_calib_range=None): + qdata = mx.nd.random.uniform(low=-1000.0, high=1000.0, shape=shape).astype('int32') + min_range = mx.nd.array([-1010.0]) + max_range = mx.nd.array([1020.0]) + sym_data = mx.sym.Variable('data') + sym_min_range = mx.sym.Variable('min_range') + sym_max_range = mx.sym.Variable('max_range') + if min_calib_range is None or max_calib_range is None: + requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range) + out = requant.bind(ctx=mx.current_context(), + args={'data':qdata, 'min_range':min_range, + 'max_range':max_range}) + qdata_int8, min_output, max_output = out.forward() + else: + requant = mx.sym.contrib.requantize(sym_data, sym_min_range, sym_max_range, + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) + out = requant.bind(ctx=mx.current_context(), args={'data':qdata, 'min_range':min_range, + 'max_range':max_range}) + qdata_int8, min_output, max_output = out.forward() + + qdata_int8_np, min_output_np, max_output_np = requantize_baseline(qdata.asnumpy(), min_range.asscalar(), + max_range.asscalar(), + min_calib_range=min_calib_range, + max_calib_range=max_calib_range) + assert_almost_equal(qdata_int8.asnumpy(), qdata_int8_np) + assert_almost_equal(min_output.asnumpy(), np.array([min_output_np])) + assert_almost_equal(max_output.asnumpy(), np.array([max_output_np])) + + # test with symbol API.
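Editor's aside (not part of the patch): the symbol-API checks are deliberately run against the same requantize_baseline as the imperative mx.nd.contrib.requantize checks, since both paths dispatch to the same kernel and should agree exactly. As a hedged sketch of the idea behind requantization, not MXNet's exact formula:

import numpy as np

def requantize_sketch(q32, in_real_range, out_real_range):
    # rescale int32 values quantized against [-in_real_range, in_real_range]
    # onto the int8 grid representing [-out_real_range, out_real_range]
    scale = (in_real_range / float(np.iinfo(np.int32).max)) * (127.0 / out_real_range)
    return np.clip(np.round(q32 * scale), -127, 127).astype(np.int8)

The calls driving the new symbol-API checks follow.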
+ check_requantize_with_symbol((3, 4, 10, 10)) + check_requantize_with_symbol((32, 3, 23, 23)) + check_requantize_with_symbol((3, 4, 10, 10), min_calib_range=-1050.0, max_calib_range=1040.0) + check_requantize_with_symbol((32, 3, 23, 23), min_calib_range=-134.349, max_calib_range=523.43) + # Test with nd array API check_requantize((3, 4, 10, 10)) check_requantize((32, 3, 23, 23)) check_requantize((3, 4, 10, 10), min_calib_range=-1050.0, max_calib_range=1040.0) @@ -414,6 +476,57 @@ def check_quantized_flatten(shape, qdtype): check_quantized_flatten((10, 15, 18), qdtype) check_quantized_flatten((3, 4, 23, 23), qdtype) +@with_seed() +def test_quantized_act(): + def check_quantized_act(data_shape, qdtype): + if is_test_for_native_cpu(): + print('skipped testing quantized_act for native cpu since it is not supported yet') + return + elif qdtype == 'int8' and is_test_for_mkldnn(): + print('skipped testing quantized_act for mkldnn cpu int8 since it is not supported yet') + return + elif is_test_for_gpu(): + print('skipped testing quantized_act for gpu since it is not supported yet') + return + data = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') + act_fp32 = mx.sym.Activation(data=data, act_type='relu', name='relu') + arg_shapes, _, _ = act_fp32.infer_shape(data=data_shape) + arg_names = act_fp32.list_arguments() + act_fp32_exe = act_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') + if qdtype == 'uint8': + data_low = 0.0 + data_high = 127.0 + else: + data_low = -127.0 + data_high = 127.0 + + act_fp32_exe.arg_dict[arg_names[0]][:] = mx.nd.random.uniform(low=data_low, + high=data_high, shape=data_shape).astype(qdtype) + output = act_fp32_exe.forward()[0] + + qdata = mx.sym.Variable(name='qdata', shape=data_shape, dtype=qdtype) + min_data = mx.sym.Variable(name='min_data') + max_data = mx.sym.Variable(name='max_data') + quantized_act = mx.sym.contrib.quantized_act(data=qdata, min_data=min_data, max_data=max_data, act_type='relu') + act_int8_exe = quantized_act.simple_bind(ctx=mx.current_context(), grad_req='null') + qarg_names = quantized_act.list_arguments() + + act_int8_exe.arg_dict[qarg_names[0]][:] = act_fp32_exe.arg_dict[arg_names[0]].astype(qdtype) + quantized_range_min = mx.nd.min(act_int8_exe.arg_dict[qarg_names[0]][:]) + quantized_range_max = mx.nd.max(act_int8_exe.arg_dict[qarg_names[0]][:]) + act_int8_exe.arg_dict[qarg_names[1]][:] = quantized_range_min.astype(qdtype) + act_int8_exe.arg_dict[qarg_names[2]][:] = quantized_range_max.astype(qdtype) + qoutput, min_range, max_range = act_int8_exe.forward() + + assert_almost_equal(output.asnumpy(), qoutput.asnumpy()) + assert_almost_equal(min_range.asscalar(), quantized_range_min.asscalar()) + assert_almost_equal(max_range.asscalar(), quantized_range_max.asscalar()) + + for qdtype in ['int8', 'uint8']: + check_quantized_act((10,), qdtype) + check_quantized_act((10, 15), qdtype) + check_quantized_act((10, 15, 18), qdtype) + check_quantized_act((3, 4, 23, 23), qdtype) @with_seed() def test_quantize_params(): @@ -634,7 +747,9 @@ def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape, label_shape): arg_params, aux_params = mod.get_params() excluded_names = [] if mx.current_context() == mx.cpu(): - excluded_names += ['fc'] + excluded_names += ['fc', 'conv1'] + if mx.current_context() == mx.gpu(): + excluded_names += ['relu0', 'relu1'] excluded_names += ['concat'] optional_names = ['pool0'] diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py index 
38aeb99c2d89..813a6b092b9c 100644 --- a/tests/python/unittest/test_contrib_operator.py +++ b/tests/python/unittest/test_contrib_operator.py @@ -293,7 +293,7 @@ def f(x, a, b, c): b = np.random.random_sample() c = np.random.random_sample() m = np.random.random_sample() - 0.5 - + data = mx.symbol.Variable('data') quad_sym = mx.sym.contrib.quadratic(data=data, a=a, b=b, c=c) gr_q_sym = mx.sym.contrib.gradientmultiplier(quad_sym, scalar=m) @@ -320,6 +320,18 @@ def f(x, a, b, c): [backward_expected], rtol=1e-2 if dtype is np.float16 else 1e-5, atol=1e-2 if dtype is np.float16 else 1e-5) +def test_multibox_prior_op(): + h = 561 + w = 728 + X = mx.nd.random.uniform(shape=(1, 3, h, w)) + Y = mx.contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[1, 2, 0.5]) + assert_array_equal(Y.shape, np.array((1, 2042040, 4))) + boxes = Y.reshape((h, w, 5, 4)) + assert_allclose(boxes.asnumpy()[250, 250, 0, :], np.array([0.055117, 0.071524, 0.63307 , 0.821524]), atol=1e-5, rtol=1e-5) + # relax first ratio if user insists + Y = mx.contrib.nd.MultiBoxPrior(X, sizes=[0.75, 0.5, 0.25], ratios=[20, 2, 0.5]) + boxes = Y.reshape((h, w, 5, 4)) + assert_allclose(boxes.asnumpy()[250, 250, 0, :], np.array([-0.948249, 0.362671, 1.636436, 0.530377]), atol=1e-5, rtol=1e-5) if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_exc_handling.py b/tests/python/unittest/test_exc_handling.py index 60799f821b8e..5627ac50d26e 100644 --- a/tests/python/unittest/test_exc_handling.py +++ b/tests/python/unittest/test_exc_handling.py @@ -165,6 +165,22 @@ def test_multiple_waitalls(): assert caught, "No exception thrown" mx.nd.waitall() +@with_seed() +def test_exc_profiler(): + def run_training_iteration(data): + output = net(data) + + net = gluon.nn.HybridSequential() + with net.name_scope(): + net.add(gluon.nn.Dense(10)) + + ctx = default_context() + net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) + data = mx.nd.ones((3, 4)) + mx.profiler.set_state("run") + run_training_iteration(data) + mx.nd.waitall() + mx.profiler.set_state("stop") if __name__ == '__main__': diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 8c60ef6745f1..efa04f4fa47a 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -1180,7 +1180,7 @@ def swish_test(x): elu = mx.gluon.nn.ELU() def elu_test(x): def elu(x): - return 1.0 * (mx.nd.exp(x) - 1) if x < 0 else x + return mx.nd.expm1(x) if x <= 0.0 else x return [elu(x_i) for x_i in x] for test_point, ref_point in zip(elu_test(point_to_validate), elu(point_to_validate)): diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index b410362c8fd1..9d7892010839 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -22,8 +22,7 @@ from numpy.testing import assert_allclose import unittest from mxnet.test_utils import almost_equal, assert_almost_equal -from common import assert_raises_cudnn_not_satisfied - +from common import assert_raises_cudnn_not_satisfied, with_seed def test_rnn(): cell = gluon.rnn.RNNCell(100, prefix='rnn_') @@ -244,6 +243,7 @@ def test_bidirectional(): @assert_raises_cudnn_not_satisfied(min_version='5.1.10') +@with_seed() def test_layer_bidirectional(): class RefBiLSTM(gluon.Block): def __init__(self, size, **kwargs): @@ -279,7 +279,7 @@ def forward(self, inpt): ref_net_params[k.replace('l0', 'l0l0').replace('r0', 'r0l0')].set_data(weights[k]) data = mx.random.uniform(shape=(11, 10, in_size)) - 
assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy()) + assert_allclose(net(data).asnumpy(), ref_net(data).asnumpy(), rtol=1e-04, atol=1e-02) diff --git a/tests/python/unittest/test_infer_shape.py b/tests/python/unittest/test_infer_shape.py index 73654a604135..612861bd8303 100644 --- a/tests/python/unittest/test_infer_shape.py +++ b/tests/python/unittest/test_infer_shape.py @@ -147,6 +147,21 @@ def test_fc_infer_type(): assert arg_type_dict[k] == v +def test_shape_completely_unknown(): + data = mx.sym.var("data") + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] == () + assert out_shapes[0] == () + + with mx.np_compat(): + data = mx.sym.var("data") + ret = mx.sym.sin(data) + arg_shapes, out_shapes, _ = ret.infer_shape_partial() + assert arg_shapes[0] is None + assert out_shapes[0] is None + + if __name__ == "__main__": test_mlp2_infer_shape() test_mlp2_infer_error() @@ -156,3 +171,4 @@ def test_fc_infer_type(): test_incomplete_infer_slicechannel() test_incomplete_infer_convolution() test_incomplete_infer_concat() + test_shape_completely_unknown() diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index 36c1993bf0ff..c82afdfe033a 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -931,6 +931,34 @@ def test_module_update_no_pragram(): mod.update() assert(mod.get_outputs()[0].shape == data_shape) + +def test_module_init_optimizer(): + def get_module_idx2name(mod): + idx2name = {} + idx2name.update(enumerate(mod._exec_group.param_names)) + return idx2name + + data = mx.sym.Variable('data') + sym = mx.sym.FullyConnected(data, num_hidden=20, name='fc') + batch_size = 8 + opt_params = {'learning_rate': 1, 'rescale_grad': 1.0 / batch_size} + + # Pass an optimizer str + mod1 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0)) + mod1.bind(data_shapes=[('data', (batch_size, 20))]) + mod1.init_params() + mod1.init_optimizer(optimizer='sgd', optimizer_params=opt_params) + assert mod1._optimizer.idx2name == get_module_idx2name(mod1) + + # Pass an Optimizer object + mod2 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0)) + mod2.bind(data_shapes=[('data', (batch_size, 20))]) + mod2.init_params() + opt = mx.optimizer.SGD(**opt_params) + mod2.init_optimizer(optimizer=opt) + assert mod2._optimizer.idx2name == get_module_idx2name(mod2) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 2446107ad466..374050668612 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -122,7 +122,11 @@ def test_ndarray_setitem(): # numpy assignment for empty axis for trivial_shape in [(), (1,), (1, 1), (1, 1, 1)]: - x = mx.nd.zeros(trivial_shape) + if trivial_shape == tuple(): + with mx.np_compat(): + x = mx.nd.zeros(trivial_shape) + else: + x = mx.nd.zeros(trivial_shape) x[:] = np.ones(trivial_shape) x_np = np.ones(trivial_shape, dtype=x.dtype) assert x.shape == trivial_shape @@ -1649,6 +1653,37 @@ def test_ndarray_nan_comparison(): for i in (np.isnan(data1_grad))[1][0].flatten(): assert i == True + +def test_zero_from_numpy(): + # Test zero_copy + arrays = [ + # ordinary numpy array + np.array([[1, 2], [3, 4], [5, 6]], dtype="float32"), + # 0-dim + np.array((1, )).reshape(()), + # 0-size + np.array(()).reshape((1, 0, 2)), + ] + for zero_copy in [False, True]: + for np_array in arrays: + mx_array = mx.nd.from_numpy(np_array, 
zero_copy=zero_copy) + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + np_array = arrays[0] + mx_array = mx.nd.from_numpy(np_array) + np_array[2, 1] = 0 + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + mx_array[2, 1] = 100 + mx.test_utils.assert_almost_equal(np_array, mx_array.asnumpy()) + np_array = np.array([[1, 2], [3, 4], [5, 6]]).transpose() + assert not np_array.flags["C_CONTIGUOUS"] + try: + mx_array = mx.nd.from_numpy(np_array) + except ValueError: + pass + else: + assert False + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index e2f519e62b95..6b4d4d395f29 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -29,6 +29,8 @@ from mxnet.test_utils import * from mxnet.base import py_str, MXNetError, _as_list from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, assertRaises +from common import run_in_spawned_process +from nose.tools import assert_raises import unittest import os @@ -884,7 +886,7 @@ def fgelu_grad(grad, x, y): y = mx.sym.LeakyReLU(data=x, act_type="gelu") for dtype in [np.float16, np.float32, np.float64]: xa = np.random.uniform(low=-0.1,high=0.1,size=shape).astype(dtype) - eps, rtol, atol = (7.5e-4, 1e-1, 1e-2) if dtype is np.float16 else (1e-4, 1e-2, 1e-4) + eps, rtol, atol = (7.5e-4, 2e-2, 1e-3) if dtype is np.float16 else (1e-4, 1e-3, 1e-5) if dtype is np.float16: xa /= 10.0 xa[abs(xa) < eps] = 0.01 @@ -1524,17 +1526,32 @@ def check_nearest_upsampling_with_shape(shapes, scale, root_scale): assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), arr_grad[name].asnumpy(), rtol=1e-4) -def check_bilinear_upsampling_with_shape(shapes, scale, root_scale): - arr = {'arg_%d'%i: mx.random.uniform(-10.0, 10.0, shape, ctx=mx.cpu()).copyto(default_context()) for i, shape in zip(range(len(shapes)), shapes)} - arr_grad = {'arg_%d'%i: mx.nd.zeros(shape) for i, shape in zip(range(len(shapes)), shapes)} - - up = mx.sym.UpSampling(*[mx.sym.Variable('arg_%d'%i) for i in range(len(shapes))], sample_type='bilinear', scale=root_scale) +def check_bilinear_upsampling_with_shape(data_shape, weight_shape, scale, root_scale, num_filter): + def _init_bilinear(arr, f): + weight = np.zeros(np.prod(arr.shape), dtype='float32') + shape = arr.shape + c = (2 * f - 1 - f % 2) / (2. 
* f) + for i in range(np.prod(shape)): + x = i % shape[3] + y = (i // shape[3]) % shape[2] + weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) + arr[:] = weight.reshape(shape) + return arr + + up = mx.sym.UpSampling(mx.sym.Variable("data"), + mx.sym.Variable('weight'), sample_type='bilinear', scale=root_scale, + num_filter=num_filter, num_args=2) + arg_shapes, out_shapes, _ = up.infer_shape(data=data_shape) + arr = {'data': mx.random.uniform(-5, 5, data_shape, ctx=mx.cpu()).copyto(default_context()), + 'weight': mx.nd.array(_init_bilinear(mx.ndarray.empty(arg_shapes[1]).asnumpy(), root_scale))} + + arr_grad = [mx.nd.empty(s) for s in arg_shapes] exe = up.bind(default_context(), args=arr, args_grad=arr_grad) exe.forward(is_train=True) + out = exe.outputs[0].asnumpy() exe.backward(exe.outputs) - for k in range(len(shapes)): - name = 'arg_%d'%k - assert_allclose(arr[name].asnumpy()*root_scale**2*scale**(2*k), arr_grad[name].asnumpy(), rtol=1e-4) + target_shape = (data_shape[2] * root_scale, data_shape[3] * root_scale) + assert out.shape == data_shape[:2] + target_shape @with_seed() @@ -1547,6 +1564,22 @@ def test_nearest_upsampling(): check_nearest_upsampling_with_shape(shapes, scale, root_scale) +@with_seed() +def test_bilinear_upsampling(): + rootscale = [2,3] + scales = [1,2,3] + filters = [1,2,3] + bases = [1,2,3] + for params in itertools.product(rootscale, scales, filters, bases): + root_scale, scale, num_filter, base = params + # bilinear upsampling takes only 1 data and 1 weight + # multi input mode is not applicable + dimension = base*root_scale*scale + kernel = 2 * root_scale - root_scale % 2 + data_shape = (1, num_filter, dimension, dimension) + weight_shape = (1, num_filter, kernel, kernel) + check_bilinear_upsampling_with_shape(data_shape, weight_shape, scale, root_scale, num_filter) + @with_seed() def test_batchnorm_training(): def check_batchnorm_training(stype): @@ -3384,41 +3417,55 @@ def l2norm(input_data, axis=0, keepdims=True): ctx = default_context() data = mx.symbol.Variable('data') in_data_dim = random_sample([4,5,6], 1)[0] - in_shape = rand_shape_nd(in_data_dim) + in_shape = rand_shape_nd(in_data_dim, dim=5) epsilon = 1e-3 + acc_type = {np.float16: np.float32, np.float32: np.float32, np.float64: np.float64, + np.int32: np.int32, np.int64: np.int64} + is_windows = sys.platform.startswith('win') for order in [1, 2]: - for dtype in [np.float16, np.float32, np.float64]: - in_data = np.random.uniform(-1, 1, in_shape).astype(dtype) - in_data[abs(in_data) < epsilon] = 2 * epsilon + for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]: for i in range(in_data_dim): - norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, keepdims=True) - npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i) - npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out - check_symbolic_forward(norm_sym, [in_data], [npy_out], - rtol=1e-2 if dtype is np.float16 else 1e-5, - atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) - check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)], - [npy_out_backward], - rtol=1e-2 if dtype is np.float16 else 1e-5, - atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) - # Disable numeric gradient /~https://github.com/apache/incubator-mxnet/issues/11509 - # # check gradient - # if dtype is not np.float16: - # check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3) - if i < in_data_dim-1: - norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True) - 
npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1)) + for out_dtype in ['float32', 'float64', 'int32', 'int64']: + if (dtype == np.int32 or dtype == np.int64) and ('int' not in out_dtype or is_windows): + continue + if dtype != np.int32 and dtype != np.int64 and 'int' in out_dtype: + continue + backward_dtype = np.float32 if out_dtype == 'float32' else np.float64 + skip_backward = 'int' in out_dtype + print(order, dtype, i, out_dtype, in_shape) + in_data = np.random.uniform(-1, 1, in_shape).astype(acc_type[dtype]) + in_data[abs(in_data) < epsilon] = 2 * epsilon + norm_sym = mx.symbol.norm(data=data, ord=order, axis=i, out_dtype=out_dtype, keepdims=True) + npy_out = l1norm(in_data, i) if order is 1 else l2norm(in_data, i) npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out - check_symbolic_forward(norm_sym, [in_data], [npy_out], - rtol=1e-2 if dtype is np.float16 else 1e-5, - atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) - check_symbolic_backward(norm_sym, [in_data], [np.ones(npy_out.shape)], - [npy_out_backward], - rtol=1e-2 if dtype is np.float16 else 1e-5, - atol=1e-2 if dtype is np.float16 else 1e-5, ctx=ctx) - # # check gradient - # if dtype is not np.float16: - # check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, rtol=1e-1, atol=1e-3) + check_symbolic_forward(norm_sym, [in_data.astype(dtype)], [npy_out.astype(out_dtype)], + rtol=1e-3, atol=1e-5, ctx=ctx) + if not skip_backward: + check_symbolic_backward(norm_sym, [in_data.astype(dtype)], + [np.ones(npy_out.shape).astype(out_dtype)], + [npy_out_backward], rtol=1e-3, atol=1e-5, ctx=ctx, + dtype=backward_dtype) + # Disable numeric gradient /~https://github.com/apache/incubator-mxnet/issues/11509 + # check gradient + if dtype is not np.float16 and not skip_backward: + check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, + rtol=1e-1, atol=1e-3, dtype=backward_dtype) + if i < in_data_dim-1: + norm_sym = mx.symbol.norm(data=data, ord=order, axis=(i, i+1), keepdims=True) + npy_out = l1norm(in_data, (i, i+1)) if order is 1 else l2norm(in_data, (i, i+1)) + npy_out_backward = np.sign(in_data) if order is 1 else in_data/npy_out + check_symbolic_forward(norm_sym, [in_data], [npy_out.astype(dtype)], + rtol=1e-3 if dtype is np.float16 else 1e-3, + atol=1e-5 if dtype is np.float16 else 1e-5, ctx=ctx) + if not skip_backward: + check_symbolic_backward(norm_sym, [in_data], + [np.ones(npy_out.shape).astype(out_dtype)], + [npy_out_backward.astype(out_dtype)], + rtol=1e-3, atol=1e-5, ctx=ctx, dtype=backward_dtype) + # check gradient + if dtype is not np.float16 and not skip_backward: + check_numeric_gradient(norm_sym, [in_data], numeric_eps=epsilon, + rtol=1e-1, atol=1e-3, dtype=backward_dtype) def test_layer_norm(): @@ -4398,7 +4445,8 @@ def test_invalid_reps(): assert_exception(mx.nd.tile, MXNetError, data, (1, 0, 3)) test_normal_case() - test_empty_tensor() + with mx.np_compat(): + test_empty_tensor() test_empty_reps() test_tile_backward() test_tile_numeric_gradient() @@ -4458,7 +4506,8 @@ def test_zero_depth(): test_normal_case(index_type=np.float64) test_normal_case(index_type=np.float32) test_normal_case(index_type=np.float16) - test_empty_indices() + with mx.np_compat(): + test_empty_indices() test_zero_depth() @@ -5353,31 +5402,34 @@ def create_operator(self, ctx, shapes, dtypes): x = mx.nd.Custom(length=10, depth=10, op_type="no_input_op") assert_almost_equal(x.asnumpy(), np.ones(shape=(10, 10), dtype=np.float32)) + +@with_seed() +def test_custom_op_fork(): # test 
custom operator fork # see /~https://github.com/apache/incubator-mxnet/issues/14396 - if not sys.platform.startswith('win'): # no fork in windows - class AdditionOP(mx.operator.CustomOp): - def __init__(self): - super(AdditionOP, self).__init__() - def forward(self, is_train, req, in_data, out_data, aux): - out_data[0][:] = in_data[0] + in_data[1] - def backward(self, req, out_grad, in_data, out_data, in_grad, aux): - in_grad[0][:] = out_grad[0] - in_grad[1][:] = out_grad[0] - - @mx.operator.register("AdditionOP") - class AdditionOPProp(mx.operator.CustomOpProp): - def __init__(self): - super(AdditionOPProp, self).__init__() - def list_arguments(self): - return ['a', 'b'] - def list_outputs(self): - return ['output'] - def infer_shape(self, in_shape): - return in_shape, [in_shape[0]] - def create_operator(self, ctx, shapes, dtypes): - return AdditionOP() + class AdditionOP(mx.operator.CustomOp): + def __init__(self): + super(AdditionOP, self).__init__() + def forward(self, is_train, req, in_data, out_data, aux): + out_data[0][:] = in_data[0] + in_data[1] + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + in_grad[0][:] = out_grad[0] + in_grad[1][:] = out_grad[0] + + @mx.operator.register("AdditionOP") + class AdditionOPProp(mx.operator.CustomOpProp): + def __init__(self): + super(AdditionOPProp, self).__init__() + def list_arguments(self): + return ['a', 'b'] + def list_outputs(self): + return ['output'] + def infer_shape(self, in_shape): + return in_shape, [in_shape[0]] + def create_operator(self, ctx, shapes, dtypes): + return AdditionOP() + if not sys.platform.startswith('win'): # no fork in windows def custom_add(): a = mx.nd.array([1, 2, 3]) b = mx.nd.array([4, 5, 6]) @@ -5390,7 +5442,90 @@ def custom_add(): p.daemon = True p.start() p.join(5) - assert not p.is_alive(), "deadlock may exist in custom operator" + assert not p.is_alive() and p.exitcode == 0 + + +def _build_dot_custom(fun_forward, name): + class Dot(mx.operator.CustomOp): + def __init__(self): + super(Dot, self).__init__() + def forward(self, is_train, req, in_data, out_data, aux): + fun_forward(in_data, out_data) + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + pass + + @mx.operator.register(name) + class DotProp(mx.operator.CustomOpProp): + def __init__(self): + super(DotProp, self).__init__() + def list_arguments(self): + return ['a', 'b'] + def list_outputs(self): + return ['output'] + def infer_shape(self, in_shape): + return in_shape, [(in_shape[0][0], in_shape[1][1])] + def create_operator(self, ctx, shapes, dtypes): + return Dot() + +def _custom_exc3(seed): + def custom_exc3(): + def f(in_data, out_data): + out_data[0][:] = mx.nd.dot(in_data[0], in_data[1]) + out_data[0].wait_to_read() + _build_dot_custom(f, 'Dot3') + n = int(1e8) + a = mx.nd.zeros((n, 1)) + b = mx.nd.zeros((1, n)) + # trigger OOM + c = mx.nd.Custom(a, b, op_type='Dot3') + c.wait_to_read() + assert_raises(MXNetError, custom_exc3) + +def _custom_exc4(seed): + def custom_exc4(): + def f(in_data, out_data): + out_data[0][:] = mx.nd.dot(in_data[0], in_data[1]) + _build_dot_custom(f, 'Dot4') + n = int(1e8) + a = mx.nd.zeros((n, 1)) + b = mx.nd.zeros((1, n)) + # trigger OOM + c = mx.nd.Custom(a, b, op_type='Dot4') + c.wait_to_read() + assert_raises(MXNetError, custom_exc4) + +@with_seed() +def test_custom_op_exc(): + # test except handling + # see /~https://github.com/apache/incubator-mxnet/pull/14693 + # 1. 
+@with_seed() +def test_custom_op_exc(): + # test exception handling + # see /~https://github.com/apache/incubator-mxnet/pull/14693 + # 1. error in python code + def custom_exc1(): + def f(in_data, out_data): + assert False + out_data[0][:] = mx.nd.dot(in_data[0], in_data[1]) + _build_dot_custom(f, 'Dot1') + a = mx.nd.zeros((4, 1)) + b = mx.nd.zeros((1, 4)) + c = mx.nd.Custom(a, b, op_type='Dot1') + c.wait_to_read() + assert_raises(MXNetError, custom_exc1) + + # 2. error in pushing operator to engine + def custom_exc2(): + def f(in_data, out_data): + out_data[0][:] = mx.nd.dot(in_data[0], in_data[1]) + _build_dot_custom(f, 'Dot2') + a = mx.nd.zeros((4, 2)) + b = mx.nd.zeros((1, 4)) + # trigger error by invalid input shapes of operands + c = mx.nd.Custom(a, b, op_type='Dot2') + c.wait_to_read() + assert_raises(MXNetError, custom_exc2) + + # 3. error in real execution + run_in_spawned_process(_custom_exc3, {}) + run_in_spawned_process(_custom_exc4, {}) + @with_seed() def test_psroipooling(): @@ -6843,6 +6978,20 @@ def check_slice_axis_partial_infer(data, axis, begin, end, expected_out_shape): check_slice_axis_partial_infer(var1, 0, 0, 5, (5, 0)) check_slice_axis_partial_infer(var1, 1, 0, 5, (10, 0)) + with mx.np_compat(): + var1 = mx.sym.var(name="data", shape=(-1, 20)) + check_slice_partial_infer(var1, (None, None), (None, 10), [], (-1, 10)) + check_slice_partial_infer(var1, (None, None), (None, 10), (None, 2), (-1, 5)) + check_slice_partial_infer(var1, (None, 3), (None, 10), [], (-1, 7)) + check_slice_partial_infer(var1, (None, 3), (5, 10), [], (-1, 7)) + check_slice_partial_infer(var1, (2, 3), (None, 10), [], (-1, 7)) + check_slice_partial_infer(var1, (2, 3), (None, 10), (None, 1), (-1, 7)) + check_slice_partial_infer(var1, (2, 3), (None, 10), (3, 3), (-1, 3)) + + var1 = mx.sym.var(name='data', shape=(10, -1)) + check_slice_axis_partial_infer(var1, 0, 0, 5, (5, -1)) + check_slice_axis_partial_infer(var1, 1, 0, 5, (10, -1)) + @with_seed() def test_float16_min_max(): @@ -6853,6 +7002,21 @@ def test_float16_min_max(): assert np.finfo('float16').max == mx.nd.max(a).asscalar() + +@with_seed() +@mx.use_np_compat +def test_zero_size_min_max(): + def min(): + a = mx.nd.zeros(shape=(5, 0)) + a.min() + + def max(): + a = mx.nd.zeros(shape=(5, 0)) + a.max() + + assert_raises(MXNetError, min) + assert_raises(MXNetError, max) + + @with_seed() def test_squeeze_op(): def check_squeeze_op(shape, axis=None): @@ -7916,6 +8080,74 @@ def test_index_array_select_axes(): check_symbolic_backward(index_array, [np.ones(shape)], [np.ones(shape)], [np.zeros(shape)]) + +@with_seed() +def test_scalar_tensor_creation(): + assertRaises(MXNetError, mx.nd.zeros, shape=()) + assertRaises(MXNetError, mx.nd.ones, shape=()) + with mx.np_compat(): + data_mx = mx.nd.ones(shape=()) + data_np = np.ones((), dtype=data_mx.dtype) + assert same(data_mx.asnumpy(), data_np) + + +@with_seed() +def test_zero_size_tensor_creation(): + assertRaises(MXNetError, mx.nd.zeros, shape=(0, 1, 3, 0)) + assertRaises(MXNetError, mx.nd.ones, shape=(0, 1, 3, 0)) + with mx.np_compat(): + data_mx = mx.nd.ones(shape=(0, 1, 0, 4)) + data_np = np.ones(shape=data_mx.shape, dtype=data_mx.dtype) + assert same(data_mx.asnumpy(), data_np) + + +@with_seed() +def test_concat_with_zero_size_tensor(): + with mx.np_compat(): + data1 = mx.nd.ones((0, 8, 12)) + data2 = mx.nd.ones((3, 8, 12)) + data3 = mx.nd.ones((0, 8, 12)) + ret = mx.nd.Concat(data1, data2, data3, dim=0) + assert ret.shape == (3, 8, 12) + + data1 = mx.nd.ones((0, 3, 10)) + data2 = mx.nd.ones((0, 4, 10)) + data3 = mx.nd.ones((0, 5, 10)) + ret = mx.nd.Concat(data1, data2, data3, dim=1) + assert ret.shape == (0, 12, 10) + +
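All of the new shape tests above gate the numpy-compatible semantics behind mx.np_compat (scope) or @mx.use_np_compat (decorator). A minimal sketch of the scoping behaviour they rely on, using only the APIs introduced by this change set:

    import mxnet as mx

    # outside any scope, legacy semantics apply and zero-size shapes raise MXNetError
    with mx.np_compat(active=True):
        x = mx.nd.ones(shape=(0, 4))   # allowed while the scope is active
        assert x.shape == (0, 4)
    # leaving the scope restores the previous behaviour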
+@with_seed() +def test_np_compat_decorator(): + @mx.use_np_compat + def check_scalar_one(): + """Generate scalar one tensor""" + return mx.nd.ones(shape=()) + assert check_scalar_one.__name__ == "check_scalar_one" + assert check_scalar_one.__doc__ == "Generate scalar one tensor" + assert check_scalar_one().shape == () + for active in [True, False]: + with mx.np_compat(active=active): + assert check_scalar_one.__name__ == "check_scalar_one" + assert check_scalar_one.__doc__ == "Generate scalar one tensor" + assert check_scalar_one().shape == () + + @mx.use_np_compat + def check_concat(shape1, shape2, axis): + data1 = mx.nd.ones(shape1) + data2 = mx.nd.ones(shape2) + ret = mx.nd.Concat(data1, data2, dim=axis) + expected_ret = np.concatenate((data1.asnumpy(), data2.asnumpy()), axis=axis) + assert ret.shape == expected_ret.shape + + check_concat((0, 3, 4), (5, 3, 4), 0) + check_concat((8, 0, 5), (8, 7, 5), 1) + check_concat((8, 0, 0), (8, 0, 0), 2) + for active in [True, False]: + with mx.np_compat(active=active): + check_concat((0, 3, 4), (5, 3, 4), 0) + check_concat((8, 0, 5), (8, 7, 5), 1) + check_concat((8, 0, 0), (8, 0, 0), 2) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tools/dependencies/LICENSE.binary.dependencies b/tools/dependencies/LICENSE.binary.dependencies new file mode 100644 index 000000000000..c9202bac98e2 --- /dev/null +++ b/tools/dependencies/LICENSE.binary.dependencies @@ -0,0 +1,289 @@ + + Apache MXNET (incubating) Dependencies: + + The Apache MXNET (incubating) project contains dependencies with separate copyright + notices and license terms. Your use of these dependencies is subject to the terms and conditions of the following licenses. + + ======================================================================================= + MIT licenses + ======================================================================================= + + 1. cityhash - For details, see /~https://github.com/google/cityhash/blob/master/COPYING + Copyright (c) 2011 Google, Inc. + + ======================================================================================= + Mozilla Public License Version 2.0 + ======================================================================================= + + 1. eigen - For details, see /~https://github.com/eigenteam/eigen-git-mirror/blob/master/COPYING.MPL2 + Source code available at: /~https://github.com/eigenteam/eigen-git-mirror + Copyright (c) 2006-2019 various contributors + + ======================================================================================= + 2-clause BSD licenses + ======================================================================================= + + 1. lz4 - For details, see /~https://github.com/lz4/lz4/blob/dev/LICENSE + Copyright (c) 2011-2016, Yann Collet + + ======================================================================================= + 3-clause BSD licenses + ======================================================================================= + + 1. openblas - For details, see /~https://github.com/xianyi/OpenBLAS/blob/develop/LICENSE + Copyright (c) 2011-2014, The OpenBLAS Project + + 2. opencv - For details, see /~https://github.com/opencv/opencv/blob/master/LICENSE + Copyright (C) 2000-2019, Intel Corporation, all rights reserved. + Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. + Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
+ Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. + Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. + Copyright (C) 2015-2016, Itseez Inc., all rights reserved. + 3. protobuf - For details, see /~https://github.com/protocolbuffers/protobuf/blob/master/LICENSE + Copyright 2008 Google Inc. All rights reserved. + + ======================================================================================= + Apache-2.0 licenses + ======================================================================================= + + 1. openssl - For details, see /~https://github.com/openssl/openssl/blob/master/LICENSE + Copyright (c) 1998-2018 The OpenSSL Project + Copyright (c) 1995-1998 Eric A. Young, Tim J. Hudson + + ======================================================================================= + Other Licenses + ======================================================================================= + + 1. curl + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1996 - 2019, Daniel Stenberg, <daniel@haxx.se>, and many + contributors, see the THANKS file. + + All rights reserved. + + Permission to use, copy, modify, and distribute this software for any purpose + with or without fee is hereby granted, provided that the above copyright + notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN + NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE + OR OTHER DEALINGS IN THE SOFTWARE. + + Except as contained in this notice, the name of a copyright holder shall not + be used in advertising or otherwise to promote the sale, use or other dealings + in this Software without prior written authorization of the copyright holder. + + ======================================================================================= + + 2. libpng + + COPYRIGHT NOTICE, DISCLAIMER, and LICENSE + ========================================= + + PNG Reference Library License version 2 + --------------------------------------- + + * Copyright (c) 1995-2019 The PNG Reference Library Authors. + * Copyright (c) 2018-2019 Cosmin Truta. + * Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson. + * Copyright (c) 1996-1997 Andreas Dilger. + * Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. + + The software is supplied "as is", without warranty of any kind, + express or implied, including, without limitation, the warranties + of merchantability, fitness for a particular purpose, title, and + non-infringement. In no event shall the Copyright owners, or + anyone distributing the software, be liable for any damages or + other liability, whether in contract, tort or otherwise, arising + from, out of, or in connection with the software, or the use or + other dealings in the software, even if advised of the possibility + of such damage. + + Permission is hereby granted to use, copy, modify, and distribute + this software, or portions hereof, for any purpose, without fee, + subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you + must not claim that you wrote the original software. If you + use this software in a product, an acknowledgment in the product + documentation would be appreciated, but is not required. + + 2.
Altered source versions must be plainly marked as such, and must + not be misrepresented as being the original software. + + 3. This Copyright notice may not be removed or altered from any + source or altered source distribution. + + + PNG Reference Library License version 1 (for libpng 0.5 through 1.6.35) + ----------------------------------------------------------------------- + + libpng versions 1.0.7, July 1, 2000, through 1.6.35, July 15, 2018 are + Copyright (c) 2000-2002, 2004, 2006-2018 Glenn Randers-Pehrson, are + derived from libpng-1.0.6, and are distributed according to the same + disclaimer and license as libpng-1.0.6 with the following individuals + added to the list of Contributing Authors: + + Simon-Pierre Cadieux + Eric S. Raymond + Mans Rullgard + Cosmin Truta + Gilles Vollant + James Yu + Mandar Sahastrabuddhe + Google Inc. + Vadim Barkov + + and with the following additions to the disclaimer: + + There is no warranty against interference with your enjoyment of + the library or against infringement. There is no warranty that our + efforts or the library will fulfill any of your particular purposes + or needs. This library is provided with all faults, and the entire + risk of satisfactory quality, performance, accuracy, and effort is + with the user. + + Some files in the "contrib" directory and some configure-generated + files that are distributed with libpng have other copyright owners, and + are released under other open source licenses. + + libpng versions 0.97, January 1998, through 1.0.6, March 20, 2000, are + Copyright (c) 1998-2000 Glenn Randers-Pehrson, are derived from + libpng-0.96, and are distributed according to the same disclaimer and + license as libpng-0.96, with the following individuals added to the + list of Contributing Authors: + + Tom Lane + Glenn Randers-Pehrson + Willem van Schaik + + libpng versions 0.89, June 1996, through 0.96, May 1997, are + Copyright (c) 1996-1997 Andreas Dilger, are derived from libpng-0.88, + and are distributed according to the same disclaimer and license as + libpng-0.88, with the following individuals added to the list of + Contributing Authors: + + John Bowler + Kevin Bracey + Sam Bushell + Magnus Holmgren + Greg Roelofs + Tom Tanner + + Some files in the "scripts" directory have other copyright owners, + but are released under this license. + + libpng versions 0.5, May 1995, through 0.88, January 1996, are + Copyright (c) 1995-1996 Guy Eric Schalnat, Group 42, Inc. + + For the purposes of this copyright and license, "Contributing Authors" + is defined as the following set of individuals: + + Andreas Dilger + Dave Martindale + Guy Eric Schalnat + Paul Schmidt + Tim Wegner + + The PNG Reference Library is supplied "AS IS". The Contributing + Authors and Group 42, Inc. disclaim all warranties, expressed or + implied, including, without limitation, the warranties of + merchantability and of fitness for any purpose. The Contributing + Authors and Group 42, Inc. assume no liability for direct, indirect, + incidental, special, exemplary, or consequential damages, which may + result from the use of the PNG Reference Library, even if advised of + the possibility of such damage. + + Permission is hereby granted to use, copy, modify, and distribute this + source code, or portions hereof, for any purpose, without fee, subject + to the following restrictions: + + 1. The origin of this source code must not be misrepresented. + + 2. 
Altered versions must be plainly marked as such and must not + be misrepresented as being the original source. + + 3. This Copyright notice may not be removed or altered from any + source or altered source distribution. + + The Contributing Authors and Group 42, Inc. specifically permit, + without fee, and encourage the use of this source code as a component + to supporting the PNG file format in commercial products. If you use + this source code in a product, acknowledgment is not required but would + be appreciated. + + ======================================================================================= + + 3. libtiff + + Copyright (c) 1988-1997 Sam Leffler + Copyright (c) 1991-1997 Silicon Graphics, Inc. + + Permission to use, copy, modify, distribute, and sell this software and + its documentation for any purpose is hereby granted without fee, provided + that (i) the above copyright notices and this permission notice appear in + all copies of the software and related documentation, and (ii) the names of + Sam Leffler and Silicon Graphics may not be used in any advertising or + publicity relating to the software without the specific, prior written + permission of Sam Leffler and Silicon Graphics. + + THE SOFTWARE IS PROVIDED "AS-IS" AND WITHOUT WARRANTY OF ANY KIND, + EXPRESS, IMPLIED OR OTHERWISE, INCLUDING WITHOUT LIMITATION, ANY + WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + IN NO EVENT SHALL SAM LEFFLER OR SILICON GRAPHICS BE LIABLE FOR + ANY SPECIAL, INCIDENTAL, INDIRECT OR CONSEQUENTIAL DAMAGES OF ANY KIND, + OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + WHETHER OR NOT ADVISED OF THE POSSIBILITY OF DAMAGE, AND ON ANY THEORY OF + LIABILITY, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + OF THIS SOFTWARE. + + ======================================================================================= + + 4. libturbojpeg + + The Modified (3-clause) BSD License + Copyright (C)2009-2019 D. R. Commander. All Rights Reserved. Copyright (C)2015 Viktor Szathmáry. All Rights Reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + Neither the name of the libjpeg-turbo Project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS", AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + ======================================================================================= + + 5. libz + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + + ======================================================================================= + + diff --git a/tools/dependencies/eigen.sh b/tools/dependencies/eigen.sh index 96547b920a31..0dde3023d374 100755 --- a/tools/dependencies/eigen.sh +++ b/tools/dependencies/eigen.sh @@ -32,6 +32,7 @@ if [[ ! -d $DEPS_PATH/include/eigen3 ]]; then cd $DEPS_PATH/eigen-git-mirror-$EIGEN_VERSION/build cmake \ -D CMAKE_BUILD_TYPE=RELEASE \ + -D EIGEN_MPL2_ONLY=1 \ -D CMAKE_INSTALL_PREFIX=$DEPS_PATH .. $MAKE install popd diff --git a/tools/pip/setup.py b/tools/pip/setup.py index c9134d51e200..71e2549a3f19 100644 --- a/tools/pip/setup.py +++ b/tools/pip/setup.py @@ -66,12 +66,8 @@ def has_ext_modules(self): shutil.copy(LIB_PATH[0], os.path.join(CURRENT_DIR, 'mxnet')) # copy license and notice -shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/LICENSE'), - os.path.join(CURRENT_DIR, 'mxnet/LICENSE')) -shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/DISCLAIMER'), - os.path.join(CURRENT_DIR, 'mxnet/DISCLAIMER')) -shutil.copy(os.path.join(CURRENT_DIR, 'mxnet-build/NOTICE'), - os.path.join(CURRENT_DIR, 'mxnet/NOTICE')) +shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/licenses'), + os.path.join(CURRENT_DIR, 'mxnet/licenses')) # copy tools to mxnet package shutil.rmtree(os.path.join(CURRENT_DIR, 'mxnet/tools'), ignore_errors=True) @@ -154,20 +150,17 @@ def has_ext_modules(self): package_data['mxnet'].append('mxnet/libmklml_intel.so') package_data['mxnet'].append('mxnet/libiomp5.so') package_data['mxnet'].append('mxnet/libmkldnn.so.0') - shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), '../MKLML_LICENSE'), os.path.join(CURRENT_DIR, 'mxnet')) shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/3rdparty/mkldnn/include'), os.path.join(CURRENT_DIR, 'mxnet/include/mkldnn')) - package_data['mxnet'].append('mxnet/MKLML_LICENSE') if platform.system() == 'Linux': shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libgfortran.so.3'), os.path.join(CURRENT_DIR, 'mxnet')) package_data['mxnet'].append('mxnet/libgfortran.so.3') shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), 'libquadmath.so.0'), os.path.join(CURRENT_DIR, 'mxnet')) package_data['mxnet'].append('mxnet/libquadmath.so.0') -# copy licenses -if variant.startswith('CU'): - shutil.copy(os.path.join(os.path.dirname(LIB_PATH[0]), '../CUB_LICENSE'), os.path.join(CURRENT_DIR, 'mxnet')) - package_data['mxnet'].append('mxnet/CUB_LICENSE')
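The setup.py hunks above swap the per-file license copies for a staged licenses/ directory, and the loop that follows registers every staged file with setuptools (files not listed in package_data are not shipped in built distributions). A condensed sketch of the whole flow, using the names setup.py already defines (CURRENT_DIR, package_data):

    import os
    import shutil

    # stage the whole licenses directory inside the python package ...
    shutil.copytree(os.path.join(CURRENT_DIR, 'mxnet-build/licenses'),
                    os.path.join(CURRENT_DIR, 'mxnet/licenses'))
    # ... then register every file so the built wheel ships it
    for f in os.listdir('mxnet/licenses'):
        package_data['mxnet'].append('mxnet/licenses/{}'.format(f))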
+# Register bundled licenses and notice as package data +for f in os.listdir('mxnet/licenses'): + package_data['mxnet'].append('mxnet/licenses/{}'.format(f)) from mxnet.base import _generate_op_module_signature from mxnet.ndarray.register import _generate_ndarray_function_code diff --git a/tools/staticbuild/build.sh b/tools/staticbuild/build.sh index 4f8a78d70490..7207cf062180 100755 --- a/tools/staticbuild/build.sh +++ b/tools/staticbuild/build.sh @@ -64,5 +64,13 @@ mkdir -p $DEPS_PATH # Build Dependencies source tools/dependencies/make_shared_dependencies.sh +# Copy license, notice and disclaimer files for bundling +mkdir -p licenses +cp tools/dependencies/LICENSE.binary.dependencies licenses/ +cp NOTICE licenses/ +cp LICENSE licenses/ +cp DISCLAIMER licenses/ + # Build mxnet source tools/staticbuild/build_lib.sh diff --git a/tools/staticbuild/build_lib.sh b/tools/staticbuild/build_lib.sh index 728f3a1cda25..472bf57b101e 100755 --- a/tools/staticbuild/build_lib.sh +++ b/tools/staticbuild/build_lib.sh @@ -34,7 +34,6 @@ $MAKE DEPS_PATH=$DEPS_PATH $PWD/3rdparty/tvm/nnvm/lib/libnnvm.a $MAKE DEPS_PATH=$DEPS_PATH PSLITE if [[ $VARIANT == *mkl ]]; then - MKLDNN_LICENSE='license.txt' if [[ $PLATFORM == 'linux' ]]; then IOMP_LIBFILE='libiomp5.so' MKLML_LIBFILE='libmklml_intel.so' @@ -48,12 +47,6 @@ if [[ $VARIANT == *mkl ]]; then cp 3rdparty/mkldnn/build/install/lib/$IOMP_LIBFILE lib cp 3rdparty/mkldnn/build/install/lib/$MKLML_LIBFILE lib cp 3rdparty/mkldnn/build/install/lib/$MKLDNN_LIBFILE lib - cp 3rdparty/mkldnn/build/install/$MKLDNN_LICENSE lib - cp 3rdparty/mkldnn/LICENSE ./MKLML_LICENSE -fi - -if [[ $VARIANT == cu* ]]; then - cp 3rdparty/nvidia_cub/LICENSE.TXT ./CUB_LICENSE fi >&2 echo "Now building mxnet..."