diff --git a/.gitignore b/.gitignore
index 7eb8e7d6e..705ef92da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -63,11 +63,10 @@ __pycache__
*.states
*.json
*.d
-build
cmake-build*
data
+model
recommonmark
-deps

# R
*.Rcheck
@@ -96,6 +95,8 @@ input.txt*
# Jetbrain
.idea
+.gradle
+*.iml

# ctags
tags
@@ -104,28 +105,14 @@ tags
cscope.out
cscope.files

-# Scala package
-*.class
-scala-package/*/target/
-scala-package/*/*/target/
-*.scala_dependencies
-*.worksheet
-*.idea
-*.iml
-*.classpath
-*.project
-*.settings
-!scala-package/*/bin
-*.bak
-*/node_modules/
-
# Eclipse project config
.project
.cproject
+.classpath
+.settings
.pydevproject
CMakeFiles
cmake_install.cmake
-lib

# Visual Studio Code
.vscode
@@ -145,12 +132,12 @@ tools/pip_package/mxnet.egg-info
tools/pip_package/mxnet

# temporary path for building dependencies when building wheel
-./deps/
-bld
-./tmp/*
-*.jar
-target
-bin/im2rec
+deps/
+staticdeps/
+tmp/
+build/
+lib/
+bin/
model/

# VTune
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 6dc04f7c7..3dc80815d 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
+Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe
diff --git a/3rdparty/onnx-tensorrt b/3rdparty/onnx-tensorrt
index 3d8ee0499..f1c7aa63d 160000
--- a/3rdparty/onnx-tensorrt
+++ b/3rdparty/onnx-tensorrt
@@ -1 +1 @@
-Subproject commit 3d8ee049970e81ff4935cc7f36b653c0b27bcbbc
+Subproject commit f1c7aa63d88d8d8ef70490f2ebb6b33f7450218b
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 45ab12d14..f980f8978 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,17 @@
cmake_minimum_required(VERSION 3.0.2)
+# workaround to store CMAKE_CROSSCOMPILING because it gets reset by the project command
+if(CMAKE_CROSSCOMPILING)
+  set(__CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING})
+  set(__CMAKE_CROSSCOMPILING_OVERRIDE ON)
+endif()
+
project(mxnet C CXX)
+if(__CMAKE_CROSSCOMPILING_OVERRIDE)
+  set(CMAKE_CROSSCOMPILING ${__CMAKE_CROSSCOMPILING})
+endif()
+
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
  include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake)
endif()
@@ -21,7 +31,7 @@ mxnet_option(USE_LAPACK "Build with lapack support" ON)
mxnet_option(USE_NGRAPH "Build with nGraph support" ON)
mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE))
-mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64) AND (NOT USE_NGRAPH))
+mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING))
mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC)
mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON)
mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON)
@@ -42,6 +52,10 @@ mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT
mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers."
OFF) mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") +message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") +message(STATUS "CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}") + message(STATUS "CMAKE_SYSTEM_NAME ${CMAKE_SYSTEM_NAME}") if(USE_CUDA AND NOT USE_OLDCMAKECUDA) message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") @@ -257,6 +271,7 @@ if(USE_TENSORRT) include_directories(${ONNX_PATH}) include_directories(3rdparty/onnx-tensorrt/) include_directories(3rdparty/) + include_directories(3rdparty/onnx-tensorrt/third_party/onnx/) add_definitions(-DMXNET_USE_TENSORRT=1) add_definitions(-DONNX_NAMESPACE=onnx) @@ -285,7 +300,7 @@ if(ENABLE_TESTCOVERAGE) if(NOT GCOV_PATH) message(FATAL_ERROR "gcov not found! Aborting...") endif() # NOT GCOV_PATH - + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --coverage") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage") set(CMAKE_LINKER_FLAGS "${CMAKE_LINKER_FLAGS} --coverage") @@ -302,9 +317,11 @@ if(USE_MKLDNN) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy") endif() + set(WITH_TEST OFF CACHE INTERNAL "" FORCE) set(WITH_EXAMPLE OFF CACHE INTERNAL "" FORCE) set(ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE) + add_subdirectory(3rdparty/mkldnn) include_directories(3rdparty/mkldnn/include) @@ -325,7 +342,6 @@ if(USE_CUDA) if(NOT CUDA_TOOLSET) set(CUDA_TOOLSET "${CUDA_VERSION_STRING}") endif() - set(CMAKE_GENERATOR_TOOLSET "cuda=${CUDA_TOOLSET},host=x64") else() set(FIRST_CUDA FALSE) endif() @@ -477,12 +493,14 @@ if(USE_OPENMP) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + add_definitions(-DMXNET_USE_OPENMP=1) else() if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + add_definitions(-DMXNET_USE_OPENMP=1) endif() endif() elseif(UNIX AND NOT ANDROID) @@ -815,6 +833,10 @@ install(TARGETS ${MXNET_INSTALL_TARGETS} # https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html # https://cmake.org/cmake/help/v3.0/module/GNUInstallDirs.html +# NOTE: Public headers will be installed into ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}, see +# https://cmake.org/cmake/help/v3.0/variable/CMAKE_INSTALL_PREFIX.html +# https://cmake.org/cmake/help/v3.0/module/GNUInstallDirs.html + install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install(DIRECTORY 3rdparty/tvm/nnvm/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if (INSTALL_EXAMPLES) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 5b5fdce71..6bf44c55d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -194,6 +194,7 @@ List of Contributors * [Harsh Patel](/~https://github.com/harshp8l) * [Xiao Wang](/~https://github.com/BeyonderXX) * [Piyush Ghai](/~https://github.com/piyushghai) +* [Zach Boldyga](/~https://github.com/zboldyga) Label Bot --------- diff --git a/MXNET_README.md b/MXNET_README.md index 369df9b64..f5d40a6f6 100644 --- a/MXNET_README.md +++ b/MXNET_README.md @@ -77,7 +77,7 @@ Features * Mix and match imperative and symbolic programming to maximize flexibility and efficiency * Lightweight, memory efficient and 
portable to smart devices * Scales up to multi GPUs and distributed setting with auto parallelism -* Support for Python, R, Scala, C++ and Julia +* Support for Python, Scala, C++, Java, Clojure, R and Julia * Cloud-friendly and directly compatible with S3, HDFS, and Azure License diff --git a/Makefile b/Makefile index cf05a0035..bbe40842d 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,7 @@ endif endif endif endif + ifeq ($(USE_MKL2017), 1) $(warning "USE_MKL2017 is deprecated. We will switch to USE_MKLDNN.") USE_MKLDNN=1 @@ -230,6 +231,16 @@ ifeq ($(USE_CUDNN), 1) LDFLAGS += -lcudnn endif +ifeq ($(use_blas), open) + CFLAGS += -DMXNET_USE_BLAS_OPEN=1 +else ifeq ($(use_blas), atlas) + CFLAGS += -DMXNET_USE_BLAS_ATLAS=1 +else ifeq ($(use_blas), mkl) + CFLAGS += -DMXNET_USE_BLAS_MKL=1 +else ifeq ($(use_blas), apple) + CFLAGS += -DMXNET_USE_BLAS_APPLE=1 +endif + # whether to use F16C instruction set extension for fast fp16 compute on CPU # if cross compiling you may want to explicitly turn it off if target system does not support it ifndef USE_F16C @@ -485,6 +496,7 @@ endif ifeq ($(CI), 1) MAVEN_ARGS := -B endif + # For quick compile test, used smaller subset ALLX_DEP= $(ALL_DEP) @@ -494,7 +506,7 @@ build/src/%.o: src/%.cc | mkldnn ngraph build/src/%_gpu.o: src/%.cu | mkldnn ngraph @mkdir -p $(@D) - $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" -M -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d + $(NVCC) $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" --generate-dependencies -MT build/src/$*_gpu.o $< >build/src/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) $(CUDA_ARCH) -Xcompiler "$(CFLAGS)" $< # A nvcc bug cause it to generate "generic/xxx.h" dependencies from torch headers. @@ -521,6 +533,7 @@ build/plugin/%.o: plugin/%.cc | ngraph ifeq ($(UNAME_S), Darwin) LDFLAGS += -Wl,-install_name,@rpath/libmxnet.so endif + # NOTE: to statically link libmxnet.a we need the option # --Wl,--whole-archive -lmxnet --Wl,--no-whole-archive lib/libmxnet.a: $(ALLX_DEP) @@ -621,6 +634,13 @@ rpkg: mkdir -p R-package/inst/libs cp src/io/image_recordio.h R-package/src cp -rf lib/libmxnet.so R-package/inst/libs + + if [ -e "lib/libmkldnn.so.0" ]; then \ + cp -rf lib/libmkldnn.so.0 R-package/inst/libs; \ + cp -rf lib/libiomp5.so R-package/inst/libs; \ + cp -rf lib/libmklml_intel.so R-package/inst/libs; \ + fi + mkdir -p R-package/inst/include cp -rf include/* R-package/inst/include rm R-package/inst/include/dmlc @@ -628,7 +648,7 @@ rpkg: cp -rf 3rdparty/dmlc-core/include/* R-package/inst/include/ cp -rf 3rdparty/tvm/nnvm/include/* R-package/inst/include Rscript -e "if(!require(devtools)){install.packages('devtools', repo = 'https://cloud.r-project.org/')}" - Rscript -e "if(!require(devtools)||packageVersion('roxygen2') < '6.1.1'){install.packages('roxygen2', repo = 'https://cloud.r-project.org/')}" + Rscript -e "if(!require(roxygen2)||packageVersion('roxygen2') < '6.1.1'){install.packages('roxygen2', repo = 'https://cloud.r-project.org/')}" Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cloud.r-project.org/')); install_deps(pkg='R-package', dependencies = TRUE)" cp R-package/dummy.NAMESPACE R-package/NAMESPACE echo "import(Rcpp)" >> R-package/NAMESPACE @@ -674,9 +694,8 @@ clean: rclean cyclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS)) $(RM) -r $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS)) else -clean: ngraph_clean rclean mkldnn_clean cyclean testclean 
$(EXTRA_PACKAGES_CLEAN)
-	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
-		R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
+clean: rclean mkldnn_clean cyclean testclean $(EXTRA_PACKAGES_CLEAN)
+	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~
	cd $(DMLC_CORE); $(MAKE) clean; cd -
	cd $(PS_PATH); $(MAKE) clean; cd -
	cd $(NNVM_PATH); $(MAKE) clean; cd -
diff --git a/NOTICE b/NOTICE
index 98321cba7..ecc58f6a2 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,5 +1,5 @@
Apache MXNET (incubating)
-   Copyright 2017-2018 The Apache Software Foundation
+   Copyright 2017 and onwards The Apache Software Foundation

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
diff --git a/R-package/R/context.R b/R-package/R/context.R
index 6bbb9aa6a..1c5a56ed9 100644
--- a/R-package/R/context.R
+++ b/R-package/R/context.R
@@ -22,7 +22,7 @@ init.context.default <- function() {

#' Set/Get default context for array creation.
#'
-#' @param new, optional takes \code{mx.cpu()} or \code{mx.gpu(id)}, new default ctx.
+#' @param new optional takes \code{mx.cpu()} or \code{mx.gpu(id)}, new default ctx.
#' @return The default context.
#'
#' @export
diff --git a/R-package/R/model.R b/R-package/R/model.R
index d71bc5cae..51d1705ba 100644
--- a/R-package/R/model.R
+++ b/R-package/R/model.R
@@ -562,7 +562,7 @@ mx.model.FeedForward.create <-
#'
#' @param model The MXNet Model.
#' @param X The dataset to predict.
-#' @param ctx mx.cpu() or mx.gpu(i) The device used to generate the prediction.
+#' @param ctx mx.cpu() or mx.gpu(). The device used to generate the prediction.
#' @param array.batch.size The batch size used in batching. Only used when X is R's array.
#' @param array.layout can be "auto", "colmajor", "rowmajor", (detault=auto)
#' The layout of array. "rowmajor" is only supported for two dimensional array.
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index c0af11dc9..6f13f7b26 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -21,7 +21,7 @@
#' @param learning.rate float, default=0.01
#' The initial learning rate.
#' @param momentum float, default=0
-#' The momentumvalue
+#' The momentum value
#' @param wd float, default=0.0
#' L2 regularization coefficient add to all the weights.
#' @param rescale.grad float, default=1.0
@@ -453,6 +453,110 @@ mx.opt.adadelta <- function(rho = 0.90,
}

+#' Create a Nesterov Accelerated SGD (NAG) optimizer.
+#'
+#' The NAG optimizer is described in Aleksandar Botev et al. (2016).
+#' *NAG: A Nesterov accelerated SGD.*
+#' https://arxiv.org/pdf/1607.01981.pdf
+#'
+#' @param learning.rate float, default=0.01
+#' The initial learning rate.
+#' @param momentum float, default=0
+#' The momentum value
+#' @param wd float, default=0.0
+#' L2 regularization coefficient added to all the weights.
+#' @param rescale.grad float, default=1.0
+#' rescaling factor of gradient.
+#' @param clip_gradient float, optional, default=-1 (no clipping if < 0)
+#' clip gradient in range [-clip_gradient, clip_gradient].
+#' @param lr_scheduler function, optional
+#' The learning rate scheduler.
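+#'
+#' A minimal usage sketch (argument values here are only illustrative):
+#' @examples
+#' \dontrun{
+#' optimizer <- mx.opt.create("nag", learning.rate = 0.1, momentum = 0.9, wd = 1e-4)
+#' }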
+#'
+mx.opt.nag <- function(learning.rate = 0.01,
+                       momentum = 0,
+                       wd = 0,
+                       rescale.grad = 1,
+                       clip_gradient = -1,
+                       lr_scheduler = NULL) {
+
+  lr <- learning.rate
+  count <- 0
+  num_update <- 0
+
+  nag <- new.env()
+  nag$lr <- learning.rate
+  nag$count <- 0
+  nag$num_update <- 0
+
+  create_exec <- function(index, weight_dim, ctx) {
+
+    weight <- mx.symbol.Variable("weight")
+    grad <- mx.symbol.Variable("grad")
+    mom <- mx.symbol.Variable("mom")
+    grad <- grad * rescale.grad
+
+    if (!is.null(clip_gradient)) {
+      if (clip_gradient >= 0) {
+        grad <- mx.symbol.clip(data = grad, a.min = -clip_gradient, a.max = clip_gradient)
+      }
+    }
+
+    if (momentum == 0) {
+
+      # without momentum this reduces to a plain SGD step with weight decay
+      weight <- weight - lr * (grad + (wd * weight))
+      w <- mx.symbol.identity(weight, name = "w")
+      sym <- mx.symbol.Group(c(w))
+
+    } else {
+
+      # Nesterov momentum: update the momentum state, then apply momentum to it
+      # once more so the weight step uses a look-ahead gradient
+      mom <- momentum * mom + grad + wd * weight
+      grad <- momentum * mom + grad
+      weight <- weight - lr * grad
+
+      w <- mx.symbol.identity(weight, name = "w")
+      m <- mx.symbol.identity(mom, name = "m")
+      sym <- mx.symbol.Group(c(w, m))
+
+    }
+
+    exec <- mx.simple.bind(symbol = sym, weight = weight_dim, ctx = ctx, grad.req = "null")
+    return(exec)
+  }
+
+  update <- function(index, exec_w, weight, grad) {
+
+    if (!is.null(lr_scheduler)) {
+      lr_scheduler(nag) ## changing lr
+      lr <- nag$lr
+      ## update count
+      indexKey <- paste0('ik', index)
+      if (!exists(envir = nag, x = indexKey, inherits = FALSE)) {
+        nag[[indexKey]] <- 0
+      } else {
+        indexValue <- nag[[indexKey]]
+        nag[[indexKey]] <- indexValue + 1
+        nag$num_update <- max(nag$num_update, nag[[indexKey]])
+      }
+    }
+
+    mx.exec.update.arg.arrays(exec_w,
+                              arg.arrays = list(weight = weight, grad = grad),
+                              match.name = T)
+    mx.exec.forward(exec_w, is.train = F)
+
+    # update state
+    if (!is.null(exec_w$ref.outputs$m_output)) {
+      mx.exec.update.arg.arrays(exec_w,
+                                arg.arrays = list(mom = exec_w$ref.outputs$m_output),
+                                match.name = T)
+    }
+
+    return(exec_w$ref.outputs$w_output)
+  }
+  return(list(create_exec = create_exec, update = update))
+}
+
+
#' Create an optimizer by name and parameters
#'
#' @param name The name of the optimizer
@@ -466,6 +570,7 @@ mx.opt.create <- function(name, ...)
{ "adam" = mx.opt.adam(...), "adagrad" = mx.opt.adagrad(...), "adadelta" = mx.opt.adadelta(...), + "nag" = mx.opt.nag(...), stop("Unknown optimizer ", name)) } diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R index 2ceefb57d..1225fa511 100644 --- a/R-package/R/rnn.graph.R +++ b/R-package/R/rnn.graph.R @@ -195,7 +195,7 @@ gru.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dr } -#' unroll representation of RNN running on non CUDA device +#' Unroll representation of RNN running on non CUDA device #' #' @param config Either seq-to-one or one-to-one #' @param cell_type Type of RNN cell: either gru or lstm diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R index 1ae7bc23b..1eec83f2d 100644 --- a/R-package/tests/testthat/test_optimizer.R +++ b/R-package/tests/testthat/test_optimizer.R @@ -17,6 +17,12 @@ context("optimizer") +if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == + 1) { + mx.ctx.default(new = mx.gpu()) + message("Using GPU for testing.") +} + test_that("sgd", { data <- mx.symbol.Variable("data") @@ -30,14 +36,14 @@ test_that("sgd", { y <- mx.nd.array(c(5, 11, 16)) w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) optimizer <- mx.opt.create("sgd", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, clip_gradient = -1) - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) mx.exec.forward(exec, is.train = T) mx.exec.backward(exec) @@ -63,14 +69,14 @@ test_that("rmsprop", { y <- mx.nd.array(c(5, 11, 16)) w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, gamma1 = 0.95, gamma2 = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) mx.exec.forward(exec, is.train = T) mx.exec.backward(exec) @@ -97,14 +103,14 @@ test_that("adam", { y <- mx.nd.array(c(5, 11, 16)) w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) optimizer <- mx.opt.create("adam", learning.rate = 1, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1) - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) mx.exec.forward(exec, is.train = T) mx.exec.backward(exec) @@ -131,14 +137,14 @@ test_that("adagrad", { y <- mx.nd.array(c(5, 11, 16)) w1 <- 
mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) optimizer <- mx.opt.create("adagrad", learning.rate = 1, epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1) - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) mx.exec.forward(exec, is.train = T) mx.exec.backward(exec) @@ -164,22 +170,82 @@ test_that("adadelta", { y <- mx.nd.array(c(5, 11, 16)) w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) - exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.cpu(), arg.arrays = list(data = x, + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) optimizer <- mx.opt.create("adadelta", rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1, clip_gradient = -1) - updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.cpu()) + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) mx.exec.forward(exec, is.train = T) mx.exec.backward(exec) arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) - + expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2, 1)), tolerance = 0.1) }) + + +test_that("nag_no_momentum", { + data <- mx.symbol.Variable("data") + label <- mx.symbol.Variable("label") + fc_weight <- mx.symbol.Variable("fc_weight") + fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, + name = "fc1", num_hidden = 1) + loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") + + x <- mx.nd.array(array(1:6, dim = 2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, + fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) + + optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, + clip_gradient = -1) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) + + mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.05) +}) + + +test_that("nag_momentum", { + data <- mx.symbol.Variable("data") + label <- mx.symbol.Variable("label") + fc_weight <- mx.symbol.Variable("fc_weight") + fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, + name = "fc1", num_hidden = 1) + loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss") + + x <- mx.nd.array(array(1:6, dim = 2:3)) + y <- mx.nd.array(c(5, 11, 16)) + w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1))) + + exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, + fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) + + optimizer <- 
mx.opt.create("nag", learning.rate = 1, momentum = 0.1, wd = 0, rescale.grad = 1, + clip_gradient = 5) + + updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) + + mx.exec.forward(exec, is.train = T) + mx.exec.backward(exec) + + arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays) + mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE) + + expect_equal(as.array(arg.blocks[[2]]), array(c(1.45, 2.65), dim = c(2, 1)), tolerance = 0.1) +}) diff --git a/ci/Jenkinsfile_utils.groovy b/ci/Jenkinsfile_utils.groovy index 8291bae1f..054deb5f8 100644 --- a/ci/Jenkinsfile_utils.groovy +++ b/ci/Jenkinsfile_utils.groovy @@ -174,9 +174,17 @@ def update_github_commit_status(state, message) { //properly and you would see an empty list of repos: //[Set GitHub commit status (universal)] PENDING on repos [] (sha:xxxxxxx) with context:test/mycontext //See https://cwiki.apache.org/confluence/display/MXNET/Troubleshooting#Troubleshooting-GitHubcommit/PRstatusdoesnotgetpublished + + echo "Publishing commit status..." + repoUrl = get_repo_url() + echo "repoUrl=${repoUrl}" + commitSha = get_git_commit_hash() + echo "commitSha=${commitSha}" + context = get_github_context() + echo "context=${context}" step([ $class: 'GitHubCommitStatusSetter', @@ -190,6 +198,9 @@ def update_github_commit_status(state, message) { results: [[$class: "AnyBuildResult", message: message, state: state]] ] ]) + + echo "Publishing commit status done." + } } diff --git a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt index 255da3160..f4844115c 100644 --- a/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt +++ b/ci/docker/Dockerfile.build.ubuntu_gpu_tensorrt @@ -18,7 +18,7 @@ # # Dockerfile to run MXNet on Ubuntu 16.04 for CPU -FROM nvidia/cuda:9.0-cudnn7-devel +FROM nvidia/cuda:10.0-cudnn7-devel WORKDIR /work/deps diff --git a/example/autoencoder/data.py b/ci/docker/Dockerfile.publish.ubuntu1604_cpu similarity index 58% rename from example/autoencoder/data.py rename to ci/docker/Dockerfile.publish.ubuntu1604_cpu index 99dd4eb43..df284be2b 100644 --- a/example/autoencoder/data.py +++ b/ci/docker/Dockerfile.publish.ubuntu1604_cpu @@ -1,3 +1,4 @@ +# -*- mode: dockerfile -*- # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -14,21 +15,28 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# +# Dockerfile to build and run MXNet on Ubuntu 16.04 for CPU + +FROM ubuntu:16.04 + +WORKDIR /work/deps + +COPY install/ubuntu_base.sh /work/ +RUN /work/ubuntu_base.sh + +COPY install/ubuntu_python.sh /work/ +RUN /work/ubuntu_python.sh -# pylint: disable=missing-docstring -from __future__ import print_function +COPY install/ubuntu_scala.sh /work/ +RUN /work/ubuntu_scala.sh -import os -import numpy as np -from sklearn.datasets import fetch_mldata +ARG USER_ID=0 +ARG GROUP_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh +COPY runtime_functions.sh /work/ -def get_mnist(): - np.random.seed(1234) # set seed for deterministic ordering - data_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) - data_path = os.path.join(data_path, '../../data') - mnist = fetch_mldata('MNIST original', data_home=data_path) - p = np.random.permutation(mnist.data.shape[0]) - X = mnist.data[p].astype(np.float32)*0.02 - Y = mnist.target[p] - return X, Y +WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.ubuntu1604_gpu b/ci/docker/Dockerfile.publish.ubuntu1604_gpu new file mode 100644 index 000000000..2a1f8594e --- /dev/null +++ b/ci/docker/Dockerfile.publish.ubuntu1604_gpu @@ -0,0 +1,42 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
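+#
+# Used by ci/publish/Jenkinsfile as the "publish.ubuntu1604_gpu" platform
+# for the Scala deploy stage.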
+# +# Dockerfile to run MXNet on Ubuntu 16.04 for GPU + +FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 + +WORKDIR /work/deps + +COPY install/ubuntu_base.sh /work/ +RUN /work/ubuntu_base.sh + +COPY install/ubuntu_python.sh /work/ +RUN /work/ubuntu_python.sh + +COPY install/ubuntu_scala.sh /work/ +RUN /work/ubuntu_scala.sh + +ARG USER_ID=0 +ARG GROUP_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh + +COPY runtime_functions.sh /work/ + +WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/install/centos7_scala.sh b/ci/docker/install/centos7_scala.sh index 5c43f011c..5a1c4163a 100755 --- a/ci/docker/install/centos7_scala.sh +++ b/ci/docker/install/centos7_scala.sh @@ -25,8 +25,11 @@ set -ex yum install -y java-1.8.0-openjdk-devel export JAVA_HOME=/usr/lib/jvm/jre-1.8.0-openjdk export PATH=$JAVA_HOME/bin:$PATH + # Build from source with Maven -wget -q http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz +curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \ + || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz + tar xzf apache-maven-3.3.9-bin.tar.gz mkdir /usr/local/maven mv apache-maven-3.3.9/ /usr/local/maven/ diff --git a/ci/docker/install/tensorrt.sh b/ci/docker/install/tensorrt.sh index 61e73ef9a..1950cad0b 100755 --- a/ci/docker/install/tensorrt.sh +++ b/ci/docker/install/tensorrt.sh @@ -26,7 +26,7 @@ pip3 install gluoncv==0.2.0 pushd . cd .. apt-get update -apt-get install -y automake libtool +apt-get install -y automake libtool zip git clone --recursive -b 3.5.1.1 /~https://github.com/google/protobuf.git cd protobuf ./autogen.sh @@ -41,7 +41,7 @@ popd # Install TensorRT echo "TensorRT build enabled. Installing TensorRT." -wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0_1-1_amd64.deb +wget -qO tensorrt.deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64/nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0_1-1_amd64.deb dpkg -i tensorrt.deb apt-get update apt-get install -y --allow-downgrades libnvinfer-dev diff --git a/ci/docker/install/ubuntu_base.sh b/ci/docker/install/ubuntu_base.sh index b34c0b3e1..f36e53279 100755 --- a/ci/docker/install/ubuntu_base.sh +++ b/ci/docker/install/ubuntu_base.sh @@ -33,4 +33,8 @@ apt-get install -y \ software-properties-common \ sudo \ unzip \ + expect \ + gnupg \ + gnupg2 \ + gnupg-agent \ wget diff --git a/ci/docker/install/ubuntu_clang.sh b/ci/docker/install/ubuntu_clang.sh index 19aada9b3..ac1bdac46 100755 --- a/ci/docker/install/ubuntu_clang.sh +++ b/ci/docker/install/ubuntu_clang.sh @@ -24,7 +24,7 @@ set -ex apt-get update || true # Install clang 3.9 (the same version as in XCode 8.*) and 6.0 (latest major release) -wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ +wget -qO - http://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-3.9 main" && \ apt-add-repository "deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main" && \ apt-get update && \ @@ -35,4 +35,6 @@ wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - && \ # Use llvm's master version of run-clang-tidy.py. 
This version has mostly minor updates, but
# importantly will properly return a non-zero exit code when an error is reported in clang-tidy.
# Please remove the below if we install a clang version higher than 6.0.
-wget https://raw.githubusercontent.com/llvm-mirror/clang-tools-extra/7654135f0cbd155c285fd2a37d87e27e4fff3071/clang-tidy/tool/run-clang-tidy.py -O /usr/lib/llvm-6.0/share/clang/run-clang-tidy.py
+wget \
+    -qO /usr/lib/llvm-6.0/share/clang/run-clang-tidy.py \
+    https://raw.githubusercontent.com/llvm-mirror/clang-tools-extra/7654135f0cbd155c285fd2a37d87e27e4fff3071/clang-tidy/tool/run-clang-tidy.py
diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh
index 4382aa6ae..fc903e5c8 100755
--- a/ci/docker/install/ubuntu_core.sh
+++ b/ci/docker/install/ubuntu_core.sh
@@ -22,6 +22,10 @@ set -ex

apt-get update || true
+
+# Avoid interactive package installers such as tzdata.
+export DEBIAN_FRONTEND=noninteractive
+
apt-get install -y \
    apt-transport-https \
    build-essential \
@@ -41,10 +45,11 @@ apt-get install -y \
    unzip \
    wget

-
-# Ubuntu 14.04
-if [[ $(lsb_release -r | grep 14.04) ]]; then
-    apt-get install -y cmake3
-else
-    apt-get install -y cmake
-fi
+# Note: we specify an exact cmake version to work around a cmake 3.10 CUDA 10 issue.
+# Reference: /~https://github.com/clab/dynet/issues/1457
+mkdir /opt/cmake && cd /opt/cmake
+wget -nv https://cmake.org/files/v3.12/cmake-3.12.4-Linux-x86_64.sh
+sh cmake-3.12.4-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
+ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
+rm cmake-3.12.4-Linux-x86_64.sh
+cmake --version
diff --git a/ci/docker/install/ubuntu_julia.sh b/ci/docker/install/ubuntu_julia.sh
index 13093acc4..6849fe228 100755
--- a/ci/docker/install/ubuntu_julia.sh
+++ b/ci/docker/install/ubuntu_julia.sh
@@ -32,8 +32,8 @@ function install_julia() {
    # The julia version in Ubuntu repo is too old
    # We download the tarball from the official link:
    # https://julialang.org/downloads/
-    wget -O $JLBINARY https://julialang-s3.julialang.org/bin/linux/x64/$1/julia-$2-linux-x86_64.tar.gz
-    tar xzvf $JLBINARY -C $JULIADIR --strip 1
+    wget -qO $JLBINARY https://julialang-s3.julialang.org/bin/linux/x64/$1/julia-$2-linux-x86_64.tar.gz
+    tar xzf $JLBINARY -C $JULIADIR --strip 1
    rm $JLBINARY

    $JULIA -e 'using InteractiveUtils; versioninfo()'
diff --git a/ci/docker/install/ubuntu_llvm.sh b/ci/docker/install/ubuntu_llvm.sh
index afd881eae..8b6e765b5 100755
--- a/ci/docker/install/ubuntu_llvm.sh
+++ b/ci/docker/install/ubuntu_llvm.sh
@@ -22,6 +22,6 @@ echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial-5.0 main\
    >> /etc/apt/sources.list.d/llvm.list

-wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add -
+wget -qO - http://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
apt-get update || true
apt-get install -y --force-yes llvm-5.0
diff --git a/ci/docker/install/ubuntu_mkl.sh b/ci/docker/install/ubuntu_mkl.sh
index 36fc7b07f..97ef15809 100755
--- a/ci/docker/install/ubuntu_mkl.sh
+++ b/ci/docker/install/ubuntu_mkl.sh
@@ -25,7 +25,7 @@ set -ex
apt-get update || true
# Install Intel Math Kernel Library (latest major release)
# https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo
-wget -O - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB | apt-key add - && \
+wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB |
apt-key add - && \ sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' && \ apt-get update && \ apt-get install -y intel-mkl-2019.1-053 diff --git a/ci/docker/install/ubuntu_publish.sh b/ci/docker/install/ubuntu_publish.sh index 1ad6ab947..1fb7bf165 100644 --- a/ci/docker/install/ubuntu_publish.sh +++ b/ci/docker/install/ubuntu_publish.sh @@ -18,6 +18,8 @@ # under the License. # Build on Ubuntu 14.04 LTS for LINUX CPU/GPU +set -ex + apt-get update apt-get install -y software-properties-common add-apt-repository ppa:ubuntu-toolchain-r/test -y @@ -45,7 +47,10 @@ apt-get install -y git \ automake \ pkg-config \ openjdk-8-jdk -curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz + +curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \ + || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz + tar xzf apache-maven-3.3.9-bin.tar.gz mkdir /usr/local/maven mv apache-maven-3.3.9/ /usr/local/maven/ diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh index 5bade4746..9115bbc8a 100755 --- a/ci/docker/install/ubuntu_scala.sh +++ b/ci/docker/install/ubuntu_scala.sh @@ -36,14 +36,13 @@ apt-get install -y \ openjdk-8-jdk \ openjdk-8-jre \ software-properties-common \ - gnupg \ - gnupg2 \ - gnupg-agent \ scala # Ubuntu 14.04 if [[ $(lsb_release -r | grep 14.04) ]]; then - curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz + curl -o apache-maven-3.3.9-bin.tar.gz http://www.eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz \ + || curl -o apache-maven-3.3.9-bin.tar.gz https://search.maven.org/remotecontent?filepath=org/apache/maven/apache-maven/3.3.9/apache-maven-3.3.9-bin.tar.gz + tar xzf apache-maven-3.3.9-bin.tar.gz mkdir /usr/local/maven mv apache-maven-3.3.9/ /usr/local/maven/ diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh index 404d4bbf6..60adf46e6 100755 --- a/ci/docker/install/ubuntu_tutorials.sh +++ b/ci/docker/install/ubuntu_tutorials.sh @@ -23,5 +23,5 @@ set -ex apt-get update || true apt-get install graphviz python-opencv -pip2 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard -pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard +pip2 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard scipy +pip3 install jupyter matplotlib Pillow opencv-python scikit-learn graphviz tqdm mxboard scipy diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 6c3f99948..2622029f7 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -602,23 +602,23 @@ build_ubuntu_gpu_tensorrt() { cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser_runtime.so.0 /work/mxnet/lib/ cp -L 3rdparty/onnx-tensorrt/build/libnvonnxparser.so.0 /work/mxnet/lib/ - rm -rf build - make \ - DEV=1 \ - ENABLE_TESTCOVERAGE=1 \ - USE_BLAS=openblas \ - USE_CUDA=1 \ - USE_CUDA_PATH=/usr/local/cuda \ - USE_CUDNN=1 \ - USE_OPENCV=0 \ - USE_MKLDNN=0 \ - USE_DIST_KVSTORE=0 \ - USE_TENSORRT=1 \ - USE_JEMALLOC=0 \ - USE_GPERFTOOLS=0 \ - ONNX_NAMESPACE=onnx \ - CUDA_ARCH="-gencode arch=compute_70,code=compute_70" \ - -j$(nproc) + cd 
/work/build + cmake -DUSE_CUDA=1 \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DUSE_CUDNN=1 \ + -DUSE_OPENCV=1 \ + -DUSE_TENSORRT=1 \ + -DUSE_OPENMP=0 \ + -DUSE_MKLDNN=0 \ + -DUSE_MKL_IF_AVAILABLE=OFF \ + -DENABLE_TESTCOVERAGE=ON \ + -DCUDA_ARCH_NAME=Manual \ + -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -G Ninja \ + /work/mxnet + + ninja -v } build_ubuntu_gpu_mkldnn() { @@ -868,6 +868,15 @@ unittest_ubuntu_cpu_clojure() { ./contrib/clojure-package/ci-test.sh } +unittest_ubuntu_cpu_clojure_integration() { + set -ex + cd scala-package + mvn -B install + cd .. + ./contrib/clojure-package/integration-tests.sh +} + + unittest_ubuntu_cpugpu_perl() { set -ex ./perl-package/test.sh @@ -883,6 +892,7 @@ unittest_ubuntu_cpu_R() { mkdir -p /tmp/r-site-library # build R packages in parallel mkdir -p ~/.R/ + build_ccache_wrappers echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars # make -j not supported make rpkg \ @@ -893,11 +903,42 @@ unittest_ubuntu_cpu_R() { make rpkgtest R_LIBS=/tmp/r-site-library } +unittest_ubuntu_minimal_R() { + set -ex + mkdir -p /tmp/r-site-library + # build R packages in parallel + mkdir -p ~/.R/ + build_ccache_wrappers + echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars + # make -j not supported + make rpkg \ + USE_BLAS=openblas \ + R_LIBS=/tmp/r-site-library + + R CMD INSTALL --library=/tmp/r-site-library R-package + # pick mlp as minimal R test + R_LIBS=/tmp/r-site-library \ + Rscript -e "library(mxnet); require(mlbench); \ + data(Sonar, package=\"mlbench\"); \ + Sonar[,61] = as.numeric(Sonar[,61])-1; \ + train.ind = c(1:50, 100:150); \ + train.x = data.matrix(Sonar[train.ind, 1:60]); \ + train.y = Sonar[train.ind, 61]; \ + test.x = data.matrix(Sonar[-train.ind, 1:60]); \ + test.y = Sonar[-train.ind, 61]; \ + model = mx.mlp(train.x, train.y, hidden_node = 10, \ + out_node = 2, out_activation = \"softmax\", \ + learning.rate = 0.1, \ + array.layout = \"rowmajor\"); \ + preds = predict(model, test.x, array.layout = \"rowmajor\")" +} + unittest_ubuntu_gpu_R() { set -ex mkdir -p /tmp/r-site-library # build R packages in parallel mkdir -p ~/.R/ + build_ccache_wrappers echo "MAKEFLAGS = -j"$(nproc) > ~/.R/Makevars # make -j not supported make rpkg \ @@ -912,29 +953,31 @@ unittest_ubuntu_cpu_julia() { export PATH="$1/bin:$PATH" export MXNET_HOME='/work/mxnet' export JULIA_DEPOT_PATH='/work/julia-depot' - export DEVDIR="$JULIA_DEPOT_PATH/dev" julia -e 'using InteractiveUtils; versioninfo()' - # install package - mkdir -p $DEVDIR - ln -sf ${MXNET_HOME}/julia ${DEVDIR}/MXNet - - # register MXNet.jl and dependencies - julia -e 'using Pkg; Pkg.develop("MXNet")' - # FIXME export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so' export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH # use the prebuilt binary from $MXNET_HOME/lib - julia -e 'using Pkg; Pkg.build("MXNet")' + julia --project=./julia -e 'using Pkg; Pkg.build("MXNet")' # run the script `julia/test/runtests.jl` - julia -e 'using Pkg; Pkg.test("MXNet")' + julia --project=./julia -e 'using Pkg; Pkg.test("MXNet")' # See /~https://github.com/dmlc/MXNet.jl/pull/303#issuecomment-341171774 - julia -e 'using MXNet; mx._sig_checker()' + julia --project=./julia -e 'using MXNet; mx._sig_checker()' +} + +unittest_ubuntu_cpu_julia07() { + set -ex + unittest_ubuntu_cpu_julia /work/julia07 +} + +unittest_ubuntu_cpu_julia10() { + set -ex + unittest_ubuntu_cpu_julia /work/julia10 } unittest_ubuntu_cpu_julia07() { @@ -1239,7 +1282,7 @@ nightly_tutorial_test_ubuntu_python2_gpu() { nightly_java_demo_test_cpu() 
{ set -ex cd /work/mxnet/scala-package/mxnet-demo/java-demo - make java_ci_demo + mvn -B -Pci-nightly install bash bin/java_sample.sh bash bin/run_od.sh } @@ -1247,7 +1290,7 @@ nightly_java_demo_test_cpu() { nightly_scala_demo_test_cpu() { set -ex cd /work/mxnet/scala-package/mxnet-demo/scala-demo - make scala_ci_demo + mvn -B -Pci-nightly install bash bin/demo.sh bash bin/run_im.sh } @@ -1258,7 +1301,9 @@ deploy_docs() { set -ex pushd . - make docs SPHINXOPTS=-W + export CC="ccache gcc" + export CXX="ccache g++" + make docs SPHINXOPTS=-W USE_MKLDNN=0 popd } @@ -1268,10 +1313,8 @@ deploy_jl_docs() { export PATH="/work/julia10/bin:$PATH" export MXNET_HOME='/work/mxnet' export JULIA_DEPOT_PATH='/work/julia-depot' - export DEVDIR="$JULIA_DEPOT_PATH/dev" julia -e 'using InteractiveUtils; versioninfo()' - mkdir -p $DEVDIR # FIXME export LD_PRELOAD='/usr/lib/x86_64-linux-gnu/libjemalloc.so' @@ -1283,6 +1326,24 @@ deploy_jl_docs() { # ... } +build_scala_static_mkl() { + set -ex + pushd . + scala_prepare + export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu + export mxnet_variant=mkl + ./ci/publish/scala/build.sh + popd +} + +build_static_python_mkl() { + set -ex + pushd . + export mxnet_variant=mkl + ./ci/publish/python/build.sh + popd +} + publish_scala_build() { set -ex pushd . diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index e812c4e24..511ff1dcb 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -34,7 +34,7 @@ mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/li mx_cmake_lib_debug = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests' mx_cmake_mkldnn_lib = 'build/libmxnet.so, build/libmxnet.a, build/3rdparty/dmlc-core/libdmlc.a, build/tests/mxnet_unit_tests, build/3rdparty/openmp/runtime/src/libomp.so, build/3rdparty/mkldnn/src/libmkldnn.so.0' mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a' -mx_tensorrt_lib = 'lib/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' +mx_tensorrt_lib = 'build/libmxnet.so, lib/libnvonnxparser_runtime.so.0, lib/libnvonnxparser.so.0, lib/libonnx_proto.so, lib/libonnx.so' mx_lib_cpp_examples = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, build/cpp-package/example/*' mx_lib_cpp_examples_cpu = 'build/libmxnet.so, build/cpp-package/example/*' @@ -521,6 +521,32 @@ def compile_windows_gpu_mkldnn() { }] } +def test_static_scala_cpu() { + return ['Static build CPU 14.04 Scala' : { + node(NODE_LINUX_CPU) { + ws('workspace/ut-publish-scala-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run("publish.ubuntu1404_cpu", 'build_scala_static_mkl', false) + } + } + } + }] +} + +def test_static_python_cpu() { + return ['Static build CPU 14.04 Python' : { + node(NODE_LINUX_CPU) { + ws('workspace/ut-publish-python-cpu') { + timeout(time: max_time, unit: 'MINUTES') { + utils.init_git() + utils.docker_run("publish.ubuntu1404_cpu", 'build_static_python_mkl', false) + } + } + } + }] +} + def test_unix_python2_cpu() { return ['Python2: CPU': { node(NODE_LINUX_CPU) { @@ -874,7 +900,48 @@ def test_unix_clojure_cpu() { }] } +def test_unix_clojure_integration_cpu() { + return ['Clojure: CPU Integration': 
{
    node(NODE_LINUX_CPU) {
      ws('workspace/ut-clojure-integration-cpu') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.unpack_and_init('cpu', mx_lib, true)
          utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_clojure_integration', false)
        }
      }
    }
  }]
}

def test_unix_r_cpu() {
+  return ['R: CPU': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/ut-r-cpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.unpack_and_init('cpu', mx_lib, true)
+          utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_cpu_R', false)
+          utils.publish_test_coverage()
+        }
+      }
+    }
+  }]
+}
+
+def test_unix_r_mkldnn_cpu() {
+  return ['R: MKLDNN-CPU': {
+    node(NODE_LINUX_CPU) {
+      ws('workspace/ut-r-mkldnn-cpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.unpack_and_init('mkldnn_cpu', mx_mkldnn_lib, true)
+          utils.docker_run('ubuntu_cpu', 'unittest_ubuntu_minimal_R', false)
+          utils.publish_test_coverage()
+        }
+      }
+    }
+  }]
+}
+
+def test_unix_perl_cpu() {
  return ['Perl: CPU': {
    node(NODE_LINUX_CPU) {
      ws('workspace/ut-perl-cpu') {
@@ -891,7 +958,7 @@ def test_unix_r_cpu() {

def test_unix_cpp_gpu() {
  return ['Cpp: GPU': {
    node(NODE_LINUX_GPU) {
      ws('workspace/ut-cpp-gpu') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.unpack_and_init('cmake_gpu', mx_cmake_lib, true)
          utils.docker_run('ubuntu_gpu', 'unittest_cpp', true)
@@ -905,7 +972,7 @@
def test_unix_cpp_mkldnn_gpu() {
  return ['Cpp: MKLDNN+GPU': {
    node(NODE_LINUX_GPU) {
      ws('workspace/ut-cpp-mkldnn-gpu') {
        timeout(time: max_time, unit: 'MINUTES') {
          utils.unpack_and_init('cmake_mkldnn_gpu', mx_cmake_mkldnn_lib, true)
          utils.docker_run('ubuntu_gpu', 'unittest_cpp', true)
@@ -930,7 +997,7 @@ def test_unix_cpp_cpu() {
  }]
}

-def test_unix_r_gpu() {
+def test_unix_perl_gpu() {
  return ['Perl: GPU': {
    node(NODE_LINUX_GPU) {
      ws('workspace/ut-perl-gpu') {
@@ -944,6 +1011,20 @@
  }]
}

+def test_unix_r_gpu() {
+  return ['R: GPU': {
+    node(NODE_LINUX_GPU) {
+      ws('workspace/ut-r-gpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.unpack_and_init('gpu', mx_lib, true)
+          utils.docker_run('ubuntu_gpu', 'unittest_ubuntu_gpu_R', true)
+          utils.publish_test_coverage()
+        }
+      }
+    }
+  }]
+}
+
def test_unix_julia07_cpu() {
  return ['Julia 0.7: CPU': {
    node(NODE_LINUX_CPU)
@@ -1156,6 +1237,34 @@ def test_windows_python3_cpu() {
  }]
}

+def test_windows_julia07_cpu() {
+  return ['Julia 0.7: CPU Win': {
+    node(NODE_WINDOWS_CPU) {
+      ws('workspace/ut-julia07-cpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git_win()
+          unstash 'windows_package_cpu'
+          powershell 'ci/windows/test_jl07_cpu.ps1'
+        }
+      }
+    }
+  }]
+}
+
+def test_windows_julia10_cpu() {
+  return ['Julia 1.0: CPU Win': {
+    node(NODE_WINDOWS_CPU) {
+      ws('workspace/ut-julia10-cpu') {
+        timeout(time: max_time, unit: 'MINUTES') {
+          utils.init_git_win()
+          unstash 'windows_package_cpu'
+          powershell 'ci/windows/test_jl10_cpu.ps1'
+        }
+      }
+    }
+  }]
+}
+
def test_qemu_armv7_cpu() {
  return ['ARMv7 QEMU': {
    node(NODE_LINUX_CPU) {
diff --git a/ci/jenkins/Jenkinsfile_unix_cpu b/ci/jenkins/Jenkinsfile_unix_cpu
index ea3c06175..919381ebc 100644
--- a/ci/jenkins/Jenkinsfile_unix_cpu
+++ b/ci/jenkins/Jenkinsfile_unix_cpu
@@ -52,11 +52,16 @@ core_logic: {
        custom_steps.test_unix_scala_cpu(),
        custom_steps.test_unix_scala_mkldnn_cpu(),
        custom_steps.test_unix_clojure_cpu(),
+       custom_steps.test_unix_clojure_integration_cpu(),
+       custom_steps.test_unix_perl_cpu(),
custom_steps.test_unix_r_cpu(), + custom_steps.test_unix_r_mkldnn_cpu(), custom_steps.test_unix_julia07_cpu(), custom_steps.test_unix_julia10_cpu(), custom_steps.test_unix_onnx_cpu(), custom_steps.test_unix_cpp_cpu(), + custom_steps.test_static_scala_cpu(), + custom_steps.test_static_python_cpu(), /* Disabled due to master build failure: * http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/incubator-mxnet/detail/master/1221/pipeline/ * /~https://github.com/apache/incubator-mxnet/issues/11801 diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index bd884904d..664e591ab 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -51,6 +51,7 @@ core_logic: { custom_steps.test_unix_python3_mkldnn_gpu(), custom_steps.test_unix_python3_mkldnn_nocudnn_gpu(), custom_steps.test_unix_python3_tensorrt_gpu(), + custom_steps.test_unix_perl_gpu(), custom_steps.test_unix_r_gpu(), custom_steps.test_unix_cpp_gpu(), custom_steps.test_unix_cpp_mkldnn_gpu(), diff --git a/ci/jenkins/Jenkinsfile_windows_cpu b/ci/jenkins/Jenkinsfile_windows_cpu index a8746db73..5bc40d625 100644 --- a/ci/jenkins/Jenkinsfile_windows_cpu +++ b/ci/jenkins/Jenkinsfile_windows_cpu @@ -35,12 +35,14 @@ utils.main_wrapper( core_logic: { utils.parallel_stage('Build', [ custom_steps.compile_windows_cpu() - ]) + ]) utils.parallel_stage('Tests', [ custom_steps.test_windows_python2_cpu(), - custom_steps.test_windows_python3_cpu() - ]) + custom_steps.test_windows_python3_cpu(), + custom_steps.test_windows_julia07_cpu(), + custom_steps.test_windows_julia10_cpu() + ]) } , failure_handler: { diff --git a/ci/publish/Jenkinsfile b/ci/publish/Jenkinsfile index 9a360c6b5..2b91f4f74 100644 --- a/ci/publish/Jenkinsfile +++ b/ci/publish/Jenkinsfile @@ -21,7 +21,7 @@ // See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/ //mxnet libraries -mx_scala_pub = 'lib/libmxnet.so, lib/libmxnet.a, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, config.mk, scala-package/pom.xml, scala-package/**/pom.xml, scala-package/*/target/**, scala-package/local-snapshot/**' +mx_scala_pub = 'lib/**, 3rdparty/dmlc-core/libdmlc.a, 3rdparty/tvm/nnvm/lib/libnnvm.a, 3rdparty/ps-lite/build/libps.a, deps/lib/libprotobuf-lite.a, deps/lib/libzmq.a, config.mk, scala-package/pom.xml, scala-package/**/pom.xml, scala-package/**/target/**, scala-package/*/target/repo/**' // timeout in minutes max_time = 120 @@ -34,8 +34,9 @@ node('restricted-utility') { utils.assign_node_labels(utility: 'restricted-utility', linux_cpu: 'restricted-mxnetlinux-cpu', linux_gpu: 'restricted-mxnetlinux-gpu', linux_gpu_p3: 'restricted-mxnetlinux-gpu-p3', windows_cpu: 'restricted-mxnetwindows-cpu', windows_gpu: 'restricted-mxnetwindows-gpu') // CPU and GPU. 
OSX nodes are not currently supported by Jenkins -def nodeMap = ['cpu': NODE_LINUX_CPU, 'gpu': NODE_LINUX_GPU] +def nodeMap = ['cpu': NODE_LINUX_CPU, 'gpu': NODE_LINUX_GPU_P3] def scalaOSMap = ['cpu': 'linux-x86_64-cpu', 'gpu': 'linux-x86_64-gpu'] +def scalaVariantMap = ['cpu': 'mkl', 'gpu': 'cu92mkl'] def wrapStep(nodeToRun, workspaceName, step) { return { @@ -50,13 +51,13 @@ def wrapStep(nodeToRun, workspaceName, step) { } def toBuild = [:] -def labels = ['cpu'] // , 'gpu'] +def labels = ['cpu', 'gpu'] for (x in labels) { def label = x // Required due to language - toBuild["Scala Build ${label}"] = wrapStep(nodeMap[label], "build-scala-${label}") { - withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}"]) { + toBuild["Scala Build ${label}"] = wrapStep(nodeMap['cpu'], "build-scala-${label}") { + withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}", "mxnet_variant=${scalaVariantMap[label]}"]) { utils.init_git() - utils.docker_run("ubuntu_${label}", 'publish_scala_build', label == 'gpu', '500m', 'MAVEN_PUBLISH_OS_TYPE') + utils.docker_run("publish.ubuntu1404_cpu", 'publish_scala_build', false, '500m', 'MAVEN_PUBLISH_OS_TYPE mxnet_variant') utils.pack_lib("scala_${label}", mx_scala_pub, false) } } @@ -69,8 +70,10 @@ for (x in labels) { for (y in systems) { def system = y // Required due to language toTest["Scala Test ${system} ${label}"] = wrapStep(nodeMap[label], "test-scala-${system}-${label}") { - utils.unpack_and_init("scala_${label}", mx_scala_pub, false) - utils.docker_run("publish.test.${system}_${label}", 'publish_scala_test', label == 'gpu') + withEnv(["mxnet_variant=${scalaVariantMap[label]}"]) { + utils.unpack_and_init("scala_${label}", mx_scala_pub, false) + utils.docker_run("publish.test.${system}_${label}", 'publish_scala_test', label == 'gpu', '500m', 'mxnet_variant') + } } } } @@ -78,10 +81,10 @@ for (x in labels) { def toDeploy = [:] for (x in labels) { def label = x // Required due to language - toDeploy["Scala Deploy ${label}"] = wrapStep(nodeMap[label], "deploy-scala-${label}") { - withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}"]) { + toDeploy["Scala Deploy ${label}"] = wrapStep(nodeMap['cpu'], "deploy-scala-${label}") { + withEnv(["MAVEN_PUBLISH_OS_TYPE=${scalaOSMap[label]}", "mxnet_variant=${scalaVariantMap[label]}"]) { utils.unpack_and_init("scala_${label}", mx_scala_pub, false) - utils.docker_run("ubuntu_${label}", 'publish_scala_deploy', label == 'gpu', '500m', 'MAVEN_PUBLISH_OS_TYPE MAVEN_PUBLISH_SECRET_ENDPOINT_URL MAVEN_PUBLISH_SECRET_NAME_CREDENTIALS MAVEN_PUBLISH_SECRET_NAME_GPG DOCKERHUB_SECRET_ENDPOINT_REGION') + utils.docker_run("publish.ubuntu1604_${label}", 'publish_scala_deploy', false, '500m', 'MAVEN_PUBLISH_OS_TYPE MAVEN_PUBLISH_SECRET_ENDPOINT_URL MAVEN_PUBLISH_SECRET_NAME_CREDENTIALS MAVEN_PUBLISH_SECRET_NAME_GPG DOCKERHUB_SECRET_ENDPOINT_REGION mxnet_variant') } } } @@ -101,7 +104,7 @@ core_logic: { , failure_handler: { if (currentBuild.result == "FAILURE") { - // emailext body: 'Generating the nightly maven has failed. Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY MAVEN FAILED] Build ${BUILD_NUMBER}', to: '${EMAIL}' + emailext body: 'Generating the nightly maven has failed. 
Please view the build at ${BUILD_URL}', replyTo: '${EMAIL}', subject: '[NIGHTLY MAVEN FAILED] Build ${BUILD_NUMBER}', to: '${EMAIL}'
  }
}
)
diff --git a/ci/publish/README.md b/ci/publish/README.md
new file mode 100644
index 000000000..f1ece6f84
--- /dev/null
+++ b/ci/publish/README.md
@@ -0,0 +1,33 @@
+# MXNet Publish Settings
+
+This folder contains the configuration for the restricted Jenkins nodes that publish MXNet artifacts. It also contains a folder called `scala` that contains everything required for publishing to Maven. In this `README`, we provide a brief walkthrough of the Jenkins configuration as well as the usage of the Scala deployment files. Python publishing is TBD.
+
+## Jenkins
+Currently, Jenkins contains three stages, namely `Build Packages`, `Test Packages` and `Deploy Packages`. During the `Build Packages` stage, all dependencies are built and a Scala package is created. In the second stage, the package created in the first stage is tested. In the final stage, the packages that pass the tests are deployed.
+
+The job is scheduled to be triggered every 24 hours on a [restricted instance](http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/restricted-publish-artifacts).
+
+Currently, we support tests on the following systems:
+
+- Ubuntu 16.04
+- Ubuntu 18.04
+- CentOS 7
+
+All packages are currently built on `Ubuntu 14.04`. All Dockerfiles used for publishing are available in `ci/docker/` with the prefix `Dockerfile.publish`.
+
+Apart from that, the scripts used to create the environment and to publish are available under `ci/docker/install`:
+
+- `ubuntu_publish.sh` installs all dependencies required for publishing on Ubuntu 14.04
+- `ubuntu_base.sh` installs the minimum dependencies required to run the published packages
+
+## Scala publishing
+Currently, Scala publishing on Linux is fully supported on Jenkins. The `scala/` folder contains all files needed for publishing. Here is a brief introduction to the files:
+
+- `build.sh` Main executable to build the backend as well as the Scala package
+- `buildkey.py` Main file used to extract passwords from the system and configure Maven
+- `deploy.sh` Script to deploy the package
+- `fullDeploy.sh` Used by CI to run the full publish
+- `test.sh` Runs the Scala tests on CI
+
+## Python publishing
+Python build support is TBD.
diff --git a/ci/publish/python/build.sh b/ci/publish/python/build.sh
new file mode 100755
index 000000000..61549896c
--- /dev/null
+++ b/ci/publish/python/build.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
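+#
+# Usage sketch (run from the repository root, as CI does via
+# build_static_python_mkl in ci/docker/runtime_functions.sh):
+#
+#   export mxnet_variant=mkl
+#   ci/publish/python/build.sh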
+# + +set -ex + +source tools/staticbuild/build.sh $mxnet_variant pip + +set -ex + +# Build the Python wheel +source tools/staticbuild/build_wheel.sh
diff --git a/ci/publish/scala/build.sh b/ci/publish/scala/build.sh index 17f969afe..11386ce21 100755 --- a/ci/publish/scala/build.sh +++ b/ci/publish/scala/build.sh @@ -22,7 +22,9 @@ set -ex # MAVEN_PUBLISH_OS_TYPE: linux-x86_64-cpu|linux-x86_64-gpu|osx-x86_64-cpu # export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu -bash scala-package/dev/compile-mxnet-backend.sh $MAVEN_PUBLISH_OS_TYPE ./ +source tools/staticbuild/build.sh $mxnet_variant maven + +set -ex # Compile tests for discovery later cd scala-package
diff --git a/ci/publish/scala/deploy.sh b/ci/publish/scala/deploy.sh index 4eb33907e..00c672147 100755 --- a/ci/publish/scala/deploy.sh +++ b/ci/publish/scala/deploy.sh @@ -18,12 +18,10 @@ set -ex -# Setup Environment Variables -# MAVEN_PUBLISH_OS_TYPE: linux-x86_64-cpu|linux-x86_64-gpu|osx-x86_64-cpu -# export MAVEN_PUBLISH_OS_TYPE=linux-x86_64-cpu - -# Run python to configure keys -python3 ci/publish/scala/buildkey.py +# On Jenkins, run python script to configure keys +if [[ $BUILD_ID ]]; then + python3 ci/publish/scala/buildkey.py +fi # Updating cache mkdir -p ~/.gnupg @@ -37,5 +35,7 @@ cd scala-package mvn -B deploy -Pnightly -# Clear all password .xml files, exp files, and gpg key files -rm -rf ~/.m2/*.xml ~/.m2/key.asc ~/.m2/*.exp +# On Jenkins, clear all password .xml files, exp files, and gpg key files +if [[ $BUILD_ID ]]; then + rm -rf ~/.m2/*.xml ~/.m2/key.asc ~/.m2/*.exp +fi
diff --git a/ci/publish/scala/test.sh b/ci/publish/scala/test.sh index 5cef35ca3..32e36299a 100755 --- a/ci/publish/scala/test.sh +++ b/ci/publish/scala/test.sh @@ -24,5 +24,10 @@ fi # Test cd scala-package/packageTest -# make testlocal CI=1 -make testsnapshot UNIT=1 CI=1 + +if [[ $mxnet_variant == cu* ]]; then + export SCALA_TEST_ON_GPU=1 + make testlocal USE_CUDA=1 CI=1 +else + make testlocal CI=1 +fi
diff --git a/ci/windows/test_jl07_cpu.ps1 b/ci/windows/test_jl07_cpu.ps1 new file mode 100644 index 000000000..6cd34ef20 --- /dev/null +++ b/ci/windows/test_jl07_cpu.ps1 @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+ +7z x -y windows_package.7z + +# set default output encoding to utf8 +$PSDefaultParameterValues['Out-File:Encoding'] = 'utf8' + +$env:MXNET_HOME = [System.IO.Path]::GetFullPath('.\windows_package') +$env:JULIA_URL = "https://julialang-s3.julialang.org/bin/winnt/x64/0.7/julia-0.7.0-win64.exe" +$env:JULIA_DEPOT_PATH = [System.IO.Path]::GetFullPath('.\julia-depot') + +$JULIA_DIR = [System.IO.Path]::GetFullPath('.\julia07') +$JULIA = "$JULIA_DIR\bin\julia" + +# Download the pinned Julia Windows binary +[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 +(New-Object System.Net.WebClient).DownloadFile($env:JULIA_URL, "julia-binary.exe") +if (! $?) { Throw ("Error on downloading Julia Windows binary") } + +# Run the installer silently, installing into $JULIA_DIR +Start-Process -Wait "julia-binary.exe" -ArgumentList "/S /D=$JULIA_DIR" +if (! $?) { Throw ("Error on installing Julia") } + +& $JULIA -e "using InteractiveUtils; versioninfo()" + +dir + +$src=' + using Pkg + Pkg.activate(".\\julia") + Pkg.build() + Pkg.test() +' + +$src > .\ci-build.jl + +# Redirect all stderr output to stdout, +# since Julia loggers output stuff to stderr. +# Then, stderr triggers powershell NativeCommandError. +& $JULIA .\ci-build.jl 2>&1 | %{ "$_" } +if ($LastExitCode -eq 1) { Throw ("Error") }
diff --git a/ci/windows/test_jl10_cpu.ps1 b/ci/windows/test_jl10_cpu.ps1 new file mode 100644 index 000000000..96c419066 --- /dev/null +++ b/ci/windows/test_jl10_cpu.ps1 @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +7z x -y windows_package.7z + +# set default output encoding to utf8 +$PSDefaultParameterValues['Out-File:Encoding'] = 'utf8' + +$env:MXNET_HOME = [System.IO.Path]::GetFullPath('.\windows_package') +$env:JULIA_URL = "https://julialang-s3.julialang.org/bin/winnt/x64/1.0/julia-1.0.3-win64.exe" +$env:JULIA_DEPOT_PATH = [System.IO.Path]::GetFullPath('.\julia-depot') + +$JULIA_DIR = [System.IO.Path]::GetFullPath('.\julia10') +$JULIA = "$JULIA_DIR\bin\julia" + +# Download the pinned Julia Windows binary +[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12 +(New-Object System.Net.WebClient).DownloadFile($env:JULIA_URL, "julia-binary.exe") +if (! $?) { Throw ("Error on downloading Julia Windows binary") } + +# Run the installer silently, installing into $JULIA_DIR +Start-Process -Wait "julia-binary.exe" -ArgumentList "/S /D=$JULIA_DIR" +if (! $?) { Throw ("Error on installing Julia") } + +& $JULIA -e "using InteractiveUtils; versioninfo()" + +dir + +$src=' + using Pkg + Pkg.activate(".\\julia") + Pkg.build() + Pkg.test() +' + +$src > .\ci-build.jl + +# Redirect all stderr output to stdout, +# since Julia loggers output stuff to stderr.
+# Then, stderr triggers powershell NativeCommandError. +& $JULIA .\ci-build.jl 2>&1 | %{ "$_" } +if ($LastExitCode -eq 1) { Throw ("Error") } diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake index 13d7083f3..5f4af2d89 100644 --- a/cmake/ChooseBlas.cmake +++ b/cmake/ChooseBlas.cmake @@ -37,22 +37,26 @@ if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) add_definitions(-DMSHADOW_USE_CBLAS=1) add_definitions(-DMSHADOW_USE_MKL=0) + add_definitions(-DMXNET_USE_BLAS_ATLAS=1) elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") find_package(OpenBLAS REQUIRED) include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) add_definitions(-DMSHADOW_USE_CBLAS=1) add_definitions(-DMSHADOW_USE_MKL=0) + add_definitions(-DMXNET_USE_BLAS_OPEN=1) elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") find_package(MKL REQUIRED) include_directories(SYSTEM ${MKL_INCLUDE_DIR}) list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) add_definitions(-DMSHADOW_USE_CBLAS=0) add_definitions(-DMSHADOW_USE_MKL=1) + add_definitions(-DMXNET_USE_BLAS_MKL=1) elseif(BLAS STREQUAL "apple") find_package(Accelerate REQUIRED) include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) add_definitions(-DMSHADOW_USE_MKL=0) add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMXNET_USE_BLAS_APPLE=1) endif() diff --git a/contrib/clojure-package/README.md b/contrib/clojure-package/README.md index ba6160aed..f06ad3653 100644 --- a/contrib/clojure-package/README.md +++ b/contrib/clojure-package/README.md @@ -142,22 +142,15 @@ With this option, you will install a Git revision of the Clojure package source To run examples, you can now use `lein run` in any of the example directories, e.g., `examples/imclassification`. You can also specify the compute device, e.g., `lein run :cpu 2` (for 2 CPUs) or `lein run :gpu` (for 1 GPU). -#### Experimental: Using Scala Snapshot Jars -**Note:** Instead of a release tag, you can also use a development version of the Clojure package, e.g., Git `master`, together with the prebuilt Scala jar. There is a repo of nightly built snapshots of Scala jars. You can use them in your `project.clj` by adding a repository: +#### Using Scala Nightly Snapshot Jars +**Note:** Instead of a release tag, you can also use a development version of the Clojure package, e.g., Git `master`, together with the prebuilt Scala jar. There is a repo of nightly built snapshots of Scala jars. You can use them in your `project.clj` by adding them as a dependency: -``` -["snapshots" {:url "https://repository.apache.org/content/repositories/snapshots" - :snapshots true - :sign-releases false - :checksum :fail - :update :always - :releases {:checksum :fail :update :always}}] -``` Then you should be able to run with your dependency: - [org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "latest-version-SNAPSHOT"] + [org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu ""] +In particular, ignore all of the language-interface-specific sections. In that case, however, breakage can happen at any point, for instance when the Scala development version adds, changes or removes an interface and the Clojure development version moves along. If you really need the most recent version, you should consider [installation option 3](#option-3-everything-from-source). 
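For readers who want to see the snapshot wiring end to end, here is a minimal `project.clj` sketch for pulling a nightly Scala jar. It is illustrative only: the project name is made up, the version string is a placeholder (the README above deliberately leaves the version blank), and the `:repositories` entry simply mirrors the Apache snapshots repository shown in the removed block.

```clojure
;; Minimal, illustrative project.clj for consuming a nightly Scala snapshot jar.
;; The version below is a placeholder -- substitute the current nightly snapshot
;; version. The :repositories entry mirrors the Apache snapshots repository
;; referenced by the older instructions above.
(defproject my-mxnet-app "0.1.0-SNAPSHOT"
  :repositories [["snapshots" {:url "https://repository.apache.org/content/repositories/snapshots"
                               :snapshots true
                               :update :always}]]
  :dependencies [[org.clojure/clojure "1.9.0"]
                 [org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "<nightly-version>-SNAPSHOT"]])
```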
diff --git a/contrib/clojure-package/examples/captcha/get_data.sh b/contrib/clojure-package/examples/captcha/get_data.sh old mode 100755 new mode 100644 diff --git a/contrib/clojure-package/examples/gan/.gitignore b/contrib/clojure-package/examples/gan/.gitignore index c53038ec0..ea8013148 100644 --- a/contrib/clojure-package/examples/gan/.gitignore +++ b/contrib/clojure-package/examples/gan/.gitignore @@ -9,3 +9,4 @@ pom.xml.asc /.nrepl-port .hgignore .hg/ +results \ No newline at end of file diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj index a326f7a56..439398783 100644 --- a/contrib/clojure-package/examples/gan/project.clj +++ b/contrib/clojure-package/examples/gan/project.clj @@ -15,11 +15,12 @@ ;; limitations under the License. ;; -(defproject gan "0.1.0-SNAPSHOT" +(defproject gan-origami "0.1.0-SNAPSHOT" :description "GAN MNIST with MXNet" :plugins [[lein-cljfmt "0.5.7"]] + :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] :dependencies [[org.clojure/clojure "1.9.0"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] - [org.openpnp/opencv "3.4.2-1"] + [origami "4.0.0-3"] ] - :main gan.gan-mnist) + :main gan.gan-mnist) \ No newline at end of file diff --git a/contrib/clojure-package/examples/gan/src/gan/viz.clj b/contrib/clojure-package/examples/gan/src/gan/viz.clj index 67f78806d..08da53cb2 100644 --- a/contrib/clojure-package/examples/gan/src/gan/viz.clj +++ b/contrib/clojure-package/examples/gan/src/gan/viz.clj @@ -18,14 +18,9 @@ (ns gan.viz (:require [org.apache.clojure-mxnet.ndarray :as ndarray] [org.apache.clojure-mxnet.shape :as mx-shape] - [org.apache.clojure-mxnet.io :as mx-io]) - (:import (nu.pattern OpenCV) - (org.opencv.core Core CvType Mat Size) - (org.opencv.imgproc Imgproc) - (org.opencv.imgcodecs Imgcodecs))) - -;;; Viz stuff -(OpenCV/loadShared) + [org.apache.clojure-mxnet.io :as mx-io] + [opencv4.utils :as cvu] + [opencv4.core :as cv :refer [CV_8UC1 new-matofbyte flip! imwrite new-size hconcat! vconcat! new-mat merge!]])) (defn clip [x] (->> x @@ -37,29 +32,11 @@ (mapv #(.byteValue %)))) (defn get-img [raw-data channels height width flip] - (let [totals (* height width) - img (if (> channels 1) - ;; rgb image - (let [[ra ga ba] (byte-array (partition totals raw-data)) - rr (new Mat height width (CvType/CV_8U)) - gg (new Mat height width (CvType/CV_8U)) - bb (new Mat height width (CvType/CV_8U)) - result (new Mat)] - (.put rr (int 0) (int 0) ra) - (.put gg (int 0) (int 0) ga) - (.put bb (int 0) (int 0) ba) - (Core/merge (java.util.ArrayList. [bb gg rr]) result) - result) + (let [img (if (> channels 1) + (throw (Exception. "Image with 3 channels (RGB) not supported")) ;; gray image - (let [result (new Mat height width (CvType/CV_8U)) - _ (.put result (int 0) (int 0) (byte-array raw-data))] - result))] - (do - (if flip - (let [result (new Mat) - _ (Core/flip img result (int 0))] - result) - img)))) + (cv/>> (new-mat height width CV_8UC1) (byte-array raw-data)))] + (if flip (flip! img 0) img))) (defn im-sav [{:keys [title output-path x flip] :or {flip false} :as g-mod}] @@ -73,15 +50,10 @@ line-arrs (into [] (partition (* col c totals) raw-data)) line-mats (mapv (fn [line] (let [img-arr (into [] (partition (* c totals) line)) - col-mats (new Mat) - src (mapv (fn [arr] (get-img (into [] arr) c h w flip)) img-arr) - _ (Core/hconcat (java.util.ArrayList. 
src) col-mats)] - col-mats)) - line-arrs) - result (new Mat) - resized-img (new Mat) - _ (Core/vconcat (java.util.ArrayList. line-mats) result)] - (do - (Imgproc/resize result resized-img (new Size (* (.width result) 1.5) (* (.height result) 1.5))) - (Imgcodecs/imwrite (str output-path title ".jpg") resized-img) - (Thread/sleep 1000)))) + src (mapv (fn [arr] (get-img (into [] arr) c h w flip)) img-arr)] + (hconcat! src))) + line-arrs)] + (-> line-mats + (vconcat!) + (cvu/resize-by 1.5) + (imwrite (str output-path title ".jpg"))))) \ No newline at end of file diff --git a/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj b/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj index e61e9ebf6..164b5f262 100644 --- a/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj +++ b/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj @@ -25,7 +25,8 @@ [org.apache.clojure-mxnet.kvstore :as kvstore] [org.apache.clojure-mxnet.kvstore-server :as kvstore-server] [org.apache.clojure-mxnet.optimizer :as optimizer] - [org.apache.clojure-mxnet.eval-metric :as eval-metric]) + [org.apache.clojure-mxnet.eval-metric :as eval-metric] + [org.apache.clojure-mxnet.resource-scope :as resource-scope]) (:gen-class)) (def data-dir "data/") ;; the data directory to store the mnist data @@ -51,28 +52,6 @@ (when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte"))) (sh "../../scripts/get_mnist_data.sh")) -;;; Load the MNIST datasets -(defonce train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte") - :label (str data-dir "train-labels-idx1-ubyte") - :label-name "softmax_label" - :input-shape [784] - :batch-size batch-size - :shuffle true - :flat true - :silent false - :seed 10 - :num-parts num-workers - :part-index 0})) - -(defonce test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte") - :label (str data-dir "t10k-labels-idx1-ubyte") - :input-shape [784] - :batch-size batch-size - :flat true - :silent false - :num-parts num-workers - :part-index 0})) - (defn get-symbol [] (as-> (sym/variable "data") data (sym/fully-connected "fc1" {:data data :num-hidden 128}) @@ -82,7 +61,31 @@ (sym/fully-connected "fc3" {:data data :num-hidden 10}) (sym/softmax-output "softmax" {:data data}))) -(defn start + +(defn train-data [] + (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte") + :label (str data-dir "train-labels-idx1-ubyte") + :label-name "softmax_label" + :input-shape [784] + :batch-size batch-size + :shuffle true + :flat true + :silent false + :seed 10 + :num-parts num-workers + :part-index 0})) + +(defn eval-data [] + (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte") + :label (str data-dir "t10k-labels-idx1-ubyte") + :input-shape [784] + :batch-size batch-size + :flat true + :silent false + :num-parts num-workers + :part-index 0})) + +(defn start ([devs] (start devs num-epoch)) ([devs _num-epoch] (when scheduler-host @@ -96,18 +99,16 @@ (do (println "Starting Training of MNIST ....") (println "Running with context devices of" devs) - (let [_mod (m/module (get-symbol) {:contexts devs})] - (m/fit _mod {:train-data train-data - :eval-data test-data + (resource-scope/with-let [_mod (m/module (get-symbol) {:contexts devs})] + (-> _mod + (m/fit {:train-data (train-data) + :eval-data (eval-data) :num-epoch _num-epoch :fit-params (m/fit-params {:kvstore kvstore :optimizer optimizer :eval-metric eval-metric})}) - 
(println "Finish fit") - _mod - ) - - )))) + (m/save-checkpoint {:prefix "target/test" :epoch _num-epoch})) + (println "Finish fit")))))) (defn -main [& args] (let [[dev dev-num] args diff --git a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj b/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj index 2ebefc2fc..f185891ab 100644 --- a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj +++ b/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj @@ -16,7 +16,7 @@ ;; (ns imclassification.train-mnist-test - (:require + (:require [clojure.test :refer :all] [clojure.java.io :as io] [clojure.string :as s] @@ -26,14 +26,15 @@ (defn- file-to-filtered-seq [file] (->> - file + file (io/file) (io/reader) (line-seq) (filter #(not (s/includes? % "mxnet_version"))))) (deftest mnist-two-epochs-test - (module/save-checkpoint (mnist/start [(context/cpu)] 2) {:prefix "target/test" :epoch 2}) - (is (= - (file-to-filtered-seq "test/test-symbol.json.ref") - (file-to-filtered-seq "target/test-symbol.json")))) \ No newline at end of file + (do + (mnist/start [(context/cpu)] 2) + (is (= + (file-to-filtered-seq "test/test-symbol.json.ref") + (file-to-filtered-seq "target/test-symbol.json"))))) diff --git a/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj b/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj index 6994b4fad..bc8b82e1e 100644 --- a/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj +++ b/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj @@ -74,6 +74,8 @@ image-file-batches (->> input-dir io/file file-seq + sort + reverse (filter #(.isFile %)) (filter #(re-matches #".*\.jpg$" (.getPath %))) (mapv #(.getPath %)) diff --git a/contrib/clojure-package/examples/infer/objectdetector/.gitignore b/contrib/clojure-package/examples/infer/objectdetector/.gitignore index 35491f1a0..a1f046803 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/.gitignore +++ b/contrib/clojure-package/examples/infer/objectdetector/.gitignore @@ -10,3 +10,4 @@ pom.xml.asc /.nrepl-port .hgignore .hg/ +results \ No newline at end of file diff --git a/contrib/clojure-package/examples/infer/objectdetector/README.md b/contrib/clojure-package/examples/infer/objectdetector/README.md index 921c53e04..ec092a296 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/README.md +++ b/contrib/clojure-package/examples/infer/objectdetector/README.md @@ -16,6 +16,9 @@ $ ./scripts/get_ssd_data.sh $ $ lein run -- --help $ lein run -- -m models/resnet50_ssd/resnet50_ssd_model -i images/dog.jpg -d images/ +$ +$ # or the available lein alias +$ lein run-detector $ $ lein uberjar $ java -jar target/objectdetector-0.1.0-SNAPSHOT-standalone.jar --help diff --git a/contrib/clojure-package/examples/infer/objectdetector/images/marcel.jpg b/contrib/clojure-package/examples/infer/objectdetector/images/marcel.jpg new file mode 100644 index 000000000..1bf7387e0 Binary files /dev/null and b/contrib/clojure-package/examples/infer/objectdetector/images/marcel.jpg differ diff --git a/contrib/clojure-package/examples/infer/objectdetector/project.clj b/contrib/clojure-package/examples/infer/objectdetector/project.clj index 4501f14a3..cdd9a8991 100644 --- 
a/contrib/clojure-package/examples/infer/objectdetector/project.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/project.clj @@ -17,9 +17,12 @@ (defproject objectdetector "0.1.0-SNAPSHOT" :description "Object detection using infer with MXNet" + :repositories [["vendredi" "https://repository.hellonico.info/repository/hellonico/"]] :plugins [[lein-cljfmt "0.5.7"]] + :aliases {"run-detector" ["run" "--" "-m" "models/resnet50_ssd/resnet50_ssd_model" "-i" "images/dog.jpg" "-d" "images/"]} :dependencies [[org.clojure/clojure "1.9.0"] [org.clojure/tools.cli "0.4.1"] + [origami "4.0.0-3"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"]] :main ^:skip-aot infer.objectdetector-example :profiles {:uberjar {:aot :all}})
diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj new file mode 100644 index 000000000..d29b34b5c --- /dev/null +++ b/contrib/clojure-package/examples/infer/objectdetector/src/infer/draw.clj @@ -0,0 +1,44 @@ +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns infer.draw + (:require + [opencv4.colors.rgb :as rgb] + [opencv4.core :refer [FONT_HERSHEY_PLAIN imread imwrite new-point put-text! rectangle]])) + +(defn black-boxes! [img results] + (doseq [{confidence :confidence label :label top-left :top-left bottom-right :bottom-right} results] + (let [w (.width img) + h (.height img) + top-left-p (new-point (int (* w (first top-left))) (int (* h (second top-left)))) + bottom-right-p (new-point (int (* w (first bottom-right))) (int (* h (second bottom-right))))] + (if (< 15 confidence) + (do + (rectangle img top-left-p bottom-right-p rgb/white 1) + (put-text! img + (str label "[" confidence "% ]") + top-left-p + FONT_HERSHEY_PLAIN + 1.0 + rgb/white 1))))) + img) + +(defn draw-bounds [image results output-dir] + (let [out-file (str output-dir "/" (.getName (clojure.java.io/as-file image)))] + (-> image + (imread) + (black-boxes! results) + (imwrite out-file)))) \ No newline at end of file
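As a usage illustration of the new `infer.draw` namespace, the sketch below feeds `draw-bounds` one result map shaped like the output of `result->map` in `objectdetector_example.clj`. The file paths and detection values are invented for the example.

```clojure
;; Hypothetical REPL session exercising infer.draw/draw-bounds.
;; The detection values are made up; the map keys match what result->map
;; produces: a :label, an integer :confidence percentage, and corner
;; coordinates normalized to [0, 1].
(require '[infer.draw :as draw])

(draw/draw-bounds
  "images/dog.jpg"                 ; input image to annotate
  [{:label        "dog"
    :confidence   87               ; only results with confidence above 15 are drawn
    :top-left     [0.16 0.35]
    :bottom-right [0.42 0.90]}]
  "results")                       ; annotated copy is written to results/dog.jpg
```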
diff --git a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj index 5c30e5db6..9331798b0 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/src/infer/objectdetector_example.clj @@ -20,6 +20,7 @@ [org.apache.clojure-mxnet.infer :as infer] [org.apache.clojure-mxnet.layout :as layout] [clojure.java.io :as io] + [infer.draw :as draw] [clojure.string :refer [join]] [clojure.tools.cli :refer [parse-opts]]) (:gen-class)) @@ -45,54 +46,81 @@ ["-i" "--input-image IMAGE" "Input image" :default "images/dog.jpg" :validate [check-valid-file "Input file not found"]] + ["-o" "--output-dir IMAGE_DIR" "Output directory. Defaults to results" + :default "results/" + :validate [check-valid-dir "Output directory not found"]] ["-d" "--input-dir IMAGE_DIR" "Input directory" :default "images/" :validate [check-valid-dir "Input directory not found"]] ["-h" "--help"]]) -(defn print-predictions - "Print image detector predictions for the given input file" - [predictions width height] - (println (apply str (repeat 80 "="))) - (doseq [{:keys [class prob x-min y-min x-max y-max]} predictions] - (println (format - "Class: %s Prob=%.5f Coords=(%.3f, %.3f, %.3f, %.3f)" - class - prob - (* x-min width) - (* y-min height) - (* x-max width) - (* y-max height)))) - (println (apply str (repeat 80 "=")))) +(defn result->map [{:keys [class prob x-min y-min x-max y-max]}] + (hash-map + :label class + :confidence (int (* 100 prob)) + :top-left [x-min y-min] + :bottom-right [x-max y-max])) + +(defn print-results [results] + (doseq [_r results] + (println (format "Class: %s Confidence=%s Coords=(%s, %s)" + (_r :label) + (_r :confidence) + (_r :top-left) + (_r :bottom-right))))) + +(defn process-results [images results output-dir] + (dotimes [i (count images)] + (let [image (nth images i) _results (map result->map (nth results i))] + (println "processing: " image) + (print-results _results) + (draw/draw-bounds image _results output-dir)))) (defn detect-single-image "Detect objects in a single image and print top-5 predictions" - [detector input-image] + ([detector input-dir] (detect-single-image detector input-dir "results")) + ([detector input-image output-dir] + (.mkdir (io/file output-dir)) (let [image (infer/load-image-from-file input-image) topk 5 - [predictions] (infer/detect-objects detector image topk)] - predictions)) + res (infer/detect-objects detector image topk) + ] + (process-results + [input-image] + res + output-dir) + (first res) + ))) (defn detect-images-in-dir "Detect objects in all jpg images in the directory" - [detector input-dir] + ([detector input-dir] (detect-images-in-dir detector input-dir "results")) + ([detector input-dir output-dir] + (.mkdir (io/file output-dir)) (let [batch-size 20 image-file-batches (->> input-dir io/file file-seq + sort (filter #(.isFile %)) (filter #(re-matches #".*\.jpg$" (.getPath %))) (mapv #(.getPath %)) (partition-all batch-size))] - (apply concat (for [image-files image-file-batches] - (let [image-batch (infer/load-image-paths image-files) - topk 5] - (infer/detect-objects-batch detector image-batch topk)))))) + (apply concat + (for [image-files image-file-batches] + (let [image-batch (infer/load-image-paths image-files) + topk 5 + res (infer/detect-objects-batch detector
image-batch topk) ] + (process-results + image-files + res + output-dir) + res)))))) (defn run-detector "Runs an image detector based on options provided" [options] - (let [{:keys [model-path-prefix input-image input-dir + (let [{:keys [model-path-prefix input-image input-dir output-dir device device-id]} options width 512 height 512 descriptors [{:name "data" @@ -103,12 +131,11 @@ detector (infer/create-object-detector factory {:contexts [(context/default-context)]})] + (println "Output results to:" output-dir ":") (println "Object detection on a single image") - (print-predictions (detect-single-image detector input-image) width height) - (println "\n") + (detect-single-image detector input-image output-dir) (println "Object detection on images in a directory") - (doseq [predictions (detect-images-in-dir detector input-dir)] - (print-predictions predictions width height)))) + (detect-images-in-dir detector input-dir output-dir))) (defn -main [& args] diff --git a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj index 2b8ad951a..696d96b3a 100644 --- a/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj +++ b/contrib/clojure-package/examples/infer/objectdetector/test/infer/objectdetector_example_test.clj @@ -45,6 +45,7 @@ (let [detector (create-detector) predictions (detect-single-image detector image-file) {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] + (clojure.pprint/pprint predictions) (is (some? predictions)) (is (= 5 (count predictions))) (is (string? class)) @@ -55,11 +56,13 @@ (deftest test-batch-detection (let [detector (create-detector) batch-predictions (detect-images-in-dir detector image-dir) + _ (clojure.pprint/pprint batch-predictions) predictions (first batch-predictions) {:keys [class prob x-min x-max y-min y-max] :as pred} (first predictions)] (is (some? batch-predictions)) (is (= 5 (count predictions))) (is (string? class)) (is (< 0.8 prob)) + (println [x-min x-max y-min y-max]) (every? 
#(< 0 % 1) [x-min x-max y-min y-max]) - (is (= #{"dog" "person" "bicycle" "car"} (set (mapv :class predictions)))))) + (is (= #{"dog" "person"} (set (mapv :class predictions)))))) diff --git a/contrib/clojure-package/examples/neural-style/project.clj b/contrib/clojure-package/examples/neural-style/project.clj index b6d29f7c0..2614a6987 100644 --- a/contrib/clojure-package/examples/neural-style/project.clj +++ b/contrib/clojure-package/examples/neural-style/project.clj @@ -18,8 +18,8 @@ (defproject neural-style "0.1.0-SNAPSHOT" :description "Neural Style Transfer with MXNet" :plugins [[lein-cljfmt "0.5.7"]] + :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] :dependencies [[org.clojure/clojure "1.9.0"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] - [net.mikera/imagez "0.12.0"] - [thinktopic/think.image "0.4.16"]] - :main neural-style.core) + [origami "4.0.0-3"]] + :main neural-style.core) \ No newline at end of file diff --git a/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj b/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj index ac1f537f1..aa4c44717 100644 --- a/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj +++ b/contrib/clojure-package/examples/neural-style/src/neural_style/core.clj @@ -26,9 +26,8 @@ [org.apache.clojure-mxnet.symbol :as sym] [clojure.java.io :as io] [clojure.java.shell :refer [sh]] - [mikera.image.core :as img] - [mikera.image.filters :as img-filter] - [think.image.pixel :as pixel] + [opencv4.core :as cv] + [opencv4.utils :as cvu] [neural-style.model-vgg-19 :as model-vgg-19]) (:gen-class));; An Implementation of the paper A Neural Algorithm of Artistic Style ;;by Leon A. Gatys, Alexander S. Ecker, and Matthias Bethge @@ -41,63 +40,63 @@ (def model-path "model/vgg19.params") (def max-long-edge 600) ;; resize the content image (def style-weight 1) ;; the weight for the style image -(def content-weight 5) ;; the weight for the content image -(def blur-radius 1) ;; the blur filter radius +(def content-weight 3) ;; the weight for the content image +(def blur-radius 5) ;; the blur filter radius (def output-dir "output") (def lr 10.0) ;; the learning rate (def tv-weight 0.01) ;; the magnitude on the tv loss (def num-epochs 1000) -(def num-channels 3) -(defn image->ndarray [simg] - (let [h (img/height simg) - w (img/width simg) - pixels (img/get-pixels simg) - ;; normalize the pixels for vgg19 - rgb-pixels (reduce (fn [result pixel] - (let [[rs gs bs] result - [r g b _] (pixel/unpack-pixel pixel)] - [(conj rs (- r 123.68)) - (conj gs (- g 116.779)) - (conj bs (- b 103.939))])) - [[] [] []] - pixels)] - (println "The resized image is size " {:height h :width w}) - (-> rgb-pixels - (flatten) - (ndarray/array [1 num-channels h w])))) +;;;; +; IMAGE MANIPULATION +;;;; + +(defn image->ndarray + "normalize the pixels for vgg19" + [simg] + (let [h (.height simg) w (.width simg)] + (println "The nd image size is:" {:height h :width w}) + (-> simg + (cv/convert-to! cv/CV_8SC3 0.5) + (cv/add! (cv/new-scalar -103.939 -116.779 -123.68) ) + (cvu/mat->flat-rgb-array) + (ndarray/array [1 (.channels simg) h w])))) + +(defn ndarray->image [img] + (let [nd (ndarray/->vec img) + [_ _ h w] (mx-shape/->vec (ndarray/shape img)) + to-cv1 (fn [bytes h w] (cv/>> (cv/new-mat h w cv/CV_8S) (byte-array bytes))) + byte-arrays (reverse (partition (* h w) nd)) + mats (map #(to-cv1 % h w) byte-arrays)] + (-> mats + (cv/merge! (cv/new-mat h w cv/CV_8SC3)) + (cv/add! 
(cv/new-scalar 103.939 116.779 123.68)) + (cv/convert-to! cv/CV_8UC3 2)))) (defn preprocess-content-image [path short-edge] - (let [simg (img/load-image path) - _ (println "The content image is size " {:height (img/height simg) :width (img/width simg)}) - factor (/ short-edge (img/width simg)) - resized-img (img/resize simg (* (img/width simg) factor) (* (img/height simg) factor)) - new-height (img/height resized-img) - new-width (img/width resized-img)] - (image->ndarray resized-img))) + (-> path + (cv/imread) + (#(cvu/resize-by % (/ short-edge (.width %)))) + (image->ndarray))) (defn preprocess-style-image [path shape-vec] - (let [[_ _ h w] shape-vec - simg (img/load-image path) - _ (println "The image is size " {:height (img/height simg) :width (img/width simg)}) - resized-img (img/resize simg w h)] - (image->ndarray resized-img))) + (let [[_ _ h w] shape-vec] + (println "The style image is size " {:height h :width w}) + (-> path + (cv/imread) + (cv/resize! (cv/new-size w h)) + (image->ndarray)))) -(defn postprocess-image [img] - (let [datas (ndarray/->vec img) - image-shape (mx-shape/->vec (ndarray/shape img)) - spatial-size (* (get image-shape 2) (get image-shape 3)) - [rs gs bs] (doall (partition spatial-size datas)) - pixels (mapv (fn [r g b] - (pixel/pack-pixel - (int (+ r 123.68)) - (int (+ g 116.779)) - (int (+ b 103.939)) - (int 255))) - rs gs bs) - new-image (img/new-image (get image-shape 3) (get image-shape 2)) - _ (img/set-pixels new-image (int-array pixels))] - new-image)) +(defn save-image [img filename radius blur?] + (println "Saving image:" filename) + (-> img + (ndarray->image) + (#(if blur? (cv/blur! % (cv/new-size blur-radius blur-radius)) %)) + (cv/imwrite filename))) + +;;;; +; TRAINING +;;;; (defn style-gram-symbol [input-size style] (let [[_ output-shape _] (sym/infer-shape style {:data [1 3 (first input-size) (second input-size)]}) @@ -125,27 +124,6 @@ content-loss (sym/sum (sym/square (sym/- cvar content)))] {:style-loss (sym/group gram-loss) :content-loss content-loss})) -(defn old-clip [v] - (mapv (fn [a] (cond - (neg? a) 0 - (> a 255) 255 - :else a)) - v)) - -(defn clip [a] - (cond - (neg? a) 0 - (> a 255) 255 - :else a)) - -(defn save-image [img filename radius blur?] - (let [filtered-image (if blur? - ((img-filter/box-blur blur-radius blur-radius) (postprocess-image img)) - (postprocess-image img))] - (do - ;(img/show filtered-image) ;; Uncomment to have the image display - (img/write filtered-image filename "png")))) - (defn get-tv-grad-executor [img ctx tv-weight] (when (pos? tv-weight) (let [img-shape (mx-shape/->vec (ndarray/shape img)) @@ -163,7 +141,7 @@ (sym/bind out ctx {"img" img "kernel" kernel})))) (defn train - ([devs] (train devs 20)) + ([devs] (train devs 30)) ([devs n-epochs] (let [dev (first devs) content-np (preprocess-content-image content-image max-long-edge) @@ -244,9 +222,8 @@ (when (zero? 
(mod i 2)) (save-image (ndarray/copy img) (str output-dir "/out_" i ".png") blur-radius true))) (ndarray/set old-img img)) - ; (save-image (ndarray/copy img) (str output-dir "/final.png") 0 false) - ; (postprocess-image img) - ))) + (save-image (ndarray/copy img) (str output-dir "/final.png") 0 false) + (ndarray->image img)))) (defn -main [& args] ;;; Note this only works on cpu right now
diff --git a/contrib/clojure-package/examples/neural-style/test/neural_style/vgg_19_test.clj b/contrib/clojure-package/examples/neural-style/test/neural_style/vgg_19_test.clj index a7c978607..83be4a88b 100644 --- a/contrib/clojure-package/examples/neural-style/test/neural_style/vgg_19_test.clj +++ b/contrib/clojure-package/examples/neural-style/test/neural_style/vgg_19_test.clj @@ -18,7 +18,7 @@ (ns neural-style.vgg-19-test (:require [clojure.test :refer :all] - [mikera.image.core :as img] + [opencv4.core :as cv] [clojure.java.io :as io] [org.apache.clojure-mxnet.ndarray :as ndarray] [org.apache.clojure-mxnet.context :as context] @@ -26,9 +26,8 @@ (defn pic-to-ndarray-vec[path] (-> path - img/load-image - neural/image->ndarray - ndarray/->vec)) + cv/imread + neural/image->ndarray)) (defn last-modified-check[x] (let [t (- (System/currentTimeMillis) (.lastModified x)) ] @@ -48,6 +47,4 @@ (deftest vgg-19-test (neural/train [(context/cpu)] 3) - (is (not (nil? (latest-pic-to-ndarray-vec "output"))))) -; generated file different depending on the platform :/ -; (pic-to-ndarray-vec "test/ref_out_2.png")))) \ No newline at end of file + (is (not (nil? (latest-pic-to-ndarray-vec "output"))))) \ No newline at end of file
diff --git a/contrib/clojure-package/examples/pre-trained-models/README.md b/contrib/clojure-package/examples/pre-trained-models/README.md index 751109f7b..b0996da69 100644 --- a/contrib/clojure-package/examples/pre-trained-models/README.md +++ b/contrib/clojure-package/examples/pre-trained-models/README.md @@ -13,6 +13,16 @@ The `predict-image.clj` file loads up the pre-trained resnet-152 model and uses *To use, run download-reset-152.sh to get the model params and json.* +Run the example with the available Leiningen alias: + +``` +$ lein predict-image +# +# or with your own image: +# +$ lein predict-image <image-url> +``` + + ## Fine Tune from pretrained models
diff --git a/contrib/clojure-package/examples/pre-trained-models/project.clj b/contrib/clojure-package/examples/pre-trained-models/project.clj index 11e002503..e4f6939cb 100644 --- a/contrib/clojure-package/examples/pre-trained-models/project.clj +++ b/contrib/clojure-package/examples/pre-trained-models/project.clj @@ -18,8 +18,9 @@ (defproject pre-trained-models "0.1.0-SNAPSHOT" :description "Example of using pre-trained models with MXNet" :plugins [[lein-cljfmt "0.5.7"]] + :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]] + :aliases {"predict-image" ["run" "-m" "pre-trained-models.predict-image" ]} :dependencies [[org.clojure/clojure "1.9.0"] [org.apache.mxnet.contrib.clojure/clojure-mxnet "1.5.0-SNAPSHOT"] - [net.mikera/imagez "0.12.0"] - [thinktopic/think.image "0.4.16"]] + [origami "4.0.0-3"]] :main pre-trained-models.fine-tune)
diff --git a/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/predict_image.clj b/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/predict_image.clj index 71202bc00..4df641da7 100644 --- a/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/predict_image.clj +++
b/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/predict_image.clj @@ -22,8 +22,8 @@ [org.apache.clojure-mxnet.ndarray :as ndarray] [org.apache.clojure-mxnet.shape :as mx-shape] [org.apache.clojure-mxnet.symbol :as sym] - [mikera.image.core :as img] - [think.image.pixel :as pixel])) + [opencv4.core :as cv] + [opencv4.utils :as cvu])) ;; based on https://mxnet.incubator.apache.org/tutorials/python/predict_image.html @@ -40,21 +40,13 @@ (io/copy in out))) (defn get-image [url show?] - (let [fname "test-image.jpg" - _ (download url fname) - image (-> (img/load-image fname) - (img/resize h w)) - pixels (img/get-pixels image) - rgb-pixels (reduce (fn [result pixel] - (let [[rs gs bs] result - [r g b _] (pixel/unpack-pixel pixel)] - [(conj rs r) (conj gs g) (conj bs b)])) - [[] [] []] - pixels)] - (when show? (img/show image)) - (-> rgb-pixels - (flatten) - (ndarray/array [1 num-channels h w])))) + (-> url + (cvu/mat-from-url) + (cv/resize! (cv/new-size h w)) + (#(do (if show? (cvu/imshow %)) %)) + (cv/convert-to! cv/CV_8SC3 0.5) + (cvu/mat->flat-rgb-array) + (ndarray/array [1 num-channels h w]))) (defn predict [img-url show?] (let [mod (m/load-checkpoint {:prefix (str model-dir "/resnet-152") :epoch 0}) @@ -90,6 +82,13 @@ (ndarray/shape) (mx-shape/->vec)))) +(defn -main [& args] + (println + (predict + (or (first args) + "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/python/predict_image/cat.jpg" ) + true))) + (comment (predict "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/python/predict_image/cat.jpg" true) diff --git a/contrib/clojure-package/integration-tests.sh b/contrib/clojure-package/integration-tests.sh index 6e5868712..ce480a507 100755 --- a/contrib/clojure-package/integration-tests.sh +++ b/contrib/clojure-package/integration-tests.sh @@ -18,11 +18,15 @@ set -evx -MXNET_HOME=$(cd "$(dirname $0)/../.."; pwd) +MXNET_HOME=${PWD} +cd ${MXNET_HOME}/contrib/clojure-package +# first build the package and install it +lein install + +# then run through the examples EXAMPLES_HOME=${MXNET_HOME}/contrib/clojure-package/examples -#cd ${MXNET_HOME}/contrib/clojure-package -#lein test -#lein cloverage --codecov -for test_dir in `find ${EXAMPLES_HOME} -name test` ; do - cd ${test_dir} && lein test -done +# use AWK pattern for blacklisting +TEST_CASES=`find ${EXAMPLES_HOME} -name test | awk '!/dontselect1|dontselect2/'` +for i in $TEST_CASES ; do + cd ${i} && lein test +done \ No newline at end of file diff --git a/contrib/clojure-package/project.clj b/contrib/clojure-package/project.clj index c4428ce6e..61d39e28a 100644 --- a/contrib/clojure-package/project.clj +++ b/contrib/clojure-package/project.clj @@ -23,10 +23,10 @@ :dependencies [[org.clojure/clojure "1.9.0"] [t6/from-scala "0.3.0"] - ;; Jars from Nexus - ;[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu "1.2.1"] - ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu "1.2.1"] - ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "1.2.1"] + ;; To use with nightly snapshot + ;[org.apache.mxnet/mxnet-full_2.11-osx-x86_64-cpu ""] + ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-cpu ""] + ;[org.apache.mxnet/mxnet-full_2.11-linux-x86_64-gpu "scala-fn ~@forms))) + + +(defmacro + with-do + "Alias for a do within a resource scope using. + Example: + (resource-scope/with-do + (ndarray/ones [3 1]) + :all-cleaned-up) + " + [& forms] + `(using (do ~@forms))) + +(defmacro + with-let + "Alias for a let within a resource scope using. 
+ Example: + (resource-scope/with-let [temp-x (ndarray/ones [3 1]) + temp-y (ndarray/ones [3 1])] + (ndarray/+ temp-x temp-y))" + [& forms] + `(using (let ~@forms))) diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj index 43970c0ab..6b5f50792 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/util.clj @@ -239,3 +239,9 @@ (apply $/immutable-list)) ;; pass-through map-or-tuple-seq))) + +(defmacro forms->scala-fn + "Creates a scala fn of zero args from forms" + [& forms] + `($/fn [] + (do ~@forms))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj index c97711b5f..3b9719085 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj @@ -51,33 +51,16 @@ (is (= out-grad grad)))))) (deftest test-concat - (let [shape-vecs [[2 2] [3 2]] - x (sym/variable "x") - y (sym/variable "y") - out (sym/concat "conc" nil [x y] {:dim 0}) - arr (mapv #(ndarray/empty %) shape-vecs) - arr-np (mapv #(ndarray/copy %) arr) - arr-grad (map #(ndarray/empty %) shape-vecs) - arg-names (sym/list-arguments out) - grad-map (zipmap arg-names arr-grad) - args (sym/list-arguments out) - [arg-shapes out-shapes aux-shapes] (sym/infer-shape out (zipmap args shape-vecs)) - out-shape-vec (first out-shapes) - out-grad (ndarray/empty out-shape-vec) - exec1 (sym/bind out (context/default-context) arr grad-map) - out1 (-> (executor/forward exec1) + (let [a (sym/variable "a") + b (sym/variable "b") + c (sym/concat "conc" nil [a b] {:dim 0}) + exec (sym/bind c (context/default-context) {"a" (ndarray/array [1 2] [2 1]) + "b" (ndarray/array [3 4] [2 1])}) + output (-> (executor/forward exec) (executor/outputs) - (first)) - ret (ndarray/concatenate arr)] - (is (= out1 ret)) - - ;;backward - (ndarray/copy-to out1 out-grad) - (ndarray/+= out-grad 1) - (executor/backward exec1 out-grad) - (let [grads arr-grad - np-grads arr-np] - (is (= grads (mapv #(ndarray/+ % 1) np-grads)))))) + (first))] + (is (= [1.0 2.0 3.0 4.0] (ndarray/->vec output))) + (is (= [4 1] (ndarray/shape-vec output))))) (defn check-regression [model forward-fn backward-fn] (let [shape-vec [3 1] diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/resource_scope_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/resource_scope_test.clj new file mode 100644 index 000000000..77df03402 --- /dev/null +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/resource_scope_test.clj @@ -0,0 +1,146 @@ +;; +;; Licensed to the Apache Software Foundation (ASF) under one or more +;; contributor license agreements. See the NOTICE file distributed with +;; this work for additional information regarding copyright ownership. +;; The ASF licenses this file to You under the Apache License, Version 2.0 +;; (the "License"); you may not use this file except in compliance with +;; the License. You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+;; See the License for the specific language governing permissions and +;; limitations under the License. +;; + +(ns org.apache.clojure-mxnet.resource-scope-test + (:require [org.apache.clojure-mxnet.ndarray :as ndarray] + [org.apache.clojure-mxnet.symbol :as sym] + [org.apache.clojure-mxnet.resource-scope :as resource-scope] + [clojure.test :refer :all])) + + +(deftest test-resource-scope-with-ndarray + (let [native-resources (atom {}) + x (ndarray/ones [2 2]) + return-val (resource-scope/using + (let [temp-x (ndarray/ones [3 1]) + temp-y (ndarray/ones [3 1])] + (swap! native-resources assoc :temp-x temp-x) + (swap! native-resources assoc :temp-y temp-y) + (ndarray/+ temp-x 1)))] + (is (true? (ndarray/is-disposed (:temp-x @native-resources)))) + (is (true? (ndarray/is-disposed (:temp-y @native-resources)))) + (is (false? (ndarray/is-disposed return-val))) + (is (false? (ndarray/is-disposed x))) + (is (= [2.0 2.0 2.0] (ndarray/->vec return-val))))) + +(deftest test-resource-scope-with-sym + (let [native-resources (atom {}) + x (sym/ones [2 2]) + return-val (resource-scope/using + (let [temp-x (sym/ones [3 1]) + temp-y (sym/ones [3 1])] + (swap! native-resources assoc :temp-x temp-x) + (swap! native-resources assoc :temp-y temp-y) + (sym/+ temp-x 1)))] + (is (true? (sym/is-disposed (:temp-x @native-resources)))) + (is (true? (sym/is-disposed (:temp-y @native-resources)))) + (is (false? (sym/is-disposed return-val))) + (is (false? (sym/is-disposed x))))) + +(deftest test-nested-resource-scope-with-ndarray + (let [native-resources (atom {}) + x (ndarray/ones [2 2]) + return-val (resource-scope/using + (let [temp-x (ndarray/ones [3 1])] + (swap! native-resources assoc :temp-x temp-x) + (resource-scope/using + (let [temp-y (ndarray/ones [3 1])] + (swap! native-resources assoc :temp-y temp-y)))))] + (is (true? (ndarray/is-disposed (:temp-y @native-resources)))) + (is (true? (ndarray/is-disposed (:temp-x @native-resources)))) + (is (false? (ndarray/is-disposed x))))) + +(deftest test-nested-resource-scope-with-sym + (let [native-resources (atom {}) + x (sym/ones [2 2]) + return-val (resource-scope/using + (let [temp-x (sym/ones [3 1])] + (swap! native-resources assoc :temp-x temp-x) + (resource-scope/using + (let [temp-y (sym/ones [3 1])] + (swap! native-resources assoc :temp-y temp-y)))))] + (is (true? (sym/is-disposed (:temp-y @native-resources)))) + (is (true? (sym/is-disposed (:temp-x @native-resources)))) + (is (false? (sym/is-disposed x))))) + +(deftest test-list-creation-with-returning-first + (let [native-resources (atom []) + return-val (resource-scope/using + (let [temp-ndarrays (doall (repeatedly 3 #(ndarray/ones [3 1]))) + _ (reset! native-resources temp-ndarrays)] + (first temp-ndarrays)))] + (is (false?
(ndarray/is-disposed return-val))) + (is (= [false true true] (mapv ndarray/is-disposed @native-resources))))) + +(deftest test-list-creation + (let [native-resources (atom []) + return-val (resource-scope/using + (let [temp-ndarrays (doall (repeatedly 3 #(ndarray/ones [3 1]))) + _ (reset! native-resources temp-ndarrays)] + (ndarray/ones [3 1])))] + (is (false? (ndarray/is-disposed return-val))) + (is (= [true true true] (mapv ndarray/is-disposed @native-resources))))) + +(deftest test-list-creation-without-let + (let [native-resources (atom []) + return-val (resource-scope/using + (first (doall (repeatedly 3 #(do + (let [x (ndarray/ones [3 1])] + (swap! native-resources conj x) + x))))))] + (is (false? (ndarray/is-disposed return-val))) + (is (= [false true true] (mapv ndarray/is-disposed @native-resources))))) + +(deftest test-with-let + (let [native-resources (atom {}) + x (ndarray/ones [2 2]) + return-val (resource-scope/with-let [temp-x (ndarray/ones [3 1]) + temp-y (ndarray/ones [3 1])] + (swap! native-resources assoc :temp-x temp-x) + (swap! native-resources assoc :temp-y temp-y) + (ndarray/+ temp-x 1))] + (is (true? (ndarray/is-disposed (:temp-x @native-resources)))) + (is (true? (ndarray/is-disposed (:temp-y @native-resources)))) + (is (false? (ndarray/is-disposed return-val))) + (is (false? (ndarray/is-disposed x))) + (is (= [2.0 2.0 2.0] (ndarray/->vec return-val))))) + +(deftest test-with-do + (let [native-resources (atom {}) + x (ndarray/ones [2 2]) + return-val (resource-scope/with-do + (swap! native-resources assoc :temp-x (ndarray/ones [3 1])) + (swap! native-resources assoc :temp-y (ndarray/ones [3 1])) + (ndarray/ones [3 1]))] + (is (true? (ndarray/is-disposed (:temp-x @native-resources)))) + (is (true? (ndarray/is-disposed (:temp-y @native-resources)))) + (is (false? (ndarray/is-disposed return-val))) + (is (false? (ndarray/is-disposed x))) + (is (= [1.0 1.0 1.0] (ndarray/->vec return-val)))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj index c26f83d5a..4ed7d38e6 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/util_test.clj @@ -226,3 +226,10 @@ (let [nda (util/map->scala-tuple-seq {:a-b (ndarray/ones [1 2])})] (is (= "a_b" (._1 (.head nda)))) (is (= [1.0 1.0] (ndarray/->vec (._2 (.head nda)))))) + +(deftest test-forms->scala-fn + (let [scala-fn (util/forms->scala-fn + (def x 1) + (def y 2) + {:x x :y y})] + (is (= {:x 1 :y 2} (.apply scala-fn)))))
diff --git a/cpp-package/README.md b/cpp-package/README.md index c4fe63c9e..45941555b 100644 --- a/cpp-package/README.md +++ b/cpp-package/README.md @@ -8,7 +8,7 @@ The users of these bindings are required to build this package as mentioned below +The cpp-package directory contains the implementation of C++ API. As mentioned above, users are required to build this directory or package before using it. **The cpp-package is built while building the MXNet shared library, *libmxnet.so*.** -###Steps to build the C++ package: +### Steps to build the C++ package: 1. Building the MXNet C++ package requires building MXNet from source. 2. Clone the MXNet GitHub repository **recursively** to ensure the code in submodules is available for building MXNet. ``` git clone --recursive /~https://github.com/apache/incubator-mxnet ``` 3.
Install the [prerequisites](), desired [BLAS libraries]() and optional [OpenCV, CUDA, and cuDNN]() for building MXNet from source. 4. There is a configuration file for make, [make/config.mk]() that contains all the compilation options. You can edit this file and set the appropriate options prior to running the **make** command. -5. Please refer to [platform specific build instructions]() and available [build configurations](https://mxnet.incubator.apache.org/install/build_from_source#build-configurations) for more details. +5. Please refer to [platform specific build instructions]() and available [build configurations](https://mxnet.incubator.apache.org/install/build_from_source#build-configurations) for more details. 6. For enabling the build of the C++ Package, set **USE\_CPP\_PACKAGE = 1** in [make/config.mk](). Optionally, the compilation flag can also be specified on the **make** command line as follows. ``` - make -j USE_CPP_PACKAGE=1 + make -j USE_CPP_PACKAGE=1 ``` ## Usage @@ -42,5 +42,4 @@ A basic tutorial can be found at ) folder. -The examples in this folder are built while building the MXNet library and cpp-package from source . However, they can be built manually as follows +Please build the MXNet C++ Package as explained in the [README]() file before building these examples manually. +The examples in this folder are built while building the MXNet library and cpp-package from source. However, they can be built manually as follows From cpp-package/examples directory
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index 7564d4361..1c0f7130d 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -244,10 +244,14 @@ int main(int argc, char const *argv[]) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } Optimizer* opt = OptimizerRegistry::Find("sgd"); opt->SetParam("momentum", 0.9)
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index 4bd3be27f..c8078afd0 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -139,10 +139,14 @@ int main(int argc, char const *argv[]) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } Optimizer* opt = OptimizerRegistry::Find("sgd"); opt->SetParam("momentum", 0.9)
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 5b444e467..456e0d913 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -168,10 +168,14 @@ int main(int argc, char const *argv[]) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return
1; + } // initialize parameters Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2);
diff --git a/cpp-package/example/inference/README.md b/cpp-package/example/inference/README.md index 79831b40b..efd235757 100644 --- a/cpp-package/example/inference/README.md +++ b/cpp-package/example/inference/README.md @@ -2,7 +2,7 @@ ## Building C++ Inference examples -The examples in this folder demonstrate the **inference** workflow. +The examples in this folder demonstrate the **inference** workflow. Please build the MXNet C++ Package as explained in the [README]() file before building these examples. To build the examples, use the following commands: - Release: **make all** - Debug: **make debug all** @@ -39,3 +39,69 @@ Alternatively, The script [unit_test_inception_inference.sh]() +This example demonstrates how you can load a pre-trained RNN model and use it to predict the sentiment expressed in the given movie review with the MXNet C++ API. The example is capable of processing variable length inputs. It performs the following tasks: +- Loads the pre-trained RNN model. +- Loads the dictionary file containing the word-to-index mapping. +- Splits the review into multiple lines separated by "." +- Predicts the sentiment score for individual lines and outputs the average score. + +The example is capable of processing variable length input by implementing the following technique (a short sketch of the bucket-matching step follows the usage section below): +- The example creates executors for pre-determined input lengths such as 5, 10, 15, 20, 25, etc., called **buckets**. +- Each bucket is identified by a **bucket-key** representing the input length required by the corresponding executor. +- For each line in the review, the example finds the number of words in the line and tries to find the closest bucket, i.e. executor. +- If the bucket key does not match the number of words in the line, the example pads or trims the input line to match the required length. + +The example uses a pre-trained RNN model trained on the IMDB dataset. The RNN model was built by exercising the [GluonNLP Sentiment Analysis Tutorial](). The tutorial uses 'standard_lstm_lm_200' available in the Gluon Model Zoo and fine-tunes it for the IMDB dataset. +The model consists of: +- Embedding Layer +- 2 LSTM Layers with hidden dimension size of 200 +- Average pooling layer +- Sigmoid output layer +The model was trained for 10 epochs to achieve 85% test accuracy. +The visual representation of the model is [here](). + +The model files can be found here: +- [sentiment_analysis-symbol.json](< https://s3.amazonaws.com/mxnet-cpp/RNN_model/sentiment_analysis-symbol.json>) +- [sentiment_analysis-0010.params](< https://s3.amazonaws.com/mxnet-cpp/RNN_model/sentiment_analysis-0010.params>) +- [sentiment_token_to_idx.txt]() Each line of the dictionary file contains a word and a unique index for that word, separated by a space, with a total of 32787 words generated from the training dataset. +The example downloads the above files while running. + +The example's command line parameters are as shown below: + +``` +./sentiment_analysis_rnn --help +Usage: +sentiment_analysis_rnn +--input Input movie review. The review can be single line or multiline.e.g. "This movie is the best." OR "This movie is the best. The direction is awesome." +[--gpu] Specify this option if workflow needs to be run in gpu context +If the review is multiline, the example predicts sentiment score for each line and the final score is the average of scores obtained for each line. + +```
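To make the bucketing scheme above concrete, here is a compact sketch of the bucket-matching and pad/trim step. It is written in Clojure (the language most of the new code in this patch uses) rather than C++, and the padding index is an assumed value, so treat it as an illustration of the technique, not the example's actual implementation.

```clojure
;; Illustrative sketch of the bucketing technique described above.
;; Bucket sizes follow the README (5, 10, 15, 20, 25); padding index 0
;; is an assumption, not the C++ example's actual value.
(def buckets [5 10 15 20 25])

(defn closest-bucket
  "Smallest bucket that fits the line, or the largest bucket if none fits."
  [n]
  (or (first (filter #(>= % n) buckets))
      (last buckets)))

(defn fit-to-bucket
  "Pad or trim a line's word indices to exactly the chosen bucket length."
  [indices]
  (let [k (closest-bucket (count indices))]
    (vec (take k (concat indices (repeat 0))))))

;; (fit-to-bucket [12 7 4081 9 33 2 15])
;; => ten elements: the seven indices followed by three padding zeros
```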
+ +``` +./sentiment_analysis_rnn --input "This movie has the great story" +``` + +The above command will output the sentiment score as follows: +``` +sentiment_analysis_rnn.cpp:346: Input Line : [This movie has the great story] Score : 0.999898 +sentiment_analysis_rnn.cpp:449: The sentiment score between 0 and 1, (1 being positive)=0.999898 +``` + +The following command line shows invoking the example with a multi-line review. + +``` +./sentiment_analysis_rnn --input "This movie is the best. The direction is awesome." +``` +The above command will output the sentiment score for each line in the review and the average score as follows: +``` +Input Line : [This movie is the best] Score : 0.964498 +Input Line : [ The direction is awesome] Score : 0.968855 +The sentiment score between 0 and 1, (1 being positive)=0.966677 +``` + +Alternatively, you can run the [unit_test_sentiment_analysis_rnn.sh]() script. diff --git a/cpp-package/example/inference/inception_inference.cpp b/cpp-package/example/inference/inception_inference.cpp index 7005e745b..78487e6ee 100644 --- a/cpp-package/example/inference/inception_inference.cpp +++ b/cpp-package/example/inference/inception_inference.cpp @@ -215,7 +215,6 @@ void Predictor::LoadMeanImageData() { mean_image_data.SyncCopyFromCPU( NDArray::LoadToMap(mean_image_file)["mean_img"].GetData(), input_shape.Size()); - NDArray::WaitAll(); } @@ -244,7 +243,6 @@ void Predictor::LoadDefaultMeanImageData() { } mean_image_data = NDArray(input_shape, global_ctx, false); mean_image_data.SyncCopyFromCPU(array.data(), input_shape.Size()); - NDArray::WaitAll(); } @@ -273,7 +271,6 @@ NDArray Predictor::LoadInputImage(const std::string& image_file) { } NDArray image_data = NDArray(input_shape, global_ctx, false); image_data.SyncCopyFromCPU(array.data(), input_shape.Size()); - NDArray::WaitAll(); return image_data; } @@ -299,21 +296,26 @@ void Predictor::PredictImage(const std::string& image_file) { * */ image_data.CopyTo(&(executor->arg_dict()["data"])); - NDArray::WaitAll(); // Run the forward pass. executor->Forward(false); // The output is available in executor->outputs. auto array = executor->outputs[0].Copy(global_ctx); - NDArray::WaitAll(); /* * Find out the maximum accuracy and the index associated with that accuracy. * This is done by using the argmax operator on NDArray. */ auto predicted = array.ArgmaxChannel(); - NDArray::WaitAll(); + + /* + * Wait until all the previous write operations on the 'predicted' + * NDArray are complete before we read it. + * This method guarantees that all previous write operations that were pushed into the backend engine + * for execution are actually finished. + */ + predicted.WaitToRead(); int best_idx = predicted.At(0, 0); float best_accuracy = array.At(0, best_idx); diff --git a/cpp-package/example/inference/inference.mk b/cpp-package/example/inference/inference.mk new file mode 100644 index 000000000..b03055395 --- /dev/null +++ b/cpp-package/example/inference/inference.mk @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CPPEX_SRC = $(wildcard cpp-package/example/inference/*.cpp) +CPPEX_EXE = $(patsubst cpp-package/example/inference/%.cpp, build/cpp-package/example/%, $(CPPEX_SRC)) + +CPPEX_CFLAGS += -Icpp-package/include +CPPEX_EXTRA_LDFLAGS := -L$(ROOTDIR)/lib -lmxnet + +EXTRA_PACKAGES += cpp-package-inference-example-all +EXTRA_PACKAGES_CLEAN += cpp-package-inference-example-clean + +.PHONY: cpp-package-inference-example-all cpp-package-inference-example-clean + +cpp-package-inference-example-all: cpp-package-all $(CPPEX_EXE) + +build/cpp-package/example/% : cpp-package/example/inference/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) + @mkdir -p $(@D) + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/inference/$* $< >build/cpp-package/example/$*.d + $(CXX) -std=c++11 $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + +cpp-package-inference-example-clean: + rm -rf build/cpp-package/example/inference* + +-include build/cpp-package/example/inference/*.d diff --git a/cpp-package/example/inference/sentiment_analysis_rnn.cpp b/cpp-package/example/inference/sentiment_analysis_rnn.cpp new file mode 100755 index 000000000..53b618ff1 --- /dev/null +++ b/cpp-package/example/inference/sentiment_analysis_rnn.cpp @@ -0,0 +1,488 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * This example demonstrates the sentiment prediction workflow with a pre-trained RNN model using the MXNet C++ API. + * The example performs the following tasks: + * 1. Load the pre-trained RNN model. + * 2. Load the dictionary file that contains the word to index mapping. + * 3. Create executors for pre-determined input lengths. + * 4. Convert each line in the input to the vector of indices. + * 5. Predictor finds the right executor for each line. + * 6. Run the forward pass for each line and predict the sentiment score. + * The example uses a pre-trained RNN model that is trained with the IMDB dataset.
+ */ + +#include <sys/stat.h> +#include <algorithm> +#include <cassert> +#include <cstdlib> +#include <cstring> +#include <fstream> +#include <iostream> +#include <map> +#include <sstream> +#include <stdexcept> +#include <string> +#include <vector> +#include "mxnet-cpp/MxNetCpp.h" + +using namespace mxnet::cpp; + +static const int DEFAULT_BUCKET_KEYS[] = {30, 25, 20, 15, 10, 5}; +static const char DEFAULT_S3_URL[] = "https://s3.amazonaws.com/mxnet-cpp/RNN_model/"; + + +/* + * class Predictor + * + * This class encapsulates the functionality to load the model, process the input text and run the forward pass. + */ + +class Predictor { + public: + Predictor() {} + Predictor(const std::string& model_json, + const std::string& model_params, + const std::string& input_dictionary, + const std::vector<int>& bucket_keys, + bool use_gpu = false); + float PredictSentiment(const std::string &input_review); + ~Predictor(); + + private: + void LoadModel(const std::string& model_json_file); + void LoadParameters(const std::string& model_parameters_file); + void LoadDictionary(const std::string &input_dictionary); + inline bool FileExists(const std::string& name) { + struct stat buffer; + return (stat(name.c_str(), &buffer) == 0); + } + float PredictSentimentForOneLine(const std::string &input_line); + int ConvertToIndexVector(const std::string& input, + std::vector<float> *input_vector); + int GetIndexForOutputSymbolName(const std::string& output_symbol_name); + float GetIndexForWord(const std::string& word); + int GetClosestBucketKey(int num_words); + + std::map<std::string, NDArray> args_map; + std::map<std::string, NDArray> aux_map; + std::map<std::string, int> wordToIndex; + Symbol net; + std::map<int, Executor*> executor_buckets; + Context global_ctx = Context::cpu(); + int highest_bucket_key; +}; + + +/* + * The constructor takes the following parameters as input: + * 1. model_json: The RNN model in a json formatted file. + * 2. model_params: File containing the model parameters. + * 3. input_dictionary: File containing the word and associated index. + * 4. bucket_keys: A vector of bucket keys for creating executors. + * + * The constructor: + * 1. Loads the model and parameter files. + * 2. Loads the dictionary file to create index to word and word to index maps. + * 3. For each bucket key in the input vector of bucket keys, it creates an executor. + * The executors share the memory. The bucket key determines the length of input data + * required for that executor. + * 4. Creates a map of bucket key to corresponding executor. + * 5. The model is loaded only once. The executors share the memory for the parameters. + */ +Predictor::Predictor(const std::string& model_json, + const std::string& model_params, + const std::string& input_dictionary, + const std::vector<int>& bucket_keys, + bool use_gpu) { + if (use_gpu) { + global_ctx = Context::gpu(); + } + + /* + * Load the dictionary file that contains the word and its index. + * The function creates word to index and index to word maps. The maps are used to create the index + * vector for the input sentence. + */ + LoadDictionary(input_dictionary); + + // Load the model + LoadModel(model_json); + + // Load the model parameters. + LoadParameters(model_params); + + /* + * Create the executors for each bucket key. The bucket key represents the shape of input data. + * The executors will share the memory by using the following technique: + * 1. Infer the executor arrays and bind the first executor with the first bucket key. + * 2. Then, for the next bucket key, adjust the shape of the input argument to match that key. + * 3. Create the executor for the next bucket key by passing the inferred executor arrays and + * pointer to the executor created for the first key.
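+ * For example (illustrative values, not from this file): with bucket_keys = {10, 5}, the
+ * executor for key 10 would be bound first as the master, and the executor for key 5 would
+ * then be bound with the master passed as its shared executor, so both reuse the same memory.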
+ */ + std::vector<NDArray> arg_arrays; + std::vector<NDArray> grad_arrays; + std::vector<OpReqType> grad_reqs; + std::vector<NDArray> aux_arrays; + + /* + * Create the master executor with the highest bucket key to optimize the shared memory between the + * executors for the remaining bucket keys. + */ + highest_bucket_key = *(std::max_element(bucket_keys.begin(), bucket_keys.end())); + args_map["data0"] = NDArray(Shape(highest_bucket_key, 1), global_ctx, false); + args_map["data1"] = NDArray(Shape(1), global_ctx, false); + + net.InferExecutorArrays(global_ctx, &arg_arrays, &grad_arrays, &grad_reqs, + &aux_arrays, args_map, std::map<std::string, NDArray>(), + std::map<std::string, OpReqType>(), aux_map); + Executor *master_executor = net.Bind(global_ctx, arg_arrays, grad_arrays, grad_reqs, aux_arrays, + std::map<std::string, Context>(), nullptr); + executor_buckets[highest_bucket_key] = master_executor; + + for (int bucket : bucket_keys) { + if (executor_buckets.find(bucket) == executor_buckets.end()) { + arg_arrays[0] = NDArray(Shape(bucket, 1), global_ctx, false); + Executor *executor = net.Bind(global_ctx, arg_arrays, grad_arrays, grad_reqs, aux_arrays, + std::map<std::string, Context>(), master_executor); + executor_buckets[bucket] = executor; + } + } +} + + +/* + * The following function loads the model from the json file. + */ +void Predictor::LoadModel(const std::string& model_json_file) { + if (!FileExists(model_json_file)) { + LG << "Model file " << model_json_file << " does not exist"; + throw std::runtime_error("Model file does not exist"); + } + LG << "Loading the model from " << model_json_file << std::endl; + net = Symbol::Load(model_json_file); +} + + +/* + * The following function loads the model parameters. + */ +void Predictor::LoadParameters(const std::string& model_parameters_file) { + if (!FileExists(model_parameters_file)) { + LG << "Parameter file " << model_parameters_file << " does not exist"; + throw std::runtime_error("Model parameters file does not exist"); + } + LG << "Loading the model parameters from " << model_parameters_file << std::endl; + std::map<std::string, NDArray> parameters; + NDArray::Load(model_parameters_file, 0, &parameters); + for (const auto &k : parameters) { + if (k.first.substr(0, 4) == "aux:") { + auto name = k.first.substr(4, k.first.size() - 4); + aux_map[name] = k.second.Copy(global_ctx); + } + if (k.first.substr(0, 4) == "arg:") { + auto name = k.first.substr(4, k.first.size() - 4); + args_map[name] = k.second.Copy(global_ctx); + } + } + /* WaitAll is needed when we copy data between the GPU and the main memory */ + NDArray::WaitAll(); +} + + +/* + * The following function loads the dictionary file. + * The function constructs the word to index and index to word maps. + * These maps are used to map words in the input sentence to their indices. + * Make sure to use the same dictionary file that was used for training the network. + */ +void Predictor::LoadDictionary(const std::string& input_dictionary) { + if (!FileExists(input_dictionary)) { + LG << "Dictionary file " << input_dictionary << " does not exist"; + throw std::runtime_error("Dictionary file does not exist"); + } + LG << "Loading the dictionary file."; + std::ifstream fi(input_dictionary.c_str()); + if (!fi.is_open()) { + std::cerr << "Error opening dictionary file " << input_dictionary << std::endl; + assert(false); + } + + std::string line; + std::string word; + int index; + while (std::getline(fi, line)) { + std::istringstream stringline(line); + stringline >> word >> index; + wordToIndex[word] = index; + } + fi.close(); +} + + +/* + * The function returns the index associated with the word in the dictionary.
+ * If the word is not present, the index representing "&lt;unk&gt;" is returned. + * If "&lt;unk&gt;" is not present then 0 is returned. + */ +float Predictor::GetIndexForWord(const std::string& word) { + if (wordToIndex.find(word) == wordToIndex.end()) { + if (wordToIndex.find("&lt;unk&gt;") == wordToIndex.end()) + return 0; + else + return static_cast<float>(wordToIndex["&lt;unk&gt;"]); + } + return static_cast<float>(wordToIndex[word]); +} + +/* + * The function populates the input vector with indices from the dictionary that + * correspond to the words in the input string. + * The function returns the number of words in the input line. + */ +int Predictor::ConvertToIndexVector(const std::string& input, std::vector<float> *input_vector) { + std::istringstream input_string(input); + input_vector->clear(); + const char delimiter = ' '; + std::string token; + size_t words = 0; + while (std::getline(input_string, token, delimiter) && (words <= input_vector->size())) { + input_vector->push_back(GetIndexForWord(token)); + words++; + } + return words; +} + + +/* + * The function returns the index at which the given symbol name will appear + * in the output vector of NDArrays obtained after running the forward pass on the executor. + */ +int Predictor::GetIndexForOutputSymbolName(const std::string& output_symbol_name) { + int index = 0; + for (const std::string op : net.ListOutputs()) { + if (op == output_symbol_name) { + return index; + } else { + index++; + } + } + throw std::runtime_error("The output symbol name can not be found"); +} + + +/* + * The function finds the closest bucket for the given num_words in the input line. + * If the exact bucket key exists, the function returns that bucket key. + * If the matching bucket key does not exist, the function looks for the next bucket key + * that is greater than the given num_words. + * If the next larger bucket does not exist, the function returns the largest bucket key. + */ +int Predictor::GetClosestBucketKey(int num_words) { + int closest_bucket_key = highest_bucket_key; + + if (executor_buckets.lower_bound(num_words) != executor_buckets.end()) { + closest_bucket_key = executor_buckets.lower_bound(num_words)->first; + } + return closest_bucket_key; +} + + +/* + * The following function runs the forward pass on the model for the given line. + * + */ +float Predictor::PredictSentimentForOneLine(const std::string& input_line) { + /* + * Initialize a vector of length equal to 'num_words' with the index corresponding to "&lt;eos&gt;". + * Convert the input string to a vector of indices that represent + * the words in the input string. + */ + std::vector<float> index_vector(GetIndexForWord("&lt;eos&gt;")); + int num_words = ConvertToIndexVector(input_line, &index_vector); + int bucket_key = GetClosestBucketKey(num_words); + + /* + * The index_vector has size equal to num_words. The vector needs to be padded if + * the bucket_key is greater than num_words. The vector needs to be trimmed if + * the bucket_key is smaller than num_words. + */ + index_vector.resize(bucket_key, GetIndexForWord("&lt;eos&gt;")); + + Executor* executor = executor_buckets[bucket_key]; + executor->arg_dict()["data0"].SyncCopyFromCPU(index_vector.data(), index_vector.size()); + executor->arg_dict()["data1"] = num_words; + + // Run the forward pass. + executor->Forward(false); + + /* + * The output is available in executor->outputs. It is a vector of + * NDArray. We need to find the index in that vector that + * corresponds to the output symbol "sentimentnet0_hybridsequential0_dense0_fwd_output".
+ */ + const std::string output_symbol_name = "sentimentnet0_hybridsequential0_dense0_fwd_output"; + int output_index = GetIndexForOutputSymbolName(output_symbol_name); + std::vector<NDArray> outputs = executor->outputs; + auto arrayout = executor->outputs[output_index].Copy(global_ctx); + /* + * We will run the sigmoid operator to find out the sentiment score between + * 0 and 1 where 1 represents positive. + */ + NDArray ret; + Operator("sigmoid")(arrayout).Invoke(ret); + ret.WaitToRead(); + + return ret.At(0, 0); +} + + +/* + * The function predicts the sentiment score for the input review. + * The function splits the input review into lines (separated by '.'). + * It finds the sentiment score for each line and computes the average. + */ +float Predictor::PredictSentiment(const std::string& input_review) { + std::istringstream input_string(input_review); + int num_lines = 0; + float sentiment_score = 0.0f; + + // Split the input review into separate lines separated by '.' + const char delimiter = '.'; + std::string line; + while (std::getline(input_string, line, delimiter)) { + // Predict the sentiment score for each line. + float score = PredictSentimentForOneLine(line); + LG << "Input Line : [" << line << "] Score : " << score; + sentiment_score += score; + num_lines++; + } + + // Find the average sentiment score. + sentiment_score = sentiment_score / num_lines; + return sentiment_score; +} + + +/* + * The destructor frees the executors and notifies the MXNet engine to shut down. + */ +Predictor::~Predictor() { + for (auto bucket : this->executor_buckets) { + Executor* executor = bucket.second; + delete executor; + } + MXNotifyShutdown(); +} + + +/* + * The function prints the usage information. + */ +void printUsage() { + std::cout << "Usage:" << std::endl; + std::cout << "sentiment_analysis_rnn " << std::endl + << "--input Input movie review. The review can be single line or multiline. " + << "e.g. \"This movie is the best.\" OR " + << "\"This movie is the best. The direction is awesome.\" " << std::endl + << "[--gpu] Specify this option if workflow needs to be run in gpu context " + << std::endl + << "If the review is multiline, the example predicts sentiment score for each line " + << "and the final score is the average of scores obtained for each line." + << std::endl; +} + + +/* + * The function downloads the model files from the s3 bucket. + */ +void DownloadFiles(const std::vector<std::string> model_files) { + std::string wget_command("wget -nc "); + std::string s3_url(DEFAULT_S3_URL); + for (auto &file : model_files) { + std::ostringstream oss; + oss << wget_command << s3_url << file << " -O " << file; + int status = system(oss.str().c_str()); + LG << "Downloading " << file << " with status " << status; + } + return; +} + + +int main(int argc, char** argv) { + std::string model_file_json = "./sentiment_analysis-symbol.json"; + std::string model_file_params = "./sentiment_analysis-0010.params"; + std::string input_dictionary = "./sentiment_token_to_idx.txt"; + std::string input_review = "This movie is the best"; + bool use_gpu = false; + + int index = 1; + while (index < argc) { + if (strcmp("--input", argv[index]) == 0) { + index++; + input_review = (index < argc ? argv[index]:input_review); + } else if (strcmp("--gpu", argv[index]) == 0) { + use_gpu = true; + } else if (strcmp("--help", argv[index]) == 0) { + printUsage(); + return 0; + } + index++; + } + + + /* + * Download the trained RNN model file, param file and dictionary file. + * The dictionary file contains the word to index mapping.
+ * Each line of the dictionary file contains a word and the unique index for that word separated + * by a space. For example: + * snippets 11172 + * This dictionary file was created when the RNN model was trained with a particular dataset. + * Hence the dictionary file is specific to the dataset with which the model was trained. + */ + std::vector<std::string> files; + files.push_back(model_file_json); + files.push_back(model_file_params); + files.push_back(input_dictionary); + + DownloadFiles(files); + + std::vector<int> buckets(DEFAULT_BUCKET_KEYS, + DEFAULT_BUCKET_KEYS + sizeof(DEFAULT_BUCKET_KEYS) / sizeof(int)); + + try { + // Initialize the predictor object + Predictor predict(model_file_json, model_file_params, input_dictionary, buckets, use_gpu); + + // Run the forward pass to predict the sentiment score for the given review. + float sentiment_score = predict.PredictSentiment(input_review); + LG << "The sentiment score between 0 and 1, (1 being positive)=" << sentiment_score; + } catch (std::runtime_error &error) { + LG << MXGetLastError(); + LG << "Execution failed with ERROR: " << error.what(); + return 1; + } catch (...) { + /* + * If the underlying MXNet code has thrown an exception, the error message is + * accessible through the MXGetLastError() function. + */ + LG << "Execution failed with the following MXNet error"; + LG << MXGetLastError(); + return 1; + } + return 0; +} diff --git a/cpp-package/example/inference/unit_test_inception_inference.sh b/cpp-package/example/inference/unit_test_inception_inference.sh index 4f40b496b..f33b8f19b 100755 --- a/cpp-package/example/inference/unit_test_inception_inference.sh +++ b/cpp-package/example/inference/unit_test_inception_inference.sh @@ -22,9 +22,6 @@ wget -nc -O model/dog.jpg /~https://github.com/dmlc/web-data/blob/master/mxnet/doc wget -nc -O model/mean_224.nd /~https://github.com/dmlc/web-data/raw/master/mxnet/example/feature_extract/mean_224.nd tar -xvzf inception-bn.tar.gz -C model -# Building -make all - # Running the example with dog image. if [ "$(uname)" == "Darwin" ]; then diff --git a/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh b/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh new file mode 100755 index 000000000..6f42e449c --- /dev/null +++ b/cpp-package/example/inference/unit_test_sentiment_analysis_rnn.sh @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +function compare_range() { + perl -e "{if($1>$2 && $1<=$3){print 1} else {print 0}}" +} + +set -e # exit on the first error +export EXE_NAME="sentiment_analysis_rnn" + +# Running the example with a movie review. +if [ "$(uname)" == "Darwin" ]; then + DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:../../../lib ./${EXE_NAME} --input "This movie is the best."
&> ${EXE_NAME}.log +else + LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../../../lib ./${EXE_NAME} --input "This movie is the best." &> ${EXE_NAME}.log +fi +result=`grep "The sentiment score between 0 and 1.*\=" ${EXE_NAME}.log | cut -d '=' -f2` +lower_bound=0.8 +upper_bound=0.99 +if [ $(compare_range $result $lower_bound $upper_bound) == 1 ]; +then + echo "PASS: ${EXE_NAME} correctly predicted the sentiment with score = $result" + exit 0 +else + echo "FAIL: ${EXE_NAME} FAILED to predict the sentiment with score = $result" + exit 1 +fi \ No newline at end of file diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 4df6fbee9..39550a3e9 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -97,10 +97,14 @@ int main(int argc, char const *argv[]) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } Optimizer* opt = OptimizerRegistry::Find("sgd"); opt->SetParam("momentum", 0.9) diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index dc1ab3672..93eaf0538 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -63,10 +63,14 @@ int main(int argc, char** argv) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } auto net = mlp(layers); diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index 67992a19f..0befde8ae 100644 --- a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -63,10 +63,14 @@ int main(int argc, char** argv) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } auto net = mlp(layers); diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 0bb77a1a1..7c9dd4daa 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -179,10 +179,14 @@ int main(int argc, char const *argv[]) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } // initialize parameters Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2); diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 9252701e2..7e5096abb 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -70,10 +70,14 @@ int main(int argc, char**
argv) { }; auto train_iter = MXDataIter("MNISTIter"); - setDataIter(&train_iter, "Train", data_files, batch_size); + if (!setDataIter(&train_iter, "Train", data_files, batch_size)) { + return 1; + } auto val_iter = MXDataIter("MNISTIter"); - setDataIter(&val_iter, "Label", data_files, batch_size); + if (!setDataIter(&val_iter, "Label", data_files, batch_size)) { + return 1; + } auto net = mlp(layers); diff --git a/cpp-package/example/utils.h b/cpp-package/example/utils.h index 2ed5c4c11..020d1ec58 100644 --- a/cpp-package/example/utils.h +++ b/cpp-package/example/utils.h @@ -40,12 +40,13 @@ bool check_datafiles(const std::vector<std::string> &data_files) { } } return true; - } +} bool setDataIter(MXDataIter *iter, const std::string &useType, const std::vector<std::string> &data_files, int batch_size) { - if (!check_datafiles(data_files)) + if (!check_datafiles(data_files)) { return false; + } iter->SetParam("batch_size", batch_size); iter->SetParam("shuffle", 1); diff --git a/cpp-package/include/mxnet-cpp/executor.hpp b/cpp-package/include/mxnet-cpp/executor.hpp index 0aa698174..acb6b461d 100644 --- a/cpp-package/include/mxnet-cpp/executor.hpp +++ b/cpp-package/include/mxnet-cpp/executor.hpp @@ -32,6 +32,7 @@ #include "mxnet-cpp/executor.h" #include "mxnet-cpp/optimizer.h" + namespace mxnet { namespace cpp { inline Executor::Executor(const Symbol &symbol, Context context, @@ -71,8 +72,8 @@ inline Executor::Executor(const Symbol &symbol, Context context, dev_ids.push_back(s.second.GetDeviceId()); } - ExecutorHandle *shared_exec_handle = - shared_exec == nullptr ? nullptr : &shared_exec->handle_; + ExecutorHandle shared_exec_handle = + shared_exec == nullptr ? nullptr : shared_exec->handle_; CHECK_EQ(MXExecutorBindEX(symbol.GetHandle(), context.GetDeviceType(), context.GetDeviceId(), group_to_ctx.size(), diff --git a/cpp-package/include/mxnet-cpp/monitor.h b/cpp-package/include/mxnet-cpp/monitor.h index c1494d0bd..76e7ce836 100644 --- a/cpp-package/include/mxnet-cpp/monitor.h +++ b/cpp-package/include/mxnet-cpp/monitor.h @@ -70,8 +70,9 @@ class Monitor { /*! * \brief install callback to executor. Supports installing to multiple executors. * \param exe The executor to install to. + * \param monitor_all If true, monitor both input and output, otherwise monitor output only. */ - void install(Executor *exe); + void install(Executor *exe, bool monitor_all = false); /*! * \brief Start collecting stats for current batch. Call before calling forward.
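For reference, a minimal sketch of how the new `monitor_all` flag can be used from application code. The pattern, sampling interval, and the surrounding function are illustrative assumptions, not part of this patch; only `Monitor`, `install`, `tic`, and `toc_print` come from the C++ package itself.

```c++
#include <regex>
#include "mxnet-cpp/MxNetCpp.h"

using namespace mxnet::cpp;

// Assumes 'exe' is an executor that was bound elsewhere.
void RunMonitoredForward(Executor *exe) {
  // Sample every 100 batches, for arrays whose name contains "weight".
  Monitor mon(100, std::regex(".*weight"));
  // monitor_all = true monitors operator inputs as well as outputs.
  mon.install(exe, true);
  mon.tic();              // start collecting stats for this batch
  exe->Forward(false);    // inference-style forward pass
  mon.toc_print();        // print the collected stats
}
```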
diff --git a/cpp-package/include/mxnet-cpp/monitor.hpp b/cpp-package/include/mxnet-cpp/monitor.hpp index f3584e2e8..4439e1bd3 100644 --- a/cpp-package/include/mxnet-cpp/monitor.hpp +++ b/cpp-package/include/mxnet-cpp/monitor.hpp @@ -43,10 +43,10 @@ inline Monitor::Monitor(int interval, std::regex pattern, StatFunc stat_func) : interval(interval), pattern(pattern), stat_func(stat_func), step(0) { } -inline void Monitor::install(Executor *exe) { - MXExecutorSetMonitorCallback(exe->handle_, - static_cast<ExecutorMonitorCallback>(&Monitor::executor_callback), - this); +inline void Monitor::install(Executor *exe, bool monitor_all) { + MXExecutorSetMonitorCallbackEX(exe->handle_, + static_cast<ExecutorMonitorCallback>(&Monitor::executor_callback), + this, monitor_all); exes.push_back(exe); } diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index ca430ec99..65ba247c2 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -97,7 +97,8 @@ class Arg: 'double':'double',\ 'double or None':'dmlc::optional<double>',\ 'Shape or None':'dmlc::optional<Shape>',\ - 'string':'const std::string&'} + 'string':'const std::string&',\ + 'tuple of <float>':'nnvm::Tuple<mx_float>'} name = '' type = '' description = '' @@ -407,6 +408,7 @@ def ParseAllOps(): "#include \"mxnet-cpp/op_util.h\"\n" "#include \"mxnet-cpp/operator.h\"\n" "#include \"dmlc/optional.h\"\n" + "#include \"nnvm/tuple.h\"\n" "\n" "namespace mxnet {\n" "namespace cpp {\n" diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh index 4a17d8d34..7abdef481 100755 --- a/cpp-package/tests/ci_test.sh +++ b/cpp-package/tests/ci_test.sh @@ -55,3 +55,11 @@ cp ../../build/cpp-package/example/test_score . ./test_score 0.93 sh unittests/unit_test_mlp_csv.sh + +cd inference +cp ../../../build/cpp-package/example/inception_inference . +./unit_test_inception_inference.sh + +cp ../../../build/cpp-package/example/sentiment_analysis_rnn . +./unit_test_sentiment_analysis_rnn.sh +cd ..
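The tuple mapping added to OpWrapperGenerator.py means generated operator wrappers can accept tuple-valued parameters as typed `nnvm::Tuple` objects instead of preformatted strings. A minimal sketch of the type in isolation follows; it assumes the nnvm headers are on the include path, and the printed format in the comment is indicative rather than guaranteed:

```c++
#include <iostream>
#include <sstream>
#include "nnvm/tuple.h"

int main() {
  // A "tuple of <float>" argument is represented as a typed tuple.
  nnvm::Tuple<float> scales{0.5f, 1.0f, 2.0f};

  // Generated wrappers hand the tuple to Operator::SetParam, which serializes it
  // through the stream operator into the textual form the backend parses.
  std::ostringstream os;
  os << scales;
  std::cout << os.str() << std::endl;  // prints a bracketed list such as [0.5,1,2]
  return 0;
}
```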
diff --git a/docker/Dockerfiles/Dockerfile.in.scala b/docker/Dockerfiles/Dockerfile.in.scala index 6898126c7..92e098384 100755 --- a/docker/Dockerfiles/Dockerfile.in.scala +++ b/docker/Dockerfiles/Dockerfile.in.scala @@ -4,4 +4,4 @@ COPY install/scala.sh install/ RUN install/scala.sh -RUN cd mxnet && make scalapkg $BUILD_OPTS +RUN cd mxnet/scala-package && mvn package diff --git a/docs/_static/js/docversion.js b/docs/_static/js/docversion.js index 1119f4ec1..320179854 100644 --- a/docs/_static/js/docversion.js +++ b/docs/_static/js/docversion.js @@ -18,16 +18,19 @@ */ /* Set the version of the website */ -function setVersion(){ - let doc = window.location.pathname.match(/^\/(api\/.*)$/) || window.location.pathname.match(/^\/versions\/[^*]+\/(api\/.*)$/); +function setVersion(anchor){ + if (arguments.length==0) { + anchor = window.location.hash + }; + let doc = window.location.pathname.match(/^\/versions\/[^\/]+\/([^*]+.*)$/); if (doc) { if (document.getElementById('dropdown-menu-position-anchor-version')) { versionNav = $('#dropdown-menu-position-anchor-version a.main-nav-link'); $(versionNav).each( function( index, el ) { currLink = $( el ).attr('href'); - version = currLink.match(/\/versions\/([0-9.master]+)\//); + version = currLink.match(/\/versions\/([^\/]+)\//); if (version) { - versionedDoc = '/versions/' + version[1] + '/' + doc[1] + (window.location.hash || ''); + versionedDoc = '/versions/' + version[1] + '/' + doc[1] + (anchor || '') + (window.location.search || ''); $( el ).attr('href', versionedDoc); } }); @@ -40,5 +43,5 @@ $(document).ready(function () { }); $('a.reference.internal').click(function(){ - setVersion(); + setVersion($(this).attr("href")); }); diff --git a/docs/_static/searchtools_custom.js b/docs/_static/searchtools_custom.js index 5f8c30a24..adeb0c2ab 100644 --- a/docs/_static/searchtools_custom.js +++ b/docs/_static/searchtools_custom.js @@ -457,7 +457,7 @@ var Search = { highlightstring + item[2]).html(item[1])); } else { // normal html builders - var baseURL = 'https://' + window.location.hostname + '/'; + var baseURL = window.location.protocol + '//' + window.location.hostname + '/'; var urlHref = window.location.href; let urlSplits = urlHref.split("/"); let versionString = ''; @@ -570,7 +570,7 @@ var Search = { } Search.title.text(_('Search Results')); if (!resultCount) - Search.status.text(_('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.')); + Search.status.text(_('Your search did not match any documents in this version of the documentation. You can use the dropdown selector in the navigation bar to try another version. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.')); else Search.status.text(_('Search finished, found %s page(s) matching the search query.').replace('%s', resultCount)); Search.status.fadeIn(500); diff --git a/docs/api/python/ndarray/ndarray.md b/docs/api/python/ndarray/ndarray.md index 6419c4ed4..2df18c286 100644 --- a/docs/api/python/ndarray/ndarray.md +++ b/docs/api/python/ndarray/ndarray.md @@ -659,6 +659,7 @@ The `ndarray` package provides several classes: relu sigmoid erf + erfinv ``` ### More diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md index 9eba26180..0fc2aa7c6 100644 --- a/docs/api/python/symbol/symbol.md +++ b/docs/api/python/symbol/symbol.md @@ -659,6 +659,7 @@ Composite multiple symbols into a new one by an operator. 
relu sigmoid erf + erfinv ``` ### More diff --git a/docs/build_version_doc/artifacts/.htaccess b/docs/build_version_doc/artifacts/.htaccess index 6bf3a659c..f334982a3 100644 --- a/docs/build_version_doc/artifacts/.htaccess +++ b/docs/build_version_doc/artifacts/.htaccess @@ -4,15 +4,10 @@ RewriteRule ^get_started/why_mxnet.html$ %{ENV:default_version}/faq/why_mxnet.ht RewriteRule ^get_started.*$ %{ENV:default_version}/install/ [R=301,L] RewriteRule ^how_to.*$ %{ENV:default_version}/faq/ [R=301,L] RewriteRule ^api/python/symbol.html$ %{ENV:default_version}/api/python/symbol/symbol.html [R=301,L] -RewriteRule ^community/index.html$ %{ENV:default_version}/community/contribute.html [R=301,L] # Navigation bar redirects to latest info -RewriteRule ^versions\/[0-9.]+\/architecture/(.*)$ %{ENV:default_version}/architecture/$1 [R=301,L] -RewriteRule ^versions\/[0-9.]+\/community/(.*)$ %{ENV:default_version}/community/$1 [R=301,L] -RewriteRule ^versions\/[0-9.]+\/faq/(.*)$ %{ENV:default_version}/faq/$1 [R=301,L] -RewriteRule ^versions\/[0-9.]+\/gluon/(.*)$ %{ENV:default_version}/gluon/$1 [R=301,L] -RewriteRule ^versions\/[0-9.]+\/install/(.*)$ %{ENV:default_version}/install/$1 [R=301,L] -RewriteRule ^versions\/[0-9.]+\/tutorials/(.*)$ %{ENV:default_version}/tutorials/$1 [R=301,L] +RewriteCond %{ENV:default_version}#\/$1 !^([^#]+)#\1$ +RewriteRule ^(versions\/[^\/]+)?(?:\/)?(faq|community|install|gluon|tutorials|architecture)(.*)?$ %{ENV:default_version}/$2$3 [R=301,L] # Redirect navbar APIs that did not exist RewriteRule ^versions/0.11.0/api/python/contrib/onnx.html %{ENV:default_version}/error/api.html [R=301,L] @@ -26,4 +21,4 @@ RewriteRule ^versions/1.0.0/api/clojure/.*$ %{ENV:default_version}/error/api.htm RewriteRule ^versions/1.1.0/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L] RewriteRule ^versions/1.2.1/api/clojure/.*$ %{ENV:default_version}/error/api.html [R=301,L] -ErrorDocument 404 https://mxnet.incubator.apache.org/error/404.html +ErrorDocument 404 /error/404.html diff --git a/docs/build_version_doc/artifacts/404.html b/docs/build_version_doc/artifacts/404.html new file mode 100644 index 000000000..86c91f738 --- /dev/null +++ b/docs/build_version_doc/artifacts/404.html @@ -0,0 +1,50 @@
+Page Does Not Exist
+If you’re here that means you requested a page that doesn’t exist. Sorry about that! Maybe try the search box to find what you’re looking for, or navigate to the Home Page. Also, make sure you’re looking in the correct version, as some features may only be available in newer versions or the master branch.
diff --git a/docs/build_version_doc/artifacts/api.html b/docs/build_version_doc/artifacts/api.html new file mode 100644 index 000000000..001b77192 --- /dev/null +++ b/docs/build_version_doc/artifacts/api.html @@ -0,0 +1,50 @@
+API Not Available
+You selected an API that is not available for this version of MXNet. Try a more recent version of MXNet, or go to the master branch.
diff --git a/docs/build_version_doc/artifacts/index.html b/docs/build_version_doc/artifacts/index.html deleted file mode 100644 index 51eced8f4..000000000 --- a/docs/build_version_doc/artifacts/index.html +++ /dev/null @@ -1,73 +0,0 @@
-MXNet 1.2.0.rc0 Released
-We're excited to announce the release of MXNet 1.2.0.rc0! Check out the release notes for latest updates.
-Improved ONNX Support
-MXNet now supports importing ONNX models natively with the new ONNX-MXNet API. Try out the super resolution example, or a tutorial on fine-tuning an ONXX model with Gluon.
-Introducing the Scala Inference API
-A model loading and inference API is now available for Scala developers. Try out the examples for single shot detection and loading models for image classification.
-Gluon Model Zoo
-One-click pre-trained models, included in Gluon. Fast implementations of many state-of-the-art models, for plug-and-play effortless use.
-Examples
-Explore projects from simple demos, examples, tutorials to state-of-the-art research.
-Tutorials
-These tutorials introduce a few fundamental concepts in deep learning and how to implement them in MXNet.
diff --git a/docs/build_version_doc/update_all_version.sh b/docs/build_version_doc/update_all_version.sh index 0c91973bd..d3305d462 100755 --- a/docs/build_version_doc/update_all_version.sh +++ b/docs/build_version_doc/update_all_version.sh @@ -78,20 +78,6 @@ for tag in $tag_list; do echo "$tag" >> "$tag_file" done -function update_mxnet_css { - tag=$1 - echo "Begin update fixes.." - # All fixes are done on the master branch of mxnet-incubator repository - # During a nightly build, these fixes will be patched to all the versions in the asf-site repository including the master folder under versions directory. - # copy - - echo "Copying mxnet.css from artifacts folder..." - cp "artifacts/mxnet.css" "$built/versions/$tag/_static" - - echo "Update fixes complete.." -} - - # Update the specified tags with the Versions dropdown # Add various artifacts depending on the version @@ -101,16 +87,6 @@ for tag in $tag_list; do python AddVersion.py --root_url "$root_url" --file_path "$built/versions/$tag" --current_version "$tag" --tag_default "$tag_default" || exit 1 - # Patch any fixes to all versions except 0.11.0. - # Version 0.11.0 has old theme and does not make use of the current mxnet.css - # It also has its install page in /getting_started, so we skip updating that - if [ $tag != '0.11.0' ]; then - if [ -d $built/versions/$tag ]; then - echo "The $tag is going to be updated with new css and install pages." - update_mxnet_css $tag - fi - fi - # Update all the files that are required to go into the root folder or live version if [ $tag == $tag_default ] then diff --git a/docs/conf.py b/docs/conf.py index c6c8204d9..3b5c38dd2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -107,7 +107,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['3rdparty', 'api/python/model.md', 'build_version_doc', 'error', 'README.md', 'tutorial_utils', 'virtualenv'] +exclude_patterns = ['3rdparty', 'api/python/model.md', 'build_version_doc', 'README.md', 'tutorial_utils', 'virtualenv'] # The reST default role (used for this markup: `text`) to use for all documents. diff --git a/docs/error/404.md b/docs/error/404.md deleted file mode 100644 index 63ebdeb4c..000000000 --- a/docs/error/404.md +++ /dev/null @@ -1,3 +0,0 @@ -# Page Does Not Exist - -If you're here that means you requested a page that doesn't exist. Sorry about that! Maybe try the search box to find what you're looking for, or navigate to the [Home Page](../index.html). Also, make sure you're looking in the correct version, as some features may only be available in [newer versions](/~https://github.com/apache/incubator-mxnet/releases) or the [master branch](../versions/master). diff --git a/docs/error/api.md b/docs/error/api.md deleted file mode 100644 index 37de50291..000000000 --- a/docs/error/api.md +++ /dev/null @@ -1,3 +0,0 @@ -# API Not Available - -You selected an API that is not available for this version of MXNet. Try a more recent version of MXNet, or go to the [master](../versions/master/) version. diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md index 98057d0d7..83368bf4d 100644 --- a/docs/faq/env_var.md +++ b/docs/faq/env_var.md @@ -145,6 +145,10 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - If true, MXNet tries to use GPU peer-to-peer communication, if available on your device, when kvstore's type is `device`. 
+* MXNET_UPDATE_ON_KVSTORE + - Values: 0(false) or 1(true) ```(default=1)``` + - If true, weight updates are performed during the communication step, if possible. + ## Memonger * MXNET_BACKWARD_DO_MIRROR @@ -218,6 +222,31 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca - When the array size is bigger than or equal to this threshold, NDArray::Copy(from, to) is implemented by OpenMP with the Recommended OMP Thread Count. - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented by memcpy in a single thread. +* MXNET_OPTIMIZER_AGGREGATION_SIZE + - Values: Int ```(default=4)``` + - Maximum value is 60. + - This variable controls how many weights will be updated in a single call to the optimizer (for optimizers that support aggregation, currently limited to SGD). + +* MXNET_CPU_TEMP_COPY + - Values: Int ```(default=4)``` + - This variable controls how many temporary memory resources to create for all CPU contexts for use in operators. + +* MXNET_GPU_TEMP_COPY + - Values: Int ```(default=1)``` + - This variable controls how many temporary memory resources to create for each GPU context for use in operators. + +* MXNET_CPU_PARALLEL_RAND_COPY + - Values: Int ```(default=1)``` + - This variable controls how many parallel random number generator resources to create for all CPU contexts for use in operators. + +* MXNET_GPU_PARALLEL_RAND_COPY + - Values: Int ```(default=4)``` + - This variable controls how many parallel random number generator resources to create for each GPU context for use in operators. + +* MXNET_GPU_CUDNN_DROPOUT_STATE_COPY + - Values: Int ```(default=4)``` + - This variable controls how many CuDNN dropout state resources to create for each GPU context for use in operators. + Settings for Minimum Memory Usage --------------------------------- - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1``` diff --git a/docs/install/index.md b/docs/install/index.md index 319e72a32..ad3d083a7 100644 --- a/docs/install/index.md +++ b/docs/install/index.md @@ -65,8 +65,8 @@ Indicate your preferred configuration. Then, follow the customized commands to i
@@ -479,9 +479,7 @@ You can use the Maven packages defined in the following dependency to include MX org.apache.mxnet mxnet-full_2.11-linux-x86_64-gpu - system - 1.4.0 - /system/path/to/jar/mxnet-full_2.11-linux-x86_64-gpu-1.4.0-SNAPSHOT.jar + [1.4.0, ) ``` @@ -498,9 +496,7 @@ You can use the Maven packages defined in the following dependency to include MX org.apache.mxnet mxnet-full_2.11-linux-x86_64-cpu - system - 1.4.0 - /system/path/to/jar/mxnet-full_2.11-linux-x86_64-cpu-1.4.0-SNAPSHOT.jar + [1.4.0, ) ```
@@ -781,10 +777,8 @@ You can use the Maven packages defined in the following dependency to include MX ```html org.apache.mxnet - mxnet-full_2.11-osx-x86_64-cpu - system - 1.4.0 - /system/path/to/jar/mxnet-full_2.11-osx-x86_64-cpu-1.4.0-SNAPSHOT.jar + mxnet-full_2.11-linux-x86_64-cpu + [1.4.0, ) ```
@@ -797,30 +791,6 @@ Not available at this time.
-You can use the Maven packages defined in the following `dependency` to include MXNet in your Clojure project. To maximize leverage, the Clojure package has been built on the existing Scala package. Please refer to the MXNet-Scala setup guide for a detailed set of instructions to help you with the setup process that is required to use the Clojure dependency.
-maven badge
-```html
- org.apache.mxnet.contrib.clojure
- clojure-mxnet-osx-cpu
-```
-Not available at this time.

@@ -1164,11 +1134,36 @@ For more installation options, refer to the MXNet W
+
+ +MXNet is available on several cloud providers with GPU support. You can also find GPU/CPU-hybrid support for use cases like scalable inference, or even fractional GPU support with AWS Elastic Inference. + +* **Alibaba** + - [NVIDIA VM](https://docs.nvidia.com/ngc/ngc-alibaba-setup-guide/launching-nv-cloud-vm-console.html#launching-nv-cloud-vm-console) +* **Amazon Web Services** + - [Amazon SageMaker](https://aws.amazon.com/sagemaker/) - Managed training and deployment of MXNet models + - [AWS Deep Learning AMI](https://aws.amazon.com/machine-learning/amis/) - Preinstalled Conda environments for Python 2 or 3 with MXNet, CUDA, cuDNN, MKL-DNN, and AWS Elastic Inference + - [Dynamic Training on AWS](/~https://github.com/awslabs/dynamic-training-with-apache-mxnet-on-aws) - experimental manual EC2 setup or semi-automated CloudFormation setup + - [NVIDIA VM](https://aws.amazon.com/marketplace/pp/B076K31M1S) +* **Google Cloud Platform** + - [NVIDIA VM](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/nvidia_gpu_cloud_image) +* **Microsoft Azure** + - [NVIDIA VM](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/nvidia.ngc_azure_17_11?tab=Overview) +* **Oracle Cloud** + - [NVIDIA VM](https://docs.cloud.oracle.com/iaas/Content/Compute/References/ngcimage.htm) -AWS Marketplace distributes Deep Learning AMIs (Amazon Machine Image) with MXNet pre-installed. You can launch one of these Deep Learning AMIs by following instructions in the [AWS Deep Learning AMI Developer Guide](http://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html). +All NVIDIA VMs use the [NVIDIA MXNet Docker container](https://ngc.nvidia.com/catalog/containers/nvidia:mxnet). +Follow the [container usage instructions](https://ngc.nvidia.com/catalog/containers/nvidia:mxnet) found in [NVIDIA's container repository](https://ngc.nvidia.com/). -You can also run distributed deep learning with *MXNet* on AWS using [Cloudformation Template](/~https://github.com/awslabs/deeplearning-cfn/blob/master/README.md). +
+
+MXNet should work on any cloud provider's CPU-only instances. Follow the Python pip install instructions, Docker instructions, or try the following preinstalled option. + +* **Amazon Web Services** + - [AWS Deep Learning AMI](https://aws.amazon.com/machine-learning/amis/) - Preinstalled Conda environments for Python 2 or 3 with MXNet and MKL-DNN. + +
@@ -1405,6 +1400,7 @@ You are now ready to run MXNet on your NVIDIA Jetson TX2 device.
+ # Source Download
Download your required version of MXNet and build from source. diff --git a/docs/install/java_setup.md b/docs/install/java_setup.md index 0075e9205..bd20c9596 100644 --- a/docs/install/java_setup.md +++ b/docs/install/java_setup.md @@ -85,6 +85,13 @@ Also, add the dependency which corresponds to your platform to the `dependencies The official Java Packages will be released with the release of MXNet 1.4 and will be available on [MXNet Maven package repository](https://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.mxnet%22).
+### Eclipse IDE Support +You can convert your existing Maven project to a project that can run in Eclipse by running: +``` +mvn eclipse:eclipse +``` +This can be done once you have your Maven project properly configured. + ## Source The previously mentioned setup with Maven is recommended. Otherwise, the following instructions for macOS and Ubuntu are provided for reference only: @@ -99,11 +106,10 @@ The previously mentioned setup with Maven is recommended. Otherwise, the followi #### Build Java from an Existing MXNet Installation -If you have already built MXNet **from source** and are looking to setup Java from that point, you may simply run the following from the MXNet source root: +If you have already built MXNet **from source** and are looking to set up Java from that point, you may simply run the following from the MXNet `scala-package` folder: ``` -make scalapkg -make scalainstall +mvn install ``` This will install both the Java Inference API and the required MXNet-Scala package.
diff --git a/docs/install/scala_setup.md b/docs/install/scala_setup.md index 98e752b21..9ee9ceac3 100644 --- a/docs/install/scala_setup.md +++ b/docs/install/scala_setup.md @@ -89,11 +89,10 @@ The previously mentioned setup with Maven is recommended. Otherwise, the followi #### Build Scala from an Existing MXNet Installation -If you have already built MXNet **from source** and are looking to setup Scala from that point, you may simply run the following from the MXNet source root: +If you have already built MXNet **from source** and are looking to set up Scala from that point, you may simply run the following from the MXNet `scala-package` folder: ``` -make scalapkg -make scalainstall +mvn install ```
diff --git a/docs/mxdoc.py b/docs/mxdoc.py index 52f050e0a..280f2a6ce 100644 --- a/docs/mxdoc.py +++ b/docs/mxdoc.py @@ -16,6 +16,7 @@ # under the License. """A sphnix-doc plugin to build mxnet docs""" +from __future__ import print_function import subprocess import re import os @@ -49,6 +50,7 @@ _CLOJURE_DOCS = parser.getboolean(_DOC_SET, 'clojure_docs') _DOXYGEN_DOCS = parser.getboolean(_DOC_SET, 'doxygen_docs') _R_DOCS = parser.getboolean(_DOC_SET, 'r_docs') +_ARTIFACTS = parser.getboolean(_DOC_SET, 'artifacts') # white list to evaluate the code block output, such as ['tutorials/gluon'] _EVAL_WHILTELIST = [] @@ -87,10 +89,10 @@ def generate_doxygen(app): def build_mxnet(app): """Build mxnet .so lib""" if not os.path.exists(os.path.join(app.builder.srcdir, '..', 'config.mk')): - _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) DEBUG=1 USE_MKLDNN=0" % + _run_cmd("cd %s/.. && cp make/config.mk config.mk && make -j$(nproc) USE_MKLDNN=0 USE_CPP_PACKAGE=1 " % app.builder.srcdir) else: - _run_cmd("cd %s/.. && make -j$(nproc) DEBUG=1 USE_MKLDNN=0" % + _run_cmd("cd %s/.. && make -j$(nproc) USE_MKLDNN=0 USE_CPP_PACKAGE=1 " % app.builder.srcdir) def build_r_docs(app): @@ -104,8 +106,11 @@ def build_r_docs(app): def build_scala(app): """build scala for scala docs, java docs, and clojure docs to use""" - _run_cmd("cd %s/.. && make scalapkg" % app.builder.srcdir) - _run_cmd("cd %s/.. && make scalainstall" % app.builder.srcdir) + if any(v in _BUILD_VER for v in ['1.2.', '1.3.', '1.4.']): + _run_cmd("cd %s/.. && make scalapkg" % app.builder.srcdir) + _run_cmd("cd %s/.. && make scalainstall" % app.builder.srcdir) + else: + _run_cmd("cd %s/../scala-package && mvn -B install -DskipTests" % app.builder.srcdir) def build_scala_docs(app): """build scala doc and then move the outdir""" @@ -118,7 +123,7 @@ def build_scala_docs(app): '`find infer -name "*.jar" | tr "\\n" ":" `' ]) # There are unresolvable errors on mxnet 1.2.x. We are ignoring those errors while aborting the ci on newer versions - scala_ignore_errors = '; exit 0' if '1.2.' or '1.3.' 
in _BUILD_VER else '' + scala_ignore_errors = '; exit 0' if any(v in _BUILD_VER for v in ['1.2.', '1.3.']) else '' _run_cmd('cd {}; scaladoc `{}` -classpath {} -feature -deprecation {}' .format(scala_path, scala_doc_sources, scala_doc_classpath, scala_ignore_errors)) dest_path = app.builder.outdir + '/api/scala/docs' @@ -434,6 +439,22 @@ def add_buttons(app, docname, source): # source[i] = '\n'.join(lines) + +def copy_artifacts(app): + """Copies artifacts needed for website presentation""" + dest_path = app.builder.outdir + '/error' + source_path = app.builder.srcdir + '/build_version_doc/artifacts' + _run_cmd('cd ' + app.builder.srcdir) + _run_cmd('rm -rf ' + dest_path) + _run_cmd('mkdir -p ' + dest_path) + _run_cmd('cp ' + source_path + '/404.html ' + dest_path) + _run_cmd('cp ' + source_path + '/api.html ' + dest_path) + dest_path = app.builder.outdir + '/_static' + _run_cmd('rm -rf ' + dest_path) + _run_cmd('mkdir -p ' + dest_path) + _run_cmd('cp ' + app.builder.srcdir + '/_static/mxnet.css ' + dest_path) + + def setup(app): # If MXNET_DOCS_BUILD_MXNET is set something different than 1 # Skip the build step @@ -458,6 +479,9 @@ def setup(app): if _R_DOCS: print("Building R Docs!") app.connect("builder-inited", build_r_docs) + if _ARTIFACTS: + print("Copying Artifacts!") + app.connect("builder-inited", copy_artifacts) app.connect('source-read', convert_table) app.connect('source-read', add_buttons) app.add_config_value('recommonmark_config', { diff --git a/docs/settings.ini b/docs/settings.ini index 7de3268ab..e16177604 100644 --- a/docs/settings.ini +++ b/docs/settings.ini @@ -2,6 +2,7 @@ build_mxnet = 0 [document_sets_tutorial] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -9,6 +10,23 @@ r_docs = 0 scala_docs = 0 [document_sets_default] +artifacts = 1 +clojure_docs = 1 +doxygen_docs = 1 +java_docs = 1 +r_docs = 0 +scala_docs = 1 + +[document_sets_1.4.0] +artifacts = 0 +clojure_docs = 1 +doxygen_docs = 1 +java_docs = 1 +r_docs = 0 +scala_docs = 1 + +[document_sets_v1.4.x] +artifacts = 0 clojure_docs = 1 doxygen_docs = 1 java_docs = 1 @@ -16,6 +34,7 @@ r_docs = 0 scala_docs = 1 [document_sets_1.3.1] +artifacts = 0 clojure_docs = 1 doxygen_docs = 1 java_docs = 0 @@ -23,6 +42,7 @@ r_docs = 0 scala_docs = 1 [document_sets_1.3.0] +artifacts = 0 clojure_docs = 1 doxygen_docs = 1 java_docs = 0 @@ -30,13 +50,15 @@ r_docs = 0 scala_docs = 1 [document_sets_v1.3.x] -clojure_docs = 1 +artifacts = 0 +clojure_docs = 0 doxygen_docs = 1 java_docs = 0 r_docs = 0 -scala_docs = 1 +scala_docs = 0 [document_sets_1.2.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -44,6 +66,7 @@ r_docs = 0 scala_docs = 1 [document_sets_v1.2.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -51,6 +74,7 @@ r_docs = 0 scala_docs = 1 [document_sets_1.1.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -58,6 +82,7 @@ r_docs = 0 scala_docs = 0 [document_sets_v1.1.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -65,6 +90,7 @@ r_docs = 0 scala_docs = 0 [document_sets_1.0.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -72,6 +98,7 @@ r_docs = 0 scala_docs = 0 [document_sets_v1.0.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -79,6 +106,7 @@ r_docs = 0 scala_docs = 0 [document_sets_0.12.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -86,6 +114,7 @@ r_docs = 0 scala_docs = 0 [document_sets_v0.12.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -93,6 
+122,7 @@ r_docs = 0 scala_docs = 0 [document_sets_0.11.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 @@ -100,6 +130,7 @@ r_docs = 0 scala_docs = 0 [document_sets_v0.11.0] +artifacts = 0 clojure_docs = 0 doxygen_docs = 1 java_docs = 0 diff --git a/docs/tutorial_utils/data/oxford_102_flower_dataset.py b/docs/tutorial_utils/data/oxford_102_flower_dataset.py new file mode 100644 index 000000000..0dcae2209 --- /dev/null +++ b/docs/tutorial_utils/data/oxford_102_flower_dataset.py @@ -0,0 +1,219 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +""" +This scripts downloads and prepares the Oxford 102 Category Flower Dataset for training +Dataset is from: http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +Script is modified from: /~https://github.com/Arsey/keras-transfer-learning-for-oxford102 +""" + +import glob +import os +import tarfile +from shutil import copyfile + +import numpy as np +from mxnet import gluon +from scipy.io import loadmat + +label_names = [ + 'pink primrose', + 'hard-leaved pocket orchid', + 'canterbury bells', + 'sweet pea', + 'english marigold', + 'tiger lily', + 'moon orchid', + 'bird of paradise', + 'monkshood', + 'globe thistle', + 'snapdragon', + "colt's foot", + 'king protea', + 'spear thistle', + 'yellow iris', + 'globe-flower', + 'purple coneflower', + 'peruvian lily', + 'balloon flower', + 'giant white arum lily', + 'fire lily', + 'pincushion flower', + 'fritillary', + 'red ginger', + 'grape hyacinth', + 'corn poppy', + 'prince of wales feathers', + 'stemless gentian', + 'artichoke', + 'sweet william', + 'carnation', + 'garden phlox', + 'love in the mist', + 'mexican aster', + 'alpine sea holly', + 'ruby-lipped cattleya', + 'cape flower', + 'great masterwort', + 'siam tulip', + 'lenten rose', + 'barbeton daisy', + 'daffodil', + 'sword lily', + 'poinsettia', + 'bolero deep blue', + 'wallflower', + 'marigold', + 'buttercup', + 'oxeye daisy', + 'common dandelion', + 'petunia', + 'wild pansy', + 'primula', + 'sunflower', + 'pelargonium', + 'bishop of llandaff', + 'gaura', + 'geranium', + 'orange dahlia', + 'pink-yellow dahlia?', + 'cautleya spicata', + 'japanese anemone', + 'black-eyed susan', + 'silverbush', + 'californian poppy', + 'osteospermum', + 'spring crocus', + 'bearded iris', + 'windflower', + 'tree poppy', + 'gazania', + 'azalea', + 'water lily', + 'rose', + 'thorn apple', + 'morning glory', + 'passion flower', + 'lotus', + 'toad lily', + 'anthurium', + 'frangipani', + 'clematis', + 'hibiscus', + 'columbine', + 'desert-rose', + 'tree mallow', + 'magnolia', + 'cyclamen', + 'watercress', + 'canna lily', + 'hippeastrum ', + 'bee balm', + 'ball moss', + 'foxglove', + 'bougainvillea', + 'camellia', + 'mallow', + 'mexican petunia', + 'bromelia', + 'blanket flower', + 'trumpet creeper', + 
'blackberry lily' +] + +def download_data(): + data_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/' + image_file_name = '102flowers.tgz' + label_file_name = 'imagelabels.mat' + setid_file_name = 'setid.mat' + + global data_path, image_path, label_path, setid_path + image_path = os.path.join(data_path, image_file_name) + label_path = os.path.join(data_path, label_file_name) + setid_path = os.path.join(data_path, setid_file_name) + # download the dataset into current directory + if not os.path.exists(data_path): + os.mkdir(data_path) + if not os.path.isfile(image_path): + gluon.utils.download(url=data_url + image_file_name, path=data_path) + if not os.path.exists(os.path.join(data_path, 'jpg')): + print("Extracting downloaded dataset...") + tarfile.open(image_path).extractall(path=data_path) + if not os.path.isfile(label_path): + gluon.utils.download(url=data_url + label_file_name, path=data_path) + if not os.path.isfile(setid_path): + gluon.utils.download(url=data_url + setid_file_name, path=data_path) + + +def prepare_data(): + # Read .mat file containing training, testing, and validation sets. + global data_path, image_path, label_path, setid_path, label_names + setid = loadmat(setid_path) + + idx_train = setid['trnid'][0] - 1 + idx_test = setid['tstid'][0] - 1 + idx_valid = setid['valid'][0] - 1 + + # Read .mat file containing image labels. + image_labels = loadmat(label_path)['labels'][0] + + # Subtract one to get 0-based labels + image_labels -= 1 + + # convert label from number to flower names + image_labels = [label_names[i] for i in image_labels] + # extracted images are stored in folder 'jpg' + files = sorted(glob.glob(os.path.join(data_path, 'jpg', '*.jpg'))) + file_label_pairs = np.array([i for i in zip(files, image_labels)]) + + # move files from extracted folder to train, test, valid + move_files('train', file_label_pairs[idx_test, :]) + move_files('test', file_label_pairs[idx_train, :]) + move_files('valid', file_label_pairs[idx_valid, :]) + + +def move_files(dir_name, file_label_pairs): + data_segment_dir = os.path.join(data_path, dir_name) + if not os.path.exists(data_segment_dir): + os.mkdir(data_segment_dir) + + for label in label_names: + class_dir = os.path.join(data_segment_dir, label) + if not os.path.exists(class_dir): + os.mkdir(class_dir) + + for file, label in file_label_pairs: + src = str(file) + dst = os.path.join(data_path, dir_name, label, src.split(os.sep)[-1]) + copyfile(src, dst) + + +def generate_synset(): + with open('synset.txt', 'w') as f: + # Gluon Dataset API will load synset in sorted order + for label in sorted(label_names): + f.write(label.strip() + '\n') + f.close() + + +def get_data(dir_name): + global data_path + data_path = dir_name + download_data() + prepare_data() + generate_synset() diff --git a/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md b/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md new file mode 100644 index 000000000..ab55a0e78 --- /dev/null +++ b/docs/tutorials/c++/mxnet_cpp_inference_tutorial.md @@ -0,0 +1,267 @@ +# MXNet C++ API inference tutorial + +## Overview +MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. +Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. 
We provide [Python](https://mxnet.incubator.apache.org/api/python/module/module.html), [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. + +This tutorial is a continuation of the [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html); here we will focus on the MXNet C++ API. We have slightly modified the code in the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) for our use case. + +## Prerequisites + +To complete this tutorial, you need to: +- Complete the training part of the [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html) +- Learn the basics of the [MXNet C++ API](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) + + +## Set up the MXNet C++ API +To use the C++ API in MXNet, you need to build MXNet from source with the C++ package. Please follow the [build from source guide](https://mxnet.incubator.apache.org/install/ubuntu_setup.html) and the [C++ Package documentation](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) +to enable the C++ API. +In short, you need to build MXNet from source with the `USE_CPP_PACKAGE` flag set to 1. For example: `make -j USE_CPP_PACKAGE=1`. + +## Load the model and run inference + +After you complete [the previous tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html), you will have the following output files: +1. Model architecture stored in `flower-recognition-symbol.json` +2. Model parameter values stored in `flower-recognition-0040.params` (`0040` reflects the 40 epochs we ran) +3. Label names stored in `synset.txt` +4. Mean and standard deviation values stored in `mean_std_224.nd` for image normalization. + + +Now we need to write the C++ code to load them and run prediction on a test image. +The full code is available in the [C++ Inference Example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference); we will walk you through it and point out the changes necessary for our use case. + + + +### Write a predictor using the MXNet C++ API + +In general, the C++ inference code should follow the four steps below. We can implement them in a Predictor class. +1. Load the pre-trained model +2. Load the parameters of the pre-trained model +3. Load the image to be classified into an NDArray and apply the image transformations we used in training +4.
Run the forward pass and predict the class of the input image + +```cpp +class Predictor { + public: + Predictor() {} + Predictor(const std::string& model_json_file, + const std::string& model_params_file, + const Shape& input_shape, + bool gpu_context_type = false, + const std::string& synset_file = "", + const std::string& mean_image_file = ""); + void PredictImage(const std::string& image_file); + ~Predictor(); + + private: + void LoadModel(const std::string& model_json_file); + void LoadParameters(const std::string& model_parameters_file); + void LoadSynset(const std::string& synset_file); + NDArray LoadInputImage(const std::string& image_file); + void LoadMeanImageData(); + void LoadDefaultMeanImageData(); + void NormalizeInput(const std::string& mean_image_file); + inline bool FileExists(const std::string& name) { + struct stat buffer; + return (stat(name.c_str(), &buffer) == 0); + } + NDArray mean_img; + std::map<std::string, NDArray> args_map; + std::map<std::string, NDArray> aux_map; + std::vector<std::string> output_labels; + Symbol net; + Executor *executor; + Shape input_shape; + NDArray mean_image_data; + NDArray std_dev_image_data; + Context global_ctx = Context::cpu(); + std::string mean_image_file; +}; +``` + +### Load the model, synset file, and normalization values + +In the Predictor constructor, you need to provide paths to the saved JSON and params files. After that, add the methods `LoadModel` and `LoadParameters` to load the network and its parameters. This part is the same as [the example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/inception_inference.cpp). + +Next, we need to load the synset file and the normalization values. We have made the following changes because our synset file contains flower names and we use both the mean and the standard deviation for image normalization. + +```c++ +/* + * The following function loads the synset file. + * This information will be used later to report the label of the input image. + */ +void Predictor::LoadSynset(const std::string& synset_file) { + if (!FileExists(synset_file)) { + LG << "Synset file " << synset_file << " does not exist"; + throw std::runtime_error("Synset file does not exist"); + } + LG << "Loading the synset file."; + std::ifstream fi(synset_file.c_str()); + if (!fi.is_open()) { + std::cerr << "Error opening synset file " << synset_file << std::endl; + throw std::runtime_error("Error in opening the synset file."); + } + std::string lemma; + while (getline(fi, lemma)) { + output_labels.push_back(lemma); + } + fi.close(); +} + +/* + * The following function loads the mean and standard deviation values. + * This data will be used for normalizing the image before running the forward + * pass. + * The output data has the same shape as that of the input image data. + */ +void Predictor::LoadMeanImageData() { + LG << "Load the mean image data that will be used to normalize " + << "the image before running forward pass."; + mean_image_data = NDArray(input_shape, global_ctx, false); + mean_image_data.SyncCopyFromCPU( + NDArray::LoadToMap(mean_image_file)["mean_img"].GetData(), + input_shape.Size()); + NDArray::WaitAll(); + std_dev_image_data = NDArray(input_shape, global_ctx, false); + std_dev_image_data.SyncCopyFromCPU( + NDArray::LoadToMap(mean_image_file)["std_img"].GetData(), + input_shape.Size()); + NDArray::WaitAll(); +} +``` + + + +### Load input image + +Now let's add a method that loads the input image we want to predict and converts it to an NDArray for prediction.
+```cpp +NDArray Predictor::LoadInputImage(const std::string& image_file) { + if (!FileExists(image_file)) { + LG << "Image file " << image_file << " does not exist"; + throw std::runtime_error("Image file does not exist"); + } + LG << "Loading the image " << image_file << std::endl; + std::vector<float> array; + cv::Mat mat = cv::imread(image_file); + /*resize pictures to (224, 224) according to the pretrained model*/ + int height = input_shape[2]; + int width = input_shape[3]; + int channels = input_shape[1]; + cv::resize(mat, mat, cv::Size(height, width)); + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) { + for (int j = 0; j < width; ++j) { + array.push_back(static_cast<float>(mat.data[(i * height + j) * 3 + c])); + } + } + } + NDArray image_data = NDArray(input_shape, global_ctx, false); + image_data.SyncCopyFromCPU(array.data(), input_shape.Size()); + NDArray::WaitAll(); + return image_data; +} +``` + +### Predict the image + +Finally, let's run the inference. It essentially uses the MXNet executor to do a forward pass. To run predictions on multiple images, you can load the images into a list of NDArrays and run prediction in batches. Note that the Predictor class may not be thread safe; calling it in multi-threaded environments has not been tested. To utilize multi-threaded prediction, you need to use the C predict API. Please follow the [C predict example](/~https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/predict-cpp). + +An additional step is to normalize the image NDArray values to `(0, 1)` and apply the mean and standard deviation we just loaded. + +```cpp +/* + * The following function runs the forward pass on the model. + * The executor is created in the constructor. + * + */ +void Predictor::PredictImage(const std::string& image_file) { + // Load the input image + NDArray image_data = LoadInputImage(image_file); + + // Normalize the image + image_data.Slice(0, 1) /= 255.0; + image_data -= mean_image_data; + image_data /= std_dev_image_data; + + LG << "Running the forward pass on model to predict the image"; + /* + * The executor->arg_arrays represent the arguments to the model. + * + * Copying the image_data that contains the NDArray of input image + * to the arg map of the executor. The input is stored with the key "data" in the map. + * + */ + image_data.CopyTo(&(executor->arg_dict()["data"])); + NDArray::WaitAll(); + + // Run the forward pass. + executor->Forward(false); + + // The output is available in executor->outputs. + auto array = executor->outputs[0].Copy(global_ctx); + NDArray::WaitAll(); + + /* + * Find out the maximum accuracy and the index associated with that accuracy. + * This is done by using the argmax operator on NDArray.
+ */ + auto predicted = array.ArgmaxChannel(); + NDArray::WaitAll(); + + int best_idx = predicted.At(0, 0); + float best_accuracy = array.At(0, best_idx); + + if (output_labels.empty()) { + LG << "The model predicts the highest accuracy of " << best_accuracy << " at index " + << best_idx; + } else { + LG << "The model predicts the input image to be a [" << output_labels[best_idx] + << " ] with Accuracy = " << best_accuracy << std::endl; + } +} +``` + +### Compile and run the inference code + +You can find the [full code for the inference example](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) in the `cpp-package` folder of the project; +to compile it, use this [Makefile](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/Makefile). + +Make a copy of the example code, rename it to `flower_inference`, and apply the changes we mentioned above. Now you will be able to compile and run inference. Run `make all`. Once this is complete, run inference with the following parameters. Remember to set your `LD_LIBRARY_PATH` to point to the MXNet library if you have not done so. + +```bash +make all +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:path/to/incubator-mxnet/lib +./flower_inference --symbol flower-recognition-symbol.json --params flower-recognition-0040.params --synset synset.txt --mean mean_std_224.nd --image ./data/test/lotus/image_01832.jpg +``` + +It will then predict your image: + +```bash +[17:38:51] resnet.cpp:150: Loading the model from flower-recognition-symbol.json + +[17:38:51] resnet.cpp:163: Loading the model parameters from flower-recognition-0040.params + +[17:38:52] resnet.cpp:190: Loading the synset file. +[17:38:52] resnet.cpp:211: Load the mean image data that will be used to normalize the image before running forward pass. +[17:38:52] resnet.cpp:263: Loading the image ./data/test/lotus/image_01832.jpg + +[17:38:52] resnet.cpp:299: Running the forward pass on model to predict the image +[17:38:52] resnet.cpp:331: The model predicts the input image to be a [lotus ] with Accuracy = 8.63046 +``` + + + +## What's next + +Now you can explore more ways to run inference and deploy your models: +1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) +2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/) +3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) +4. [MXNet Model Server Examples](/~https://github.com/awslabs/mxnet-model-server/tree/master/examples) + +## References + +1. [Gluon end to end tutorial](https://mxnet.apache.org/versions/master/tutorials/gluon/gluon_from_experiment_to_deployment.html) +2. [Gluon C++ inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) +3.
[Gluon C++ package](/~https://github.com/apache/incubator-mxnet/tree/master/cpp-package) \ No newline at end of file diff --git a/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md new file mode 100644 index 000000000..36c7a2e4d --- /dev/null +++ b/docs/tutorials/gluon/gluon_from_experiment_to_deployment.md @@ -0,0 +1,313 @@ + +# Gluon: from experiment to deployment, an end to end tutorial + +## Overview +MXNet Gluon API comes with a lot of great features, and it can provide you everything you need: from experimentation to deploying the model. In this tutorial, we will walk you through a common use case on how to build a model using gluon, train it on your data, and deploy it for inference. +This tutorial covers training and inference in Python, please continue to [C++ inference part](https://mxnet.incubator.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html) after you finish. + +Let's say you need to build a service that provides flower species recognition. A common problem is that you don't have enough data to train a good model. In such cases, a technique called Transfer Learning can be used to make a more robust model. +In Transfer Learning we make use of a pre-trained model that solves a related task, and was trained on a very large standard dataset, such as ImageNet. ImageNet is from a different domain, but we can utilize the knowledge in this pre-trained model to perform the new task at hand. + +Gluon provides State of the Art models for many of the standard tasks such as Classification, Object Detection, Segmentation, etc. In this tutorial we will use the pre-trained model [ResNet50 V2](https://arxiv.org/abs/1603.05027) trained on ImageNet dataset. This model achieves 77.11% top-1 accuracy on ImageNet. We seek to transfer as much knowledge as possible for our task of recognizing different species of flowers. + + + + +## Prerequisites + +To complete this tutorial, you need: + +- [Build MXNet from source](https://mxnet.incubator.apache.org/install/ubuntu_setup.html#build-mxnet-from-source) with Python(Gluon) and C++ Packages +- Learn the basics about Gluon with [A 60-minute Gluon Crash Course](https://gluon-crash-course.mxnet.io/) + + +## The Data + +We will use the [Oxford 102 Category Flower Dataset](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/) as an example to show you the steps. +We have prepared a utility file to help you download and organize your data into train, test, and validation sets. Run the following Python code to download and prepare the data: + + +```python +import mxnet as mx +data_util_file = "oxford_102_flower_dataset.py" +base_url = "https://raw.githubusercontent.com/apache/incubator-mxnet/master/docs/tutorial_utils/data/{}?raw=true" +mx.test_utils.download(base_url.format(data_util_file), fname=data_util_file) +import oxford_102_flower_dataset + +# download and move data to train, test, valid folders +path = './data' +oxford_102_flower_dataset.get_data(path) +``` + +Now your data will be organized into train, test, and validation sets, images belong to the same class are moved to the same folder. 
+ +## Training using Gluon + +### Define Hyper-parameters + +First, let's import the necessary packages: + + +```python +import math +import os +import time +from multiprocessing import cpu_count + +from mxnet import autograd +from mxnet import gluon, init +from mxnet.gluon import nn +from mxnet.gluon.data.vision import transforms +from mxnet.gluon.model_zoo.vision import resnet50_v2 +``` + +Next, we define the hyper-parameters that we will use for fine-tuning. We will use the [MXNet learning rate scheduler](https://mxnet.incubator.apache.org/tutorials/gluon/learning_rate_schedules.html) to adjust learning rates during training. +Here we set `epochs` to 1 for a quick demonstration; please change it to 40 for actual training. + +```python +classes = 102 +epochs = 1 +lr = 0.001 +per_device_batch_size = 32 +momentum = 0.9 +wd = 0.0001 + +lr_factor = 0.75 +# learning rate change at following epochs +lr_epochs = [10, 20, 30] + +num_gpus = mx.context.num_gpus() +num_workers = cpu_count() +ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()] +batch_size = per_device_batch_size * max(num_gpus, 1) +``` + +Now we will apply data augmentation to the training images. Augmentation makes minor alterations to the training images, so our model will consider them as distinct images. This can be very useful for fine-tuning on a relatively small dataset, and it will help improve the model. We can use the Gluon [DataSet API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), [DataLoader API](https://mxnet.incubator.apache.org/tutorials/gluon/datasets.html), and [Transform API](https://mxnet.incubator.apache.org/tutorials/gluon/data_augmentation.html) to load the images and apply the following data augmentations: +1. Randomly crop the image and resize it to 224x224 +2. Randomly flip the image horizontally +3. Randomly jitter color and add noise +4. Transpose the data from `[height, width, num_channels]` to `[num_channels, height, width]`, and map values from [0, 255] to [0, 1] +5. Normalize with the mean and standard deviation from the ImageNet dataset. + +For validation and inference, we only need to apply steps 1, 4, and 5. We also need to save the mean and standard deviation values for [inference using C++](https://mxnet.incubator.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html).
+ +```python +jitter_param = 0.4 +lighting_param = 0.1 + +# mean and std for normalizing image value in range (0,1) +mean = [0.485, 0.456, 0.406] +std = [0.229, 0.224, 0.225] + +training_transformer = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomFlipLeftRight(), + transforms.RandomColorJitter(brightness=jitter_param, contrast=jitter_param, + saturation=jitter_param), + transforms.RandomLighting(lighting_param), + transforms.ToTensor(), + transforms.Normalize(mean, std) +]) + +validation_transformer = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean, std) +]) + +# save mean and std NDArray values for inference +mean_img = mx.nd.stack(*[mx.nd.full((224, 224), m) for m in mean]) +std_img = mx.nd.stack(*[mx.nd.full((224, 224), s) for s in std]) +mx.nd.save('mean_std_224.nd', {"mean_img": mean_img, "std_img": std_img}) + +train_path = os.path.join(path, 'train') +val_path = os.path.join(path, 'valid') +test_path = os.path.join(path, 'test') + +# load the data and apply pre-processing (transforms) to the images +train_data = gluon.data.DataLoader( + gluon.data.vision.ImageFolderDataset(train_path).transform_first(training_transformer), + batch_size=batch_size, shuffle=True, num_workers=num_workers) + +val_data = gluon.data.DataLoader( + gluon.data.vision.ImageFolderDataset(val_path).transform_first(validation_transformer), + batch_size=batch_size, shuffle=False, num_workers=num_workers) + +test_data = gluon.data.DataLoader( + gluon.data.vision.ImageFolderDataset(test_path).transform_first(validation_transformer), + batch_size=batch_size, shuffle=False, num_workers=num_workers) +``` + +### Loading the pre-trained model + + +We will use the ResNet50_v2 model, which was pre-trained on the [ImageNet Dataset](http://www.image-net.org/) with 1000 classes. To match the classes in the flower dataset, we must redefine the last softmax (output) layer to have 102 outputs, then initialize the parameters. + +Before we move on to training, one unique Gluon feature you should be aware of is hybridization. It allows you to convert your imperative code to a static symbolic graph, which is much more efficient to execute. There are two main benefits of hybridizing your model: better performance and easier serialization for deployment. The best part is that it's as simple as just calling `net.hybridize()`. To learn more about Gluon hybridization, please follow the [hybridization tutorial](https://mxnet.incubator.apache.org/tutorials/gluon/hybrid.html).
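Hybridization applies to any Gluon block, not just the network we fine-tune below. As a minimal, self-contained sketch of the pattern (a toy network, not part of this tutorial's model):

```python
import mxnet as mx
from mxnet.gluon import nn

# toy two-layer network; hybridize() caches a symbolic graph on the first call
net = nn.HybridSequential()
with net.name_scope():
    net.add(nn.Dense(64, activation='relu'))
    net.add(nn.Dense(10))
net.initialize()
net.hybridize()

out = net(mx.nd.ones((1, 20)))  # first forward pass triggers graph compilation
print(out.shape)                # (1, 10)
```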
+ + + +```python +# load pre-trained resnet50_v2 from model zoo +finetune_net = resnet50_v2(pretrained=True, ctx=ctx) + +# replace the last output layer since the number of classes is different +with finetune_net.name_scope(): + finetune_net.output = nn.Dense(classes) +finetune_net.output.initialize(init.Xavier(), ctx=ctx) +# hybridize for better performance +finetune_net.hybridize() + +num_batch = len(train_data) + +# setup learning rate scheduler +iterations_per_epoch = math.ceil(num_batch) +# learning rate change at following steps +lr_steps = [epoch * iterations_per_epoch for epoch in lr_epochs] +schedule = mx.lr_scheduler.MultiFactorScheduler(step=lr_steps, factor=lr_factor, base_lr=lr) + +# setup optimizer with learning rate scheduler, metric, and loss function +sgd_optimizer = mx.optimizer.SGD(learning_rate=lr, lr_scheduler=schedule, momentum=momentum, wd=wd) +metric = mx.metric.Accuracy() +softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() +``` + +### Fine-tuning the model on your custom dataset + +Now let's define the test metrics and start fine-tuning. + + + +```python +def test(net, val_data, ctx): + metric = mx.metric.Accuracy() + for i, (data, label) in enumerate(val_data): + data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) + label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + outputs = [net(x) for x in data] + metric.update(label, outputs) + return metric.get() + +trainer = gluon.Trainer(finetune_net.collect_params(), optimizer=sgd_optimizer) + +# start with epoch 1 for easier learning rate calculation +for epoch in range(1, epochs + 1): + + tic = time.time() + train_loss = 0 + metric.reset() + + for i, (data, label) in enumerate(train_data): + # get the images and labels + data = gluon.utils.split_and_load(data, ctx_list=ctx, even_split=False) + label = gluon.utils.split_and_load(label, ctx_list=ctx, even_split=False) + with autograd.record(): + outputs = [finetune_net(x) for x in data] + loss = [softmax_cross_entropy(yhat, y) for yhat, y in zip(outputs, label)] + for l in loss: + l.backward() + + trainer.step(batch_size) + train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss) + metric.update(label, outputs) + + _, train_acc = metric.get() + train_loss /= num_batch + _, val_acc = test(finetune_net, val_data, ctx) + + print('[Epoch %d] Train-acc: %.3f, loss: %.3f | Val-acc: %.3f | learning-rate: %.3E | time: %.1f' % + (epoch, train_acc, train_loss, val_acc, trainer.learning_rate, time.time() - tic)) + +_, test_acc = test(finetune_net, test_data, ctx) +print('[Finished] Test-acc: %.3f' % (test_acc)) +``` + +The following is the training result: +``` +[Epoch 40] Train-acc: 0.945, loss: 0.354 | Val-acc: 0.955 | learning-rate: 4.219E-04 | time: 17.8 +[Finished] Test-acc: 0.952 +``` +In the previous example output, we trained the model using an [AWS p3.8xlarge instance](https://aws.amazon.com/ec2/instance-types/p3/) with 4 Tesla V100 GPUs. We were able to reach a test accuracy of 95.2% with 40 epochs in around 12 minutes. This was really fast because our model was pre-trained on a much larger dataset, ImageNet, with around 1.3 million images, and those pre-trained features transferred well to our small dataset. + + +### Save the fine-tuned model + + +We have now trained our custom model. This can be serialized into model files using the export function. The export function will export the model architecture into a `.json` file and model parameters into a `.params` file.
+ + + +```python +finetune_net.export("flower-recognition", epoch=epochs) + +``` + +`export` creates `flower-recognition-symbol.json` and `flower-recognition-0040.params` (`0040` reflects the 40 epochs we ran) in the current directory. These files can be used for model deployment in the next section. + +## Load the model and run inference using the MXNet Module API + +MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](/~https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. +Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](https://mxnet.incubator.apache.org/api/python/module/module.html), [Java](https://mxnet.incubator.apache.org/api/java/index.html), [Scala](https://mxnet.incubator.apache.org/api/scala/index.html), and [C++](https://mxnet.incubator.apache.org/api/c++/index.html) APIs. + +Here we will briefly introduce how to run inference using the Module API in Python. A more detailed explanation is available in the [Predict Image Tutorial](https://mxnet.incubator.apache.org/tutorials/python/predict_image.html). +In general, prediction consists of the following steps: +1. Load the model architecture (symbol file) and trained parameter values (params file) +2. Load the synset file for label names +3. Load the image and apply the same transformations we used on the validation dataset during training +4. Run a forward pass on the image data +5. Convert the output probabilities to a predicted label name + +```python +import numpy as np +from collections import namedtuple + +ctx = mx.cpu() +# load model symbol and params +sym, arg_params, aux_params = mx.model.load_checkpoint('flower-recognition', epochs) +mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None) +mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))], label_shapes=mod._label_shapes) +mod.set_params(arg_params, aux_params, allow_missing=True) + +# load synset for label names +with open('synset.txt', 'r') as f: + labels = [l.rstrip() for l in f] + +# load an image for prediction +img = mx.image.imread('./data/test/lotus/image_01832.jpg') +# apply the transform we used during training +img = validation_transformer(img) +# batchify +img = img.expand_dims(axis=0) +Batch = namedtuple('Batch', ['data']) +mod.forward(Batch([img])) +prob = mod.get_outputs()[0].asnumpy() +prob = np.squeeze(prob) +idx = np.argmax(prob) +print('probability=%f, class=%s' % (prob[idx], labels[idx])) +``` + +The following is the output; you can see the image has been correctly classified as a lotus. +``` +probability=9.798435, class=lotus +``` + +## What's next + +You can continue to the [next tutorial](https://mxnet.incubator.apache.org/versions/master/tutorials/c++/mxnet_cpp_inference_tutorial.html) on how to load the model we just trained and run inference using the MXNet C++ API. + +You can also find more ways to run inference and deploy your models here: +1. [Java Inference examples](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer) +2. [Scala Inference examples](https://mxnet.incubator.apache.org/tutorials/scala/) +3. [ONNX model inference examples](https://mxnet.incubator.apache.org/tutorials/onnx/inference_on_onnx_model.html) +4. [MXNet Model Server Examples](/~https://github.com/awslabs/mxnet-model-server/tree/master/examples) + +## References + +1.
[Transfer Learning for Oxford102 Flower Dataset](/~https://github.com/Arsey/keras-transfer-learning-for-oxford102) +2. [Gluon book on fine-tuning](https://gluon.mxnet.io/chapter08_computer-vision/fine-tuning.html) +3. [Gluon CV transfer learning tutorial](https://gluon-cv.mxnet.io/build/examples_classification/transfer_learning_minc.html) +4. [Gluon crash course](https://gluon-crash-course.mxnet.io/) +5. [Gluon CPP inference example](/~https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inference/) + + \ No newline at end of file diff --git a/docs/tutorials/gluon/hybrid.md b/docs/tutorials/gluon/hybrid.md index f11622bd6..17e9e1b20 100644 --- a/docs/tutorials/gluon/hybrid.md +++ b/docs/tutorials/gluon/hybrid.md @@ -154,33 +154,33 @@ However, that's not the case in Symbol API. It's not automatically broadcasted, | NDArray APIs | Description | |---|---| -| [*NDArray.\__add\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__add__) | x.\__add\__(y) <=> x+y <=> mx.nd.add(x, y) | -| [*NDArray.\__sub\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__sub__) | x.\__sub\__(y) <=> x-y <=> mx.nd.subtract(x, y) | -| [*NDArray.\__mul\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mul__) | x.\__mul\__(y) <=> x*y <=> mx.nd.multiply(x, y) | -| [*NDArray.\__div\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__div__) | x.\__div\__(y) <=> x/y <=> mx.nd.divide(x, y) | -| [*NDArray.\__mod\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mod__) | x.\__mod\__(y) <=> x%y <=> mx.nd.modulo(x, y) | -| [*NDArray.\__lt\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__lt__) | x.\__lt\__(y) <=> x<y <=> mx.nd.lesser(x, y) | -| [*NDArray.\__le\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__le__) | x.\__le\__(y) <=> x<=y <=> mx.nd.less_equal(x, y) | -| [*NDArray.\__gt\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__gt__) | x.\__gt\__(y) <=> x>y <=> mx.nd.greater(x, y) | -| [*NDArray.\__ge\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ge__) | x.\__ge\__(y) <=> x>=y <=> mx.nd.greater_equal(x, y)| -| [*NDArray.\__eq\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__eq__) | x.\__eq\__(y) <=> x==y <=> mx.nd.equal(x, y) | -| [*NDArray.\__ne\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ne__) | x.\__ne\__(y) <=> x!=y <=> mx.nd.not_equal(x, y) | +| [NDArray.\__add\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__add__) | x.\__add\__(y) <=> x+y <=> mx.nd.add(x, y) | +| [NDArray.\__sub\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__sub__) | x.\__sub\__(y) <=> x-y <=> mx.nd.subtract(x, y) | +| [NDArray.\__mul\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mul__) | x.\__mul\__(y) <=> x*y <=> mx.nd.multiply(x, y) | +| [NDArray.\__div\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__div__) | x.\__div\__(y) <=> x/y <=> mx.nd.divide(x, y) | +| 
[NDArray.\__mod\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__mod__) | x.\__mod\__(y) <=> x%y <=> mx.nd.modulo(x, y) | +| [NDArray.\__lt\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__lt__) | x.\__lt\__(y) <=> x<y <=> mx.nd.lesser(x, y) | +| [NDArray.\__le\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__le__) | x.\__le\__(y) <=> x<=y <=> mx.nd.less_equal(x, y) | +| [NDArray.\__gt\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__gt__) | x.\__gt\__(y) <=> x>y <=> mx.nd.greater(x, y) | +| [NDArray.\__ge\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ge__) | x.\__ge\__(y) <=> x>=y <=> mx.nd.greater_equal(x, y)| +| [NDArray.\__eq\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__eq__) | x.\__eq\__(y) <=> x==y <=> mx.nd.equal(x, y) | +| [NDArray.\__ne\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__ne__) | x.\__ne\__(y) <=> x!=y <=> mx.nd.not_equal(x, y) | The current workaround is to use corresponding broadcast operators for arithmetic and comparison to avoid potential hybridization failure when input shapes are different. | Symbol APIs | Description | |---|---| -|[*broadcast_add*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_add) | Returns element-wise sum of the input arrays with broadcasting. | -|[*broadcast_sub*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_sub) | Returns element-wise difference of the input arrays with broadcasting. | -|[*broadcast_mul*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mul) | Returns element-wise product of the input arrays with broadcasting. | -|[*broadcast_div*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_div) | Returns element-wise division of the input arrays with broadcasting. | -|[*broadcast_mod*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mod) | Returns element-wise modulo of the input arrays with broadcasting. | -|[*broadcast_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_equal) | Returns the result of element-wise *equal to* (==) comparison operation with broadcasting. | -|[*broadcast_not_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_not_equal) | Returns the result of element-wise *not equal to* (!=) comparison operation with broadcasting. | -|[*broadcast_greater*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater) | Returns the result of element-wise *greater than* (>) comparison operation with broadcasting. | -|[*broadcast_greater_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater_equal) | Returns the result of element-wise *greater than or equal to* (>=) comparison operation with broadcasting. | -|[*broadcast_lesser*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser) | Returns the result of element-wise *lesser than* (<) comparison operation with broadcasting.
| -|[*broadcast_lesser_equal*](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser_equal) | Returns the result of element-wise *lesser than or equal to* (<=) comparison operation with broadcasting. | +|[broadcast_add](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_add) | Returns element-wise sum of the input arrays with broadcasting. | +|[broadcast_sub](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_sub) | Returns element-wise difference of the input arrays with broadcasting. | +|[broadcast_mul](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mul) | Returns element-wise product of the input arrays with broadcasting. | +|[broadcast_div](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_div) | Returns element-wise division of the input arrays with broadcasting. | +|[broadcast_mod](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_mod) | Returns element-wise modulo of the input arrays with broadcasting. | +|[broadcast_equal](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_equal) | Returns the result of element-wise *equal to* (==) comparison operation with broadcasting. | +|[broadcast_not_equal](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_not_equal) | Returns the result of element-wise *not equal to* (!=) comparison operation with broadcasting. | +|[broadcast_greater](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater) | Returns the result of element-wise *greater than* (>) comparison operation with broadcasting. | +|[broadcast_greater_equal](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_greater_equal) | Returns the result of element-wise *greater than or equal to* (>=) comparison operation with broadcasting. | +|[broadcast_lesser](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser) | Returns the result of element-wise *lesser than* (<) comparison operation with broadcasting. | +|[broadcast_lesser_equal](https://mxnet.incubator.apache.org/api/python/symbol/symbol.html#mxnet.symbol.broadcast_lesser_equal) | Returns the result of element-wise *lesser than or equal to* (<=) comparison operation with broadcasting. | For example, if you want to add a NDarray to your input x, use `broadcast_add` instead of `+`: @@ -196,7 +196,7 @@ If you used `+`, it would still work before hybridization, but will throw an err Gluon's imperative interface is very flexible and allows you to print the shape of the NDArray. However, Symbol does not have shape attributes. As a result, you need to avoid printing shapes in `hybrid_forward`. 
Otherwise, you will get the following error: -```bash +``` AttributeError: 'Symbol' object has no attribute 'shape' ``` @@ -230,11 +230,11 @@ For example, avoid writing `x += y` and use `x = x + y`, otherwise you will get | NDArray in-place arithmetic operators | Description | |---|---| -|[*NDArray.\__iadd\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__iadd__) | x.\__iadd\__(y) <=> x+=y | -|[*NDArray.\__isub\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__isub__) | x.\__isub\__(y) <=> x-=y | -|[*NDArray.\__imul\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imul__) | x.\__imul\__(y) <=> x*=y | -|[*NDArray.\__idiv\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__idiv__) | x.\__rdiv\__(y) <=> x/=y | -|[*NDArray.\__imod\__*](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imod__) | x.\__rmod\__(y) <=> x%=y | +|[NDArray.\__iadd\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__iadd__) | x.\__iadd\__(y) <=> x+=y | +|[NDArray.\__isub\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__isub__) | x.\__isub\__(y) <=> x-=y | +|[NDArray.\__imul\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imul__) | x.\__imul\__(y) <=> x*=y | +|[NDArray.\__idiv\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__idiv__) | x.\__idiv\__(y) <=> x/=y | +|[NDArray.\__imod\__](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.NDArray.__imod__) | x.\__imod\__(y) <=> x%=y | diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 9457a409a..cad9099fc 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -73,6 +73,7 @@ Select API:  * [Advanced Learning Rate Schedules](/tutorials/gluon/learning_rate_schedules_advanced.html) * [Profiling MXNet Models](/tutorials/python/profiler.html) * [Hybridize Gluon models with control flows](/tutorials/control_flow/ControlFlowTutorial.html) + * [Gluon end to end from training to inference](/tutorials/gluon/gluon_from_experiment_to_deployment.html) * API Guides * Core APIs * NDArray @@ -173,6 +174,9 @@ Select API:  * Backends * [Subgraph API](/tutorials/c%2B%2B/subgraphAPI.html) +* Inference + * [C++ Inference](/tutorials/c%2B%2B/mxnet_cpp_inference_tutorial.html) +
## R Tutorials diff --git a/docs/tutorials/scala/mxnet_scala_on_intellij.md b/docs/tutorials/scala/mxnet_scala_on_intellij.md index a0bf24e34..769a6b4fe 100644 --- a/docs/tutorials/scala/mxnet_scala_on_intellij.md +++ b/docs/tutorials/scala/mxnet_scala_on_intellij.md @@ -135,7 +135,7 @@ mxnet ``` **ArtifactId** ``` -ArtifactId: scalaMXNet +scalaMXNet ``` **Version** ``` @@ -211,6 +211,7 @@ The project's `pom.xml` will be open for editing. <groupId>org.apache.mxnet</groupId> <artifactId>mxnet-full_2.11-osx-x86_64-cpu</artifactId> + <version>[1.3.1,)</version> <groupId>org.scala-lang</groupId> @@ -324,7 +325,7 @@ log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] **Step 6.** Build the project: - To build the project, from the menu choose Build, and then choose Build Project. - +* Note that if you run into ```bad option: '-make:transitive'```, please remove ```-make:transitive``` from `pom.xml` and the `<parameter value="-make:transitive" />` entry from `.idea/scala_compiler.xml`. **Step 7.** Run the Hello World App: diff --git a/example/README.md b/example/README.md index 2123104a1..dea7e289e 100644 --- a/example/README.md +++ b/example/README.md @@ -95,7 +95,7 @@ If your tutorial depends on specific packages, simply add them to this provision * [Gluon Examples](gluon) - several examples using the Gluon API * [Style Transfer](gluon/style_transfer) - a style transfer example using gluon * [Word Language Model](gluon/word_language_model) - an example that trains a multi-layer RNN on the Penn Treebank language modeling benchmark - * [SN-GAN](gluon/sn-gan) - an example that utilizes spectral normalization to train GAN(Generative adversarial network) using Gluon API + * [SN-GAN](gluon/sn_gan) - an example that utilizes spectral normalization to train a GAN (Generative Adversarial Network) using the Gluon API * [Image Classification with R](image-classification) - image classification on MNIST,CIFAR,ImageNet-1k,ImageNet-Full, with multiple GPU and distributed training. * [Kaggle 1st national data science bowl](kaggle-ndsb1) - an MXNet example for the Kaggle National Data Science Bowl 1 * [Kaggle 2nd national data science bowl](kaggle-ndsb2) - a tutorial for the Kaggle Second National Data Science Bowl diff --git a/example/autoencoder/README.md b/example/autoencoder/README.md index 7efa30a19..960636cd7 100644 --- a/example/autoencoder/README.md +++ b/example/autoencoder/README.md @@ -1,16 +1,20 @@ -# Example of Autencoder +# Example of a Convolutional Autoencoder -Autoencoder architecture is often used for unsupervised feature learning. This [link](http://ufldl.stanford.edu/tutorial/unsupervised/Autoencoders/) contains an introduction tutorial to autoencoders. This example illustrates a simple autoencoder using stack of fully-connected layers for both encoder and decoder. The number of hidden layers and size of each hidden layer can be customized using command line arguments. +Autoencoder architectures are often used for unsupervised feature learning. This [link](http://ufldl.stanford.edu/tutorial/unsupervised/Autoencoders/) contains an introductory tutorial to autoencoders. This example illustrates a simple autoencoder using a stack of convolutional layers for both the encoder and the decoder. -## Training Stages -This example uses a two-stage training. In the first stage, each layer of encoder and its corresponding decoder are trained separately in a layer-wise training loop. In the second stage the entire autoencoder network is fine-tuned end to end.
+ +![](https://cdn-images-1.medium.com/max/800/1*LSYNW5m3TN7xRX61BZhoZA.png) + +([Diagram source](https://towardsdatascience.com/autoencoders-introduction-and-implementation-3f40483b0a85)) + + +The idea of an autoencoder is to use a bottleneck architecture to encode the input and then try to decode it to reproduce the original. By doing so, the network learns to compress the information of the input effectively; the resulting embedding representation can then be used in several domains, for example as a featurized representation for visual search or for anomaly detection. ## Dataset -The dataset used in this example is [MNIST](http://yann.lecun.com/exdb/mnist/) dataset. This example uses scikit-learn module to download this dataset. -## Simple autoencoder example -mnist_sae.py: this example uses a simple auto-encoder architecture to encode and decode MNIST images with size of 28x28 pixels. It contains several command line arguments. Pass -h (or --help) to view all available options. To start the training on CPU (use --gpu option for training on GPU) using default options: +The dataset used in this example is the [FashionMNIST](/~https://github.com/zalandoresearch/fashion-mnist) dataset. + +## Variational Autoencoder + +You can check an example of a variational autoencoder [here](https://gluon.mxnet.io/chapter13_unsupervised-learning/vae-gluon.html) -``` -python mnist_sae.py -``` diff --git a/example/autoencoder/autoencoder.py b/example/autoencoder/autoencoder.py deleted file mode 100644 index 47931e573..000000000 --- a/example/autoencoder/autoencoder.py +++ /dev/null @@ -1,206 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License.
- -# pylint: disable=missing-docstring, arguments-differ -from __future__ import print_function - -import logging - -import mxnet as mx -import numpy as np -import model -from solver import Solver, Monitor - - -class AutoEncoderModel(model.MXModel): - def setup(self, dims, sparseness_penalty=None, pt_dropout=None, - ft_dropout=None, input_act=None, internal_act='relu', output_act=None): - self.N = len(dims) - 1 - self.dims = dims - self.stacks = [] - self.pt_dropout = pt_dropout - self.ft_dropout = ft_dropout - self.input_act = input_act - self.internal_act = internal_act - self.output_act = output_act - - self.data = mx.symbol.Variable('data') - for i in range(self.N): - if i == 0: - decoder_act = input_act - idropout = None - else: - decoder_act = internal_act - idropout = pt_dropout - if i == self.N-1: - encoder_act = output_act - odropout = None - else: - encoder_act = internal_act - odropout = pt_dropout - istack, iargs, iargs_grad, iargs_mult, iauxs = self.make_stack( - i, self.data, dims[i], dims[i+1], sparseness_penalty, - idropout, odropout, encoder_act, decoder_act - ) - self.stacks.append(istack) - self.args.update(iargs) - self.args_grad.update(iargs_grad) - self.args_mult.update(iargs_mult) - self.auxs.update(iauxs) - self.encoder, self.internals = self.make_encoder( - self.data, dims, sparseness_penalty, ft_dropout, internal_act, output_act) - self.decoder = self.make_decoder( - self.encoder, dims, sparseness_penalty, ft_dropout, internal_act, input_act) - if input_act == 'softmax': - self.loss = self.decoder - else: - self.loss = mx.symbol.LinearRegressionOutput(data=self.decoder, label=self.data) - - def make_stack(self, istack, data, num_input, num_hidden, sparseness_penalty=None, - idropout=None, odropout=None, encoder_act='relu', decoder_act='relu'): - x = data - if idropout: - x = mx.symbol.Dropout(data=x, p=idropout) - x = mx.symbol.FullyConnected(name='encoder_%d'%istack, data=x, num_hidden=num_hidden) - if encoder_act: - x = mx.symbol.Activation(data=x, act_type=encoder_act) - if encoder_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_encoder_%d' % istack, penalty=sparseness_penalty) - if odropout: - x = mx.symbol.Dropout(data=x, p=odropout) - x = mx.symbol.FullyConnected(name='decoder_%d'%istack, data=x, num_hidden=num_input) - if decoder_act == 'softmax': - x = mx.symbol.Softmax(data=x, label=data, prob_label=True, act_type=decoder_act) - elif decoder_act: - x = mx.symbol.Activation(data=x, act_type=decoder_act) - if decoder_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_decoder_%d' % istack, penalty=sparseness_penalty) - x = mx.symbol.LinearRegressionOutput(data=x, label=data) - else: - x = mx.symbol.LinearRegressionOutput(data=x, label=data) - - args = {'encoder_%d_weight'%istack: mx.nd.empty((num_hidden, num_input), self.xpu), - 'encoder_%d_bias'%istack: mx.nd.empty((num_hidden,), self.xpu), - 'decoder_%d_weight'%istack: mx.nd.empty((num_input, num_hidden), self.xpu), - 'decoder_%d_bias'%istack: mx.nd.empty((num_input,), self.xpu),} - args_grad = {'encoder_%d_weight'%istack: mx.nd.empty((num_hidden, num_input), self.xpu), - 'encoder_%d_bias'%istack: mx.nd.empty((num_hidden,), self.xpu), - 'decoder_%d_weight'%istack: mx.nd.empty((num_input, num_hidden), self.xpu), - 'decoder_%d_bias'%istack: mx.nd.empty((num_input,), self.xpu),} - args_mult = {'encoder_%d_weight'%istack: 1.0, - 'encoder_%d_bias'%istack: 2.0, - 'decoder_%d_weight'%istack: 1.0, - 
'decoder_%d_bias'%istack: 2.0,} - auxs = {} - if encoder_act == 'sigmoid' and sparseness_penalty: - auxs['sparse_encoder_%d_moving_avg' % istack] = mx.nd.ones(num_hidden, self.xpu) * 0.5 - if decoder_act == 'sigmoid' and sparseness_penalty: - auxs['sparse_decoder_%d_moving_avg' % istack] = mx.nd.ones(num_input, self.xpu) * 0.5 - init = mx.initializer.Uniform(0.07) - for k, v in args.items(): - init(mx.initializer.InitDesc(k), v) - - return x, args, args_grad, args_mult, auxs - - def make_encoder(self, data, dims, sparseness_penalty=None, dropout=None, internal_act='relu', - output_act=None): - x = data - internals = [] - N = len(dims) - 1 - for i in range(N): - x = mx.symbol.FullyConnected(name='encoder_%d'%i, data=x, num_hidden=dims[i+1]) - if internal_act and i < N-1: - x = mx.symbol.Activation(data=x, act_type=internal_act) - if internal_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty) - elif output_act and i == N-1: - x = mx.symbol.Activation(data=x, act_type=output_act) - if output_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty) - if dropout: - x = mx.symbol.Dropout(data=x, p=dropout) - internals.append(x) - return x, internals - - def make_decoder(self, feature, dims, sparseness_penalty=None, dropout=None, - internal_act='relu', input_act=None): - x = feature - N = len(dims) - 1 - for i in reversed(range(N)): - x = mx.symbol.FullyConnected(name='decoder_%d'%i, data=x, num_hidden=dims[i]) - if internal_act and i > 0: - x = mx.symbol.Activation(data=x, act_type=internal_act) - if internal_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty) - elif input_act and i == 0: - x = mx.symbol.Activation(data=x, act_type=input_act) - if input_act == 'sigmoid' and sparseness_penalty: - x = mx.symbol.IdentityAttachKLSparseReg( - data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty) - if dropout and i > 0: - x = mx.symbol.Dropout(data=x, p=dropout) - return x - - def layerwise_pretrain(self, X, batch_size, n_iter, optimizer, l_rate, decay, - lr_scheduler=None, print_every=1000): - def l2_norm(label, pred): - return np.mean(np.square(label-pred))/2.0 - solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, - lr_scheduler=lr_scheduler) - solver.set_metric(mx.metric.CustomMetric(l2_norm)) - solver.set_monitor(Monitor(print_every)) - data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True, - last_batch_handle='roll_over') - for i in range(self.N): - if i == 0: - data_iter_i = data_iter - else: - X_i = list(model.extract_feature( - self.internals[i-1], self.args, self.auxs, data_iter, X.shape[0], - self.xpu).values())[0] - data_iter_i = mx.io.NDArrayIter({'data': X_i}, batch_size=batch_size, - last_batch_handle='roll_over') - logging.info('Pre-training layer %d...', i) - solver.solve(self.xpu, self.stacks[i], self.args, self.args_grad, self.auxs, - data_iter_i, 0, n_iter, {}, False) - - def finetune(self, X, batch_size, n_iter, optimizer, l_rate, decay, lr_scheduler=None, - print_every=1000): - def l2_norm(label, pred): - return np.mean(np.square(label-pred))/2.0 - solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, - lr_scheduler=lr_scheduler) - solver.set_metric(mx.metric.CustomMetric(l2_norm)) - 
solver.set_monitor(Monitor(print_every)) - data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True, - last_batch_handle='roll_over') - logging.info('Fine tuning...') - solver.solve(self.xpu, self.loss, self.args, self.args_grad, self.auxs, data_iter, - 0, n_iter, {}, False) - - def eval(self, X): - batch_size = 100 - data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=False, - last_batch_handle='pad') - Y = list(model.extract_feature( - self.loss, self.args, self.auxs, data_iter, X.shape[0], self.xpu).values())[0] - return np.mean(np.square(Y-X))/2.0 diff --git a/example/autoencoder/convolutional_autoencoder.ipynb b/example/autoencoder/convolutional_autoencoder.ipynb new file mode 100644 index 000000000..c42ad900e --- /dev/null +++ b/example/autoencoder/convolutional_autoencoder.ipynb @@ -0,0 +1,543 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convolutional Autoencoder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![](https://cdn-images-1.medium.com/max/800/1*LSYNW5m3TN7xRX61BZhoZA.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will demonstrate how you can create a convolutional autoencoder in Gluon" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import mxnet as mx\n", + "from mxnet import autograd, gluon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data\n", + "\n", + "We will use the FashionMNIST dataset, which is of a similar format to MNIST but is richer and has more variance" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 512\n", + "ctx = mx.gpu() if len(mx.test_utils.list_gpus()) > 0 else mx.cpu()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "transform = lambda x,y: (x.transpose((2,0,1)).astype('float32')/255., y)\n", + "\n", + "train_dataset = gluon.data.vision.FashionMNIST(train=True)\n", + "test_dataset = gluon.data.vision.FashionMNIST(train=False)\n", + "\n", + "train_dataset_t = train_dataset.transform(transform)\n", + "test_dataset_t = test_dataset.transform(transform)\n", + "\n", + "train_data = gluon.data.DataLoader(train_dataset_t, batch_size=batch_size, last_batch='rollover', shuffle=True, num_workers=5)\n", + "test_data = gluon.data.DataLoader(test_dataset_t, batch_size=batch_size, last_batch='rollover', shuffle=True, num_workers=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABIEAAACBCAYAAABXearSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXm4VmW5/2+q0+QQ5iwITkwOCKKghWLOSuZsNqk5HI+WiXoqT9ox09LqKr2wKK/UIjNLvRrMIU3AMENESHECkUkEHBFTGk51PH/8fjx9n297Pb1uNnu/77s+n7/utZ9nr7Xe9YxrXff3vnu9/vrrAQAAAAAAAAAA7c2bevoGAAAAAAAAAABg7cNHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGsBHIAAAAAAAAACAGvCW7rxYr169Xu/O68E/eP3113t1xXlow56jq9owYu2347/9278l+69//WtD/3P66adnx6+++mqy3/GOdyT7rW99a1bvL3/5S7L79++flX3+85/v8FpvetObKo//9re/ZWW9ev3jsb/++po/tu4ei3r/fvy///u/a3wfZ599drIPOOCArOzOO+9M9rRp05K9aNGirN7w4cOT3a9fv6xszJgxyda+dMEFF2T1nnnmmYbutyvas5XGYlcwdOjQZM+ePTvZG2+8cVZv5513Tvbdd9+99m9sDenusajz4v+/vp6j8v90nPrcpf+nc1dXjO0Seh9+7295yz+2ln//+98rz1H6zY2uG3Ubi9/73veSrW2wYsWKrN52222X7Hnz5mVll156abJfeOGFrr7FTsEetfVp5rH45je/OTvWubc0V44ePTrZP/3pT7Oyl156Kdnrr79+sv/nf/4nq/f2t7892ZdccklW9q1vfat02x2i82vEP+9Z15R2Hou+5lTtAb/+9a9nxzrvPvroo5Xn7Ip3hK6g0TbEEwgAAAAAAAAAoAbwEQgAAAAAAAAAoAb06k7XpWZ0DasL7ezeVxea2dW2s0yYMCHZhx56aFa2cuXKZM+ZMyfZG220UVZvp512Svaf/vSnrGz69OnJPvroozt1j60uB+vg//ReKuttttlmyf7hD3+YlS1fvjzZd911V7J32GGHrN4GG2yQ7He9613J9ra45ZZbkv3kk09mZSoPe+qpp5KtMsCI3OV68uTJWdmUKVOiIxp1DXZadSxuvvnm2fEPfvCDZKu0ZN99983qffazn032yJEjk73PPvtk9S666KJkX3HFFVnZpEmTkn3PPfck++KLL27k1tcK3T0W3Y2/aix6v1QJrM9xVVIGl+qpjO+2225L9vPPP5/V0zGs4zcil2Y2Olbe+c53ZsdVMi8/X6MSh1Ydiy7rq2pHf15a749//GOydX71/1u1alVWpu269dZbJ9tluiqhKcn6uoKeXhdhzWm2sahjrLPyWB0TPhavv/76ZKsUeu7cuVk9HZunnXZaVqbr6YMPPtipe+xq2nksejgJle4de+yxyfa1T/elzz33XFb2+OOPJ7s758wSyMEAAAAAAAAAACDBRyAAAAAAAAAAgBrARyAAAAAAAAAAgBrQrSniAaAeDBs2LNme+n333XdPtsa+0Lg/ERHrrrtuspcsWZLsP/zhD1m9d7/73cl++umns7I+ffokW3W8EydOzOppbBRP/9gsKR87Syn2hMbruPLKK7N6733vezusF5HHZdIUw7/85S+zervsskuyt9lmm2R7HJLevXt3eN2IXGOtem49d0Se0vOUU07JysaPH59sjSHV6m3bCJq6durUqVmZpix/9dVXk/3Rj340q6fxfD75yU8m+5VXXsnqaewDP4e23bhx45K9zjrrZPXOO++8Dn5F+6N90dMZe0wXZYsttki2xjTQeTYi4te//nWyR40alWyPJbN06dJkz5gxIyu74YYbkq3z5DXXXJPV05hhGrfG0RhJpXTx7UgpRskXvvCFyjKNi6bxoXws6hztsUwGDBiQ7GuvvTbZHuOrJ2NaAKwpOsY0ZmBExAc/+MFke78/8sgjk60xgXy/oHPn/Pnzk+1r2rPPPpvsBQsWZGU/+9nPkv3aa691eN2IiIULFyZbY+9F5HvbZkxX3kxUxaSLiHj/+9+f7OOPPz4rO+6445J9zDHHZGVnnHFGsnVNa4X5E08gAAAAAAAAAIAawEcgAAAAAAAAAIAa0BIp4tW9qtG0oSX22muvZLtLrqb2U/dBTSMXEdG3b99ku2vYrbfemuzf/va3a3azXUQ7p/yrC82WflNxCcdJJ52UbHeJ/POf/5xsT3esaDprHffuzqmut+qSG5FLXFQ2phIZP6enE//0pz+d7K5wtW2msfiZz3wm2SeeeGJWpvI8n//uu+++ZH/kIx9Jts59EblURVO633TTTVk9lSXp/0Tk0kK9pwMPPDCrp1JAT8W92267JVvneJeNNUozj0VH5Zhf+tKXsjKVlqgEacMNN8zqab/X8esSHn3u2qYRuURI66nsMyJi4MCBHfyKtUMzpYjX56/POCKXHRx++OFZmaYm1n3JsmXLsnoq29S04CrFjMhlCD4/6/3r/Owpc7WeSsgiqtMguwSuUVf6VhqLypgxY7Ljiy++ONk6B/br1y+rp9JAlfq6xNbXOEXbR/9v5cqVWT2dHx977LHK83UFzbQuQudotrGoa99ZZ52Vlb3jHe9Its81VftNDUsQke9BnnjiiQ7/JyJis802S/aLL76Ylek7qK6ZPh+ut956yXbp59ChQ5Ot+yxfbxp9f263sVhK277jjjsmWyVgujeOyMMZfO5zn8vKdJ5sFjkeKeIBAAAAAAAAACDBRyAAAAAAAAAAgBrARyAAAAAAAAAAgBrQEjGBGkVTuJ199tlZmWqsVYPpeuv//M//TLamRx07dmxWT/WCrvHUOCSqvb/sssuyev/1X//Vwa9YO7SbxrOONJveWnnooYeyY41p4Rpc1T1rzAlPQ656Zk13q7F9IiJmzZqVbE+HXnUtvyeNabHddttlZRqDQ9Mnd5aeHosay+Pcc89N9l133ZXV0/aYN29eVqaxR/R8/lzf8573JFtj9mgqzoiIBx54INkapyYij22i/cD7y+LFi5O9xx57ZGWaLlvjq4wcOTKr57GPqmjmsej86Ec/SvbBBx+clWm/13XR44toHBKNb+D7B4374yl5ddyqRn+TTTbJ6h1wwAHJ1jTka4PuHoseJ0njB5T63vXXX59sj3umKdh1TNx+++1Zvf322y/ZOk49DbzudTxN8R/+8Idka4wEv3ett+uuu2ZlV111VbI1ZuJb3/rWrF47jkXl4Ycfzo433njjZL/88svJ1lggEXm8Lo3x4XGk9Pl5bBA9v7bVpptumtXTeWDUqFEd/Iquo6fXRVhzmm0szpw5M9m67kfkY8DHh86Pb3vb25Kt8XZKZfp3P/a4PLoWapnHq9U4Qz5OJ06cmGyNV+Mx+xp932+3sV
iKK6zv8pMmTUq29h3nxhtvzI6PPfbYDut19vl3BcQEAgAAAAAAAACABB+BAAAAAAAAAABqwFv+dZXmQtOhurvWihUrku3uferyqrIQTTkcEXHOOeck+9JLL0329ttvn9VT+Yu7/qlL9rPPPtvhuSMiPvvZzyZbU/xFrH03+HbGXfBW01lXvCOOOCLZ6r4ekafdLbn+NUvawK7kQx/6ULI13WZE7pruqWpV7qNlLuWqSiXvaTp1bLsLrUoMdJy6VGWDDTZItqf31LSROie0KppyXeVQCxcuzOoNGjQo2SeddFJWps9SXae//e1vZ/VUMrLvvvsm26ULW221VbL79++fle25557J1jne+fGPf5xsn6+1fbWPHHXUUVk9T2fdDmgbuyu0PheVNbsERce31vO5TCUtXqZyMJ0PfdyrlK/d1kGfn3yfspqTTz658hwuKdMxNm3atGT7nKxz4TrrrJPsnXbaqfJa3jYqzdR9zuDBg7N6Or69L40bNy7Zup62y7pYYq+99kr2lltumZXpXlH7iUo2vZ6un94vXnvttWTrGhmRt6uORd3PROTzsktJ77jjjgBoNlSernOPSpUjchmyS091Xtax6POhjr/7778/2T6nVr0LRORrskrDqt5jIvKxHfHP829H160zvu9Rtt1222R/9atfTXbpfc7n5AMPPDDZd955Z7K1Pf/VffQUeAIBAAAAAAAAANQAPgIBAAAAAAAAANSAppGDNSqXUXd9zTYTkbvIuRuWuj+rG7PLElR2Mn78+GS7m6y617qLoEaJ19/yzDPPZPU0y8Ps2bOzMj/nanoy2nizsjaeibp5qmzPJTPaL0rXbcd2GjFiRLJ9vKk77brrrpuV6bGOFZdhqWRLn59noFK3ei/TMadu9C45Urw/DRkypLJuK3LGGWckW+egE088Maun0o/hw4dnZZqhSJ/XMccck9V7/PHHk61ZEz27hWZp1HaKyN2s586dm2x34VapsGcC0f/TvuTXakdU0uGyEF3v1HXe5ZI6vvW5+7jXdvVsT/qsdZxqBs2IXA723e9+N9oJn1s828xqXH6p7eQSMs3gpONUZWIR+VpV1Z4Rucu6Z2JUOZP2EZeeaTZWP3/v3r2jI3z+b0cOO+ywZPsz02OVoPjeQedRHc++zurzrJIdRuR7TW8bLdMsmRHIwaA5Oeigg5KtsnUfR1XvWBH5HFglDYuIeOWVV5KtEnS/lh77mlm1nvp8WJXp1q+tex+9v7qhz1L3G7vssktD/+/9Q8/h7+v6jqByMJfoIgcDAAAAAAAAAIAegY9AAAAAAAAAAAA1gI9AAAAAAAAAAAA1oFtjApXi/pRipnzhC19ItsYc8JhAmu7Wefnll5Ndpb2OyDX6qvtzHafGFVKNdkQef+iPf/xjsj1Vqqa6Vg19RMSECROSrTE82jG2TBWuyezMb+9savZvfvObyV65cmWyP/3pT2f1zjrrrGRr6mS/tuK/S+t5TJtmbm/tsz6OSrGwtK6OiVWrVmX1NP6SxmjSFKAR+Tjy56WaXNV2u95aj3XMRuRpr9sBfSYvvfRSsjVVZkQe02XmzJlZmaZX/fWvf51sj2WibX3TTTcl22MC3Xbbbcn2+EMLFixI9uLFi5Pt8/+wYcOS7emX9f80Ro6mle/ovloRjzWiqXF1HEXkMXfOPvvsZPtY1HbUuCEea0THn4+be++9N9lLly5NtsfcGjRoULQrpTgUSt++fbPjiRMnJlv3HhF5X9f4QB7HcPLkycnWdN8aj8v/T/8nIk99rLEVPEaF7oHmzZuXle2///7J1phDK1asiHbnkEMOSbY/M51vdT3yWD+6R1DbY3D5XkLRPYfGdvI9qq6FGmsFoFnZc889O/y7vx+W4rNUxZNxdC+l746lPb7Hoa2KOeTX1f32nDlzsjKNfTRmzJhk33LLLZX33u5UvX95TKAnn3zyDZ9bY11G/HMszNW0Qpw7PIEAAAAAAAAAAGoAH4EAAAAAAAAAAGpAt8rB1FXc3eVcTqKceeaZydaUd57GViVfpZTV6mbnKVqr0lK7a5m69PlvUTdD/T+XC+m1VZYREXH66acnW1OUu/RMr116hq1Io7/H26bqmZT63HnnnZeVqZRBZSe77rprVk9dtb19q9KytoKLYCOUUrPrOPKxqG7rmh6zJMNS+ZG78aoLu84Bfm09h6du1Pv3+61KadwqDBw4MDvW/qfPS2WPEfn85M/14YcfTvaiRYuS/dBDD2X1dOyopEzTuUdEHHHEEcnWuS8il1CoZOS5557L6t1zzz3JdjmYph3Xcb9s2bJoNzRdbETe110ycv311yf7lFNOSbZLUPS5q9SnlPLc52+Voj366KPJ9vHmx+1ElYt6RC5n1NTfEXkb+jhVpk2bluyxY8dmZSr31DTCKreMiNh6662TffTRR2dlG220UbIfe+yxZPv8r+PNJYh6bb3H6667LtodlaI+++yzWZnK/HTseJ/R8aeSMh832me8ffRYz+fn0L42dOjQAGh2VOqqIQV87dN5yfc3+q5QesfSPb5KcVXmGpHvc33vWZX63euV9sq6pxsxYkSy6ywHq5L7qaQ5IuKqq67qsF7p/VPXvoiIc845p6F76Gx4krUJnkAAAAAAAAAAADWAj0AAAAAAAAAAADWAj0AAAAAAAAAAADWgW2MCKaX4LJ5uTfWPr732WrI9JpDGI1Cdc0SugdZ4Ph77QLWVJc2eavtcR606QD2H31PVdSNyvfgPfvCDZGvsjIj2iwPUGVwzr8+klOrx/e9/f7I/8YlPZGW33nprsrXPecwTjYfiNBr7533ve1+yPfWgxz1pJjRWjo9njdPjcaw09oHG+PLUx/r8dMz6uFddtqfHVo21aqpLcX+8zM/ZanhsEE0Zrn3bNegDBgxI9pIlS7KyJ554Itk671500UVZPe0XqlXXOS0iT9968sknZ2Wa8lTvcdy4cVk9nWs9Ls78+fOTrX3J53iNy+JjvVXweAQ6PnzsaDwYbUeN2+KU1kWdez0mmvahGTNmJNvnjtK1W53Ss9t9992T7e2kcS58jdA5TmNt+d5GU7/rfXj8LB1jfr/aphpbwdPs9unTJ9ma2jgi72cap6gd8Tm1KuZkRJ7mWeco3+NVxSjx+BO6furaF5Gvabru+lrncVSUvn37JvuZZ56prAfQnWg8NR0ruteJyNcZ7/erVq1Ktr5f+Jqmx6V3Qi3zMaXjVMez7qEj8vnb3zv69euX7H322SfZF154YdQFfw+sWmu32Wab7NjX00bwuHy61up86rGbtF9UxSzqbvAEAgAAAAAAAACoAXwEAgAAAAAAAACoAT0mByu5Ql1yySXZscp41L3W3fu0zFO/q2udpopWt1tHXQTdXbCU8k/d+/TePQ2h/p+7xGtK3pEjRya7f//+Wb3Fixcn210Vm8XdrLOU3Pu0rCSJ0+c/atSorOyb3/xmsqdMmZKVab/QtlDpVkSeFv6HP/xhVvalL30p2Zpa3lOOn3rqqcnWdNjNjsobfbyp22NpjGnb+RjTMaHul0OGDMnqqaTMz6EutCo5c
ldRddH1caP3odIJl0g1KxMnTsyOzzzzzGSrvHTDDTfM6qmbrMurPvWpTyVbpZTef4888shka1pNdbeOiHjve9+bbB/3KpNQV++PfvSjWT2Vwvz+97/PyjQ9vUosXbKp0ppWxWVAelwai7pGNrp2+DjS+dbbUdPC33nnnZXn0DHsffKll15q6L5aEZ1bXL6j8ipf7+bNm5fsvffeO9nTp0/P6t17773J3mGHHZKtYy8iYu7cucmeM2dOVrbxxhsn+7DDDku2z7vaf1z+oC7ym222WbQzpd9X6vc6zzUqT/Z+ofOm2l5X11av5/1Q2XHHHZONHOz/oXNeKR10lVTlhBNOyI51HZs9e3ZX3GKnaMbU1qtxSamOD+2XHpZg0003Tba/w6lUTOcrXz81zbxKq13KpccelkL3ILrX9HvScap72Yh8n6vSsHZA5z/fUyilvch+++2X7N122+0NXzeivCfSd++99tor2b/61a8q/8d/i7Z3ad7tavAEAgAAAAAAAACoAXwEAgAAAAAAAACoAd0qByu5FKqbsWfGUNdYxd3q9P/UTS8ij6Z+yy23VJ5DXaM1O4y74Kq0y6UNGn1cXRU9S4a69/n5VV6jbmnjx4/P6qlLdqvLvxzvI51xSVUZy80335yVTZo0KdnuKqoSMHV7dnmCupseeuihWdm///u/J1uzE6n7fkTufl+SazQbOmbdHVzHortV6rHaKm2MyN1aVcKjmfMicndaH4s65vSeXEqqZX4OHVfqJtwqcjDtyxF5Bi+1NdtLRN6mjzzySFam40XnIM0AFpHPcaNHj062y191PvXMF9ttt12yL7744sr7HThwYLJVflk3XIKirsoPPPBA5f/p3KYy14jG5beKz9EbbbRRh/V8vdfjwYMHZ2X33XdfQ9duVkrPbquttkq2r+X6fy4Jv+2225Ktc5fLAjXznUohVEoQUc4WpTIEnUPvuuuurN4uu+ySbJWeReSu7vqb2xF9zk4pq6m2ne9NdG+oa5+3le4lXK6t59c1zUMKaFv5mulZ5epCleQrIm/D0h5VZZtf//rXk+0yZpVuu/zZpZqd4SMf+UiyVbq9bNmyrJ6GLHC50fPPP7/G97EmqFQ2Iu+Xuu/2cBylTE0qzdExURof+q7ne16VmbvMS8+p91HKCujnr8qM7RmpfW/bCvg7emfQzJv+nlHFG3mf1uevoRJcDlY6Z3dKwBQ8gQAAAAAAAAAAagAfgQAAAAAAAAAAagAfgQAAAAAAAAAAakC3xgQqaWQ1forrbFVHp/pJTz2qmjqPOaHa0FmzZiXbYwepll11nA8//HBWT+OheKwfvV+NZ+Qaav2drgfUc2hcjQ984ANZPU096NrxUjq9tY1fW9ujlEZY+4jrXvX/VP/uaZ1Vh3/33Xcne+rUqVk9fV4e00bjAGnKP4+foO3msVc0dbnG2NC+GJHHRfAYGF2h+15baOpMH4va3h7DR2OWqD7aY5loX9BxqmMvIo9V4H2mKr2np8LV9vf0m9pHe/fuHe1KKc2vzkEReQyQm266KdnnnXdeVk/jo+n487VAU4br2IvI22P//fdP9g033JDV81hCVWgfKcUd6wotek9QitWhce4cXVt17vKyEjqePZ37xz72sWRfeOGFyfbYB3oOjz/R6jGBSjEBdKx4DBfFYyjp/+mY0HUrIt/b6L7E9w0aL2bnnXeuvLaOZ49ld9RRRyXb9zY65nzv1G4MGDCgsqy0z9X10/eyutbqs/T5SsesX0vr6tzuMURK495jsbQ6VbF+fH7SZ156p9E4JCeffHJWNmbMmGQfd9xxyfa94bXXXpvs73znO1nZiSeemGxd+0455ZSs3he/+MVk+/5I96WTJ09O9tVXX53VO//885NdFaO1p/A+q6naNd7VJptsktXTNtbYWhHV/d7Hop5Dy3zO0zhrfi3di5bmfR33vkfV/9M+qfEUI/75PbYV0HbTdSUiH5va1hH5u5Omhff3xZEjRyZb49f5O4I+89mzZ2dlGmtJ32N0nEfk76Y+X+vxz3/+8+gu8AQCAAAAAAAAAKgBfAQCAAAAAAAAAKgB3SoHK3Haaacl213i1H2u0VS1pVTR++67b7LdbU9dwFSms/nmm2f11GXdU9/qfamMxX+XulaXXHn193s6xi9/+cvJ1nSSEY2nUV8b+LWrpBVv5B71mahLn0t0VKqiqa09pbdKu9xtb+jQoclWmZf3OXXz9N+iKZc19aenWtV6Bx54YFbWbHIwfdZqu4ulygP0GUXkUj6t567KVePbXXL12K+lUgftg95W6urpKTz1nO3sAl8ai54ifvjw4cmeMWNGstX1PCLihBNO6PB/3O1dJUwuXVB5ykEHHZRsd0tX6Z+7c1elXu3JOXJtUZIduzTz3HPPTbY+C5ct6bpYen46hj3NvEpd1QXbXdRVQjNq1KiszCWA7YSuA88991xWVpWyOCJi0KBBydZ9Smke07nQ90p6fpfLq0xQ9yLbb799Vk/v48knn8zKdL72NPbtRkk+7BJqRdd9lShE5POetmNJXuBrqcoSdG532a+udy6Fb7e1sFGZl/btPffcMytTuYruIS+//PKsnqZcL6Hn+93vfpeV3XzzzcnWuVavG5HvN7/yla9kZddcc01D99HMeD/Ufb6mhd96662zerpWNbr39L2J7jl0fvU5WvtTSV6o627pnVBDmkTk66S+j7oErhXR9/DTTz89K9Pn7+8ghx12WLL1/d/bZsKECcnWNpw3b15WT5+ry6Srwr7o+3lE3n/0PSgi35shBwMAAAAAAAAAgC6Fj0AAAAAAAAAAADWgx+RgngFG3aQ8M4m6DKuLnEs/1M3L3TnVRXKnnXZKtme10OP+/fsn2123VNpVyn6hrs/ucqj36+fQ86tszF3sP/GJTyTb5WBrg6po+BFld1qtq2WlrBWl7Dwq6bvyyiuzsqVLlyZbo7h79qPDDz882QMHDszKli1bluySK7721b59+2Zl6k44bdq0ynoqjenJjG6NoNkG9Dn7fauru2awi8ifpz4/l4W4FGQ1LqfTa3kbq7uo3rtLINQ93sdYqb+2Oo3KoTzLjWZtOv7445M9fvz4rJ7Oa9ru7pp98MEHJ1vbIiJvb5UueGahd7/73ckuSYWrMhW2Cy7N0d/rz3b06NHJ1mxenoGqK2RzOhZ1rdJ2i8jbZMSIEWt83VZBpUPLly/PyvT5u4xI1w9dxzQ7UUQujdC9k8vGtO197tYybRuXuqsMtyQZLI3TdsCfn+J7ygceeCDZuv9w6YGGB9C+4BLO0l5K1z+9D9+jauYl35d75r5moSrLV0Tj/U3nRc1kGJHL81xOdfbZZydb27CE76MVvd8jjjgiK7v//vs7vCd9b4n45xASjdyHj1ntZ57dqqfHsGeFUlnt4sWLk+19W3+vrk0R+VxcklWqTFrnPH831XqltVTnYn/OGrLA10W934ULFybb19ZW5Kmnnkq2z0GlvblKtLQflDKrqXRZ18uIPCvxtttum5VpH7n33nuTveuuu2b1St8oHnzwwegJ8AQCAAAAAAAAAKgB
fAQCAAAAAAAAAKgBfAQCAAAAAAAAAKgBPRYTSLWzEeV4MqrZq0oTHpHrVl3jqSlXVQPtOkK9lsaicL21Xss1varrVK2ga/lL8UWqUsS7TlTjl2h8oIiIb33rW5Xn7yzaNm8knkZnYm8MGzYsO9Y+s8ceeyTbY8loqj1t90MOOSSrp3GpFixYkJVp7AONa+JtpulCXa86derUZGsMlFIK57Fjx2ZlV1xxRTQTGlOk1H91TJRiZqlO22OZqP5a6/k40mt5SkzV9Wq8L41/EpGnofS4TytWrEi26oLrhM/JGptH28Ofz2abbZZsjVXw/e9/P6s3d+7cZHu8Jp3zNE3u3nvvndXT+BulOAXtmBZe0XgWEfk6qWlOI/L5UedAX9M6E/fB4wrpmNOYbh7bqU5orIiqfUNEPsZ0fYuImDx5crK17adPn57V0xgip5xySrI9VoZea9Gcg9ATAAAYSElEQVSiRVmZxmcYM2ZMsm+99dasnqYa198Yka8H7T4WNZ6lozFEvK7GAfK4lbp/1Wfpe1Rdnz1Gie5zNe6Px4fSfbTH4mvWeCONpnovoenXx40bl5X9+Mc/7tyNVdDo3KpzdUQeM3HmzJnJ9vTS73nPe9b4PjzmVzPha5qi8YJ8Tq2aeyPysVNK2679qxT/q+od1s+p6+7666+f1dN5wO9D97k6P/g5WhGN7eN7BV2ffH7SmFwaB8jnXX12ulfyWFPaD55++umsTPe5+n+PPvpoVk/j9HlcoUZjd3U1eAIBAAAAAAAAANQAPgIBAAAAAAAAANSAHpODfeADH8iO1RXKZSaltGqKujOW3Pa0TF1hI3JXOnX/cldJPV/JDVfL3A1Q3RFL96tyCH826gJ3/vnnZ2VrQw6muKuz/j53H9XfoO5yI0eOzOp9/OMfT/aQIUOyMnWHveOOOzq8rqOuop7mWt0AXWKkz19/p6d11dSDM2bMqLy2pnB0eZSmJRw8eHBWpmnNmwF91n369Em2SxRKqWurUgS7O21VWmSXqqj0wNtH3eD1Wu5arRIk7wvq6llK59rOXHfdddnxCy+8kOyDDjoo2TvttFPlOb72ta8l2+fdefPmJdtTc2rf0jnBpbH33Xdfsn1O0D5YkjG2Ay6TVvr27Zsd67PQ8dHZfl56turOrpJYlws1s/Sgq1F5rc6Lvh/QMpVkReSSHZ0zlyxZUnkOrefrkUojnnjiiazsoYceSrauVT6eZ82alWyVXUfk82tp7W4HSuNBZQ4R+f5E28DlC3pObUeXtJTkI/rctV/oPsWv5b+lWefRQYMGJdvTpeszUglyRC6vUumky79UOl5Cx1VpPlXZrO89dE72sfLYY48lW+XVF1xwQVZP18UTTjghKxs+fHiydd4tvWf5feiz6gk0JXxE3te1P7usUvcwJdlmKQ15VbiPUqgSl9/qPqb0rqvzvss7dS6petdqVXTvVgot4WNs4MCBDZ1D5whd7/x82keWLl2alan0TCVr3oZaz9/1fL3uLur5RgMAAAAAAAAAUDP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAO6VZC96667JtvT+j3zzDPJ9hgiqpMspVEtpWPX/1PNpOuyVZ+rsQpcx6mUdLxVqcYj8t/l96G6Qk2t67pvjWXiz03Tl3cVmrpU4wNEREyaNCnZroXUZ65ppP25avyPe+65JytTzbK2TSm1pZaphjoi1457KkVtG9UOq746Io9l5W2j19Z6HvNEz+/Pw/tMT6P9uRQXS8ei6/IXLlyYbP19nnJW9deqsXZNetXY9ntUW/txR+dUNOZQXWMCaWrLiIhNNtkk2bfffnuyXSt95plnJlvH29ixY7N6DzzwQLI9/abGxdJ28lgERx55ZLJvvPHGrEzjbmkb+hrSDvh6pGNs6NChWdny5cuTXUqtXBUjohQXxOdlfe7aTzyO1IMPPphsn1N9fLc6GrPCY/0oOv95LBONVaDxyzQtbkTehrrf8Dhq2l/22WefrGzYsGHJ1phwnt5W19pddtml8j503fA94YsvvhitzhuZX6r2mB5fRNtb90GeIlnLfNxUxbv0WCMaR8XHusfyahZ0fOj+OCIfHx4fTeM7/vd//3eyjzvuuMpr6X4oIt+jlt4ZGkXbyWOlaXw8jVt0zTXXZPX23nvvZPtv8eezmtK7lcfP8TW/u9H3iYi8r2sb+PjQdw2NBxXxz3v01fgYaLSNde3z9VnbVe/d+5b2XZ8bte/qPqsr+mAzofu4iIgddtgh2T5P6r5dn4PHhpo8eXKydS3U5xiRr4s+PnTe1H7Wr1+/rJ6WaXr7iIg5c+ZET1DPNxoAAAAAAAAAgJrBRyAAAAAAAAAAgBrQrXIwdS12ty51tXJ3uSrc3VxdtEpp5tV2FzJNt6rnL6XHLKUNVDc0d21XV7EJEyZkZerud9lllyXb05DrtV3+VXJj7Szqgqpu+xG5tMDlNfocVP7kMjjF3dTVDVX7i0sV9FpqP/LII1k9dVl313l1GdQ+4ikXNdWxt6+6gKpbr6dCV7dql381m0u8u6iuxn97yY1Z3f7VldjHc1V6T3dtV7duT1Wv7pfaJ93tXaWHKiuKyNPJV/3+VqIqxXAJlxHpOFBX6ssvvzyrp89LU8m7u7VKAf0Zq8uvpks+8MADs3oLFixIdkki2qypjbuK0m/3NULnqNL/ldLkVuHSySoJyooVKyrPUbqndkD3AKW20LXPpbeagv2VV15J9r333pvV22233ZLtKd0VXe80hX1EnkJX51aX8pbSTev+SPdbKneIaL61rzOU9rLejvrcde9TOoc+Zx+X+mz9HHp+bUeXSuj/+Trua34zomEmOjpuhNmzZ3fV7fQImj6+HfF3Dd1D6zzq65FKWH3PUZWqvbQeVcnQIvKx7mVV49vHm/ZdleNHVL9ztlv4At2LR+T7Up9P9bdXvX9G5O8Zui76Gqnrkb+bahvq+Urv/C5jRA4GAAAAAAAAAABrDT4CAQAAAAAAAADUgG6Vg6k7srtCqQufu6Rq5iZ1+XI3QP0/d9tTVz110XK3Lq2nLnbuVqfuZe6Gq66Fej53V1PpymmnnZaV6W8+/fTTk73VVltl9fSc06dPz8p+8pOfJPsb3/hGdAUqzfB2UqmGyz20PdTNzt3I1a2u5C5dkidU9RGVbkXk0ePdZV3va/vtt0+2u3KqBKWUOaDkoqn9zKVOLlfsaVRGVZISqYu59ouIvH20n/uzVffXRjOReRtUSSzUVd7xuUPHs1+7Ljz++OPZsUpQNBvLXnvtldVbtmxZsnW8eWaWrbfeOtnz5s3LylQupPJRPV9ELqv0DCZVeH9pVB7XzJR+QynboP5faY5qlFJ2MJ3nXMJZdU/tiEqDtW28X7obvKJr1axZs5Lt40il3Cq9Uhl3RL4G+xqvdXVO8HG/7bbbJtv3ArqGqBxYs6C2C6UMWr5/rcp46mNR26QqW6efz/cVeo7Snkv3aj6nljJqAnQXLrnR/YLK1r3/aniIk046KSvTvae+I/r6qeNIx4rPmyUJvo453e9vs802WT2VI/n+SZ+BSoI97EGrM2X
KlOx43333Tba/P+j8p+vsiBEjsnrapirBc5m6rn0ux9Pnr+ux35PO5d5Hego8gQAAAAAAAAAAagAfgQAAAAAAAAAAagAfgQAAAAAAAAAAakC3inovuOCCZGusiIiI3XffPdkjR47Myq699tpkqw790ksvzeqpHt612KrJ1PgGnp5PNZQaN6SU5rQUf0j1pKW4CqVYIxoH6O67787KrrrqqmTfdNNNlefoKjSWgMcnUr2jxxnQ2Dmq/fd0fYprfVVfqe3m2nT9P405o3ZEHiNo5513zsr0nKpD9bbW9vU2rIqtUPrN3keaLSaGPtuqVJwRuRbbddQa16UU60dTyas+13W23q6KtqPOAR4TaO+99062a3X1uB3iIHSmT22xxRbZsbb3/Pnzk3388cdn9XQ+ve6665KtMYAiIm6++ebKMtXyaxt6Cuk99tgj2RqTytHf32zja20zderU7Picc85Jts7fpefSaLp4P4fW1TmhFJ+r3dtnww03TLbGg/D0wEuWLKk8h/b1QYMGJdvn3Q022CDZOo/7Oqv7F0+Tq7EoNB6Nzw96Txqjwq+nv7kd5lanlFLa0d+vbeDnqIrzURqLGhPDz1lKJd9oLBOAnsLTu+v+UvfTuseLKKd+r4ol6ej40P/xOF6Nrq2lejqn+pqpe+DOxEZsFe67777suBRTVPuBPhNfq1atWpVsfca6Xkbk776+tuoap9fytVXfJUrxGbsTPIEAAAAAAAAAAGoAH4EAAAAAAAAAAGpAt/rfairhs846q7Je//79s+PFixcn+6KLLkq2u3+pi5zLwdw9bzWeOlNdXN2VS2nUzVfP5+5fev477rijofPtt99+DdVbW5x66qnJPvzww7Oyc889N9l9+vTJytRNXZ+dp4VVFz5/Xur2qc9OJVkR1S6Q3ma/+93vkn3++ednZffff3+y1aVUUxJGREyYMCHZCxcuzMq0f6rbqEoAIvLf6S7W7sbd06g8QMeYt0HJnVbPoZIeH29aT9vAU85rv/A+o9fW87sLce/evSvPX5oHWp2Su7/icpRbbrmlwzKfu1Xu8eCDDya7X79+WT2VkbnMV2WbTz/9dLJnzJiR1dOxo/OI80YkGq1IqR1nzpyZHasrdGnMlqQmjV5bz+GSv0b+51+dvxWpeg46biLylMX+DFTCo204cODAyuvq//gzVpd13x9palyV6/r8ryntfe+lc7mm0/W5oySBa1VefvnlZPse1Z/1avzZ6vPT9c7bUfcfvq/QNU3lZd7v9Jwu12t0TgBYm7h0tuo9wWWUuvZ539Zz6DhyGZaOq9K+QudAl1xWScB8ftC9j6ePrwo74udodfQZRORzkK45EXm/0Pb11O/6vLTd/d1L+49/e9B+oOfz0BI6xzfL+oYnEAAAAAAAAABADeAjEAAAAAAAAABADeAjEAAAAAAAAABADejWmECNpt3TGEDOnDlzku2aZNVFuhZP9YGq53NtX1XKP7+WHjcat6CUYrOUJtfvsQo//9qOffHzn/+88tjvediwYckeOXJksg855JCs3pAhQ5Kt8QIi8jbVWEKuCb7zzjuTfdtttyVbYwB1Fo+poRpV1xxXxajyNPCaTn3atGlZmcen6Wk0vaK2gWuPNSaL90Ptp6qPdq20tre2scdI0Oe+YMGCrExjDmmKR4+RoPpwf+Z6/naLJ9NojJWdd945O9ZU0d/73veSfdlll2X1NM2mju0pU6Zk9apicPm1jzrqqGR/5zvfyepNnz492dttt11W9vzzzyebWBb/oComlLeBzueN9hmvp+upjrc6M2DAgGRrPBZf+5YvX55sn7uq2sPXI40fU0rHXmprjbGxcuXKZHsqeUXHXkQes0/H6eDBg7N6v/3tbyvP2Sp43AedD30e0rhPGkPxhRdeyOppG+i66/sgXU99f6njW+/JYzTqOXX+jmiemBZQb7zf635a94oe2/FHP/pRsr/97W9nZTrHVr07+rGe32NT6trnsb+q9iMe11D3TKNHj87KdAxrunLfU7cbOjd6rFWPb7Ya33toH9F10fdASikun+LzrvaRpUuXVp6/O8ETCAAAAAAAAACgBvARCAAAAAAAAACgBnSrHKwkpSilolT3uRtuuCHZ6s4XkbuDeQpoddfS87nLl96j2iUXeP9dev/6f+6etv766ye75Pqs52gmOYq7Ruqz9OeqMiq13Q2zO2k09bf2F3WBj4jYf//9u/Semp0qGYH3S3V/9f/RcaD9xN1fNX3yokWLku3SM21Hl4ptvvnmHd5TKaW9uhNH5O7AVS6mrUqjEl0fKyoNOPzww5O9bNmyrN4BBxyQbB1Hjz76aFZP5Q8u5dJ5RsffnnvuWXlPl1xySVamUlBPWV1n1G1d27grJHMlOVhJ8l0nVLKl84ymX4/I9y+aZjwiH7fabr4H0mM9h7dTlezaz6/9xV3s9Vo+n2o/UPnuk08+Ge3G+PHjs2Pds6pUOSLikUceSXZJglK1l3W0zOdvbR9d01TmG5H3Q5e4ADQDKn+KyN8DS3IoHWO+p9Q9q86HPgaqxpHvpfT/fMzqtfR+9f0wIg894XtlldgqjUq3W5UZM2Yk++CDD87KVIqltr4TROTPrhQqRt9VvB/osbZ9SXat8t+ehN0wAAAAAAAAAEAN4CMQAAAAAAAAAEAN6FY5WAl1Wyu5uCpXX311dqySApclVGX6KkkD1DXP3fv02KVP+ls0yry7kGkGkIkTJ1beR8mlryq7S3dQip7eCjTaz+AfaAamY489NtmesUbdZD07jLrhqpu6Z3nQLF2vvfZast3FUvu9y7zUbVbdhtdbb72s3tSpU5O9ww47ZGU6Tn1eaXUalYO5+/GsWbOSrZKO//iP/8jqqSu1nv/QQw/N6mnbuyxEXbU1o8Kll16a1TvnnHM6vFbd8DVNn0VJWtKZ9aMkGyvdR6NS3HZHM1keccQRyXaJw5VXXplsz8CndUvu51qmmRL9f3TudklCleShlKnVpe5bbbVVsnVe92xm7YhLwBRdd7RNfVyqtKEkd9EylzHr3kfnBF8XkYBBs+PvITqfNSrfd/n4kUcemexSRr+q90ofszrGfF3Ue9RsV1dccUVWT/eefh8+T7cT2p7+nqHPxCV9uo5tueWWHf49In/+VXZEPnf7vKhrl867fk/aDxYuXBjNAJ5AAAAAAAAAAAA1gI9AAAAAAAAAAAA1gI9AAAAAAAAAAAA1oGliAnWGU089tadvoVsoxWpo9xSA0Fx43J7VeKycG2+8Mdlf/vKXszKNv6PxDTwmhOp/R44cmWxNEx6Rx09wra7el8bBGDBgQFZPU5nfddddWZlqwjU+UDtQmj9Uu16KDTFp0qRkL1myJCs75phjkq1xQubPn5/Vu//++5PtGv+DDjqow7Kjjz46q6dp4OfNm1d5v+0eL6jRGHIR+fNUuxQrT8s8rlop/pD+X6OxGtp9fdtmm22S3bt372R7Gvjbb7892Z4K91e/+lWyly9fnmyfkzWelra1n0/xuAUaw0fbU+89Io/7M2XKlKxsxIgRyZ45c2bltdqBUlwsZ9GiRc
l+17veVVlP42lVxfaJyNdPjx2k96G2py0uxeMolQF0F7/4xS+yY90r6li55pprKs/x+c9/vvJ4iy22SHa/fv2yen369Em2pqZfsWJFVk/HlcdX1Lle5+8S06ZNy4533333ZFelK29VSrFndc7Ud4mIiFWrViX7ueeeS7bvParO7/FF9XjTTTfNynzuXU0plrCviz0FnkAAAAAAAAAAADWAj0AAAAAAAAAAADWgpeVgANC9qMurutoOHTo0qzd8+PBku/vl5ZdfnmxNB+6SI5UYqAzLJUfqXqvSrYiI0aNHd3j+M844I6rYf//9s2OVKahrcDtQktuoi+u6666blQ0ZMiTZV111VbKvvvrqrJ5KtNTF+tlnn83qqVu1l+23337Jfvzxxzv8n4i8P95zzz1RRbtLjEqUZBsbb7xxst1FeqONNurwHO4GrRKR559/PivTdKnual1Xxo0bl2ztzypdjcjnrtNOO61L7+GOO+5Y43M8/fTT2fHs2bMr644aNWqNr9cqvBE5hs57us66lEvHmKc77sy1VIbtc29pvijJNAC6C5dX6Tqje0iXaCkl2abKal1iuzYp3dOCBQuyMn0GulZUyZRaCX0OPudoGIF11lknK9N+UJIu67uF7nN1no3In7/L4PUdR+v5/Kky6RdeeCGaATyBAAAAAAAAAABqAB+BAAAAAAAAAABqAB+BAAAAAAAAAABqQK/ujI/Qq1ev+gZj6GFef/31Xv+61r+GNuw5uqoNI7qmHXfcccdka6rGiDyVsDN48OBkH3/88cnu27dvVm/LLbdMtqZk1LTCEXmKTY2lEJGnqvdUolV4GtD11lsv2Y899lhD5yjRrGPR04c3ujZssskmlf+jumrtL54C+89//nOytd0jcv37U089lezOxqTQ39nZ9a/ZxmJn0XhLw4YNS7Zr2TVGjWrvvQ00FetLL72Ulc2dOzfZv/nNbzp5x11Ld49FjzPQaGptjWngsSI0FpbGIyiNZ61XGrN+v3/5y186vI/StbyPaJme3+PneNyFKtplLCoaz8Lnw/XXXz/Z+tw9Ht6LL75Yef6VK1cmW2OleHyV7qRZ10VonJ4Yiz73jB07Ntka42XGjBlZvfnz51eeU+c2javjc2+j87dSijWj5+vJ2IXNNBb1+Zf2fLo3jMjXD7X93eTVV1/t8Foei033rB4jU2Mc6nrscYoeeuihZH/4wx/u4Fd0HY22IZ5AAAAAAAAAAAA1gI9AAAAAAAAAAAA1oFvlYAAAAAAAAAAA0DPgCQQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP4CAQAAAAAAAAAUAP+D/fLABOCduBWAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(20,10))\n", + "for i in range(10):\n", + " ax = plt.subplot(1, 10, i+1)\n", + " ax.imshow(train_dataset[i][0].squeeze().asnumpy(), cmap='gray')\n", + " ax.axis('off')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Network" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "net = gluon.nn.HybridSequential(prefix='autoencoder_')\n", + "with net.name_scope():\n", + " # Encoder 1x28x28 -> 32x1x1\n", + " encoder = gluon.nn.HybridSequential(prefix='encoder_')\n", + " with encoder.name_scope():\n", + " encoder.add(\n", + " gluon.nn.Conv2D(channels=4, kernel_size=3, padding=1, strides=(2,2), activation='relu'),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=8, kernel_size=3, padding=1, strides=(2,2), activation='relu'),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=16, kernel_size=3, padding=1, strides=(2,2), activation='relu'),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=32, kernel_size=3, padding=0, strides=(2,2),activation='relu'),\n", + " gluon.nn.BatchNorm()\n", + " )\n", + " decoder = gluon.nn.HybridSequential(prefix='decoder_')\n", + " # Decoder 32x1x1 -> 1x28x28\n", + " with decoder.name_scope():\n", + " decoder.add(\n", + " gluon.nn.Conv2D(channels=32, kernel_size=3, padding=2, activation='relu'),\n", + " gluon.nn.HybridLambda(lambda F, x: F.UpSampling(x, scale=2, sample_type='nearest')),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=16, kernel_size=3, padding=1, activation='relu'),\n", + " gluon.nn.HybridLambda(lambda F, x: F.UpSampling(x, scale=2, sample_type='nearest')),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=8, kernel_size=3, padding=2, activation='relu'),\n", + " gluon.nn.HybridLambda(lambda F, x: F.UpSampling(x, scale=2, sample_type='nearest')),\n", + " gluon.nn.BatchNorm(),\n", + " gluon.nn.Conv2D(channels=4, kernel_size=3, padding=1, activation='relu'),\n", + " gluon.nn.Conv2D(channels=1, kernel_size=3, padding=1, activation='sigmoid')\n", + " )\n", + " net.add(\n", + " encoder,\n", + " decoder\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "net.initialize(ctx=ctx)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------\n", + " Layer (type) Output Shape Param #\n", + "================================================================================\n", + " Input (1, 1, 28, 28) 0\n", + " Activation-1 0\n", + " Activation-2 (1, 4, 14, 14) 0\n", + " Conv2D-3 (1, 4, 14, 14) 40\n", + " BatchNorm-4 (1, 4, 14, 14) 16\n", + " Activation-5 0\n", + " Activation-6 (1, 8, 7, 7) 0\n", + " Conv2D-7 (1, 8, 7, 7) 296\n", + " BatchNorm-8 (1, 8, 7, 7) 32\n", + " Activation-9 0\n", + " Activation-10 (1, 16, 4, 4) 0\n", + " Conv2D-11 (1, 16, 4, 4) 1168\n", + " BatchNorm-12 (1, 16, 4, 4) 64\n", + " Activation-13 0\n", + " Activation-14 (1, 32, 1, 1) 0\n", + " Conv2D-15 (1, 32, 1, 1) 4640\n", + " BatchNorm-16 (1, 32, 1, 1) 128\n", + " Activation-17 0\n", + " Activation-18 (1, 32, 3, 3) 0\n", + " Conv2D-19 (1, 32, 3, 3) 9248\n", + " HybridLambda-20 (1, 32, 6, 6) 0\n", + " BatchNorm-21 (1, 32, 6, 6) 128\n", + " Activation-22 0\n", + " Activation-23 (1, 16, 6, 6) 
0\n", + " Conv2D-24 (1, 16, 6, 6) 4624\n", + " HybridLambda-25 (1, 16, 12, 12) 0\n", + " BatchNorm-26 (1, 16, 12, 12) 64\n", + " Activation-27 0\n", + " Activation-28 (1, 8, 14, 14) 0\n", + " Conv2D-29 (1, 8, 14, 14) 1160\n", + " HybridLambda-30 (1, 8, 28, 28) 0\n", + " BatchNorm-31 (1, 8, 28, 28) 32\n", + " Activation-32 0\n", + " Activation-33 (1, 4, 28, 28) 0\n", + " Conv2D-34 (1, 4, 28, 28) 292\n", + " Activation-35 0\n", + " Activation-36 (1, 1, 28, 28) 0\n", + " Conv2D-37 (1, 1, 28, 28) 37\n", + "================================================================================\n", + "Parameters in forward computation graph, duplicate included\n", + " Total params: 21969\n", + " Trainable params: 21737\n", + " Non-trainable params: 232\n", + "Shared params in forward computation graph: 0\n", + "Unique parameters in model: 21969\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "net.summary(test_dataset_t[0][0].expand_dims(axis=0).as_in_context(ctx))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the original image goes from 28x28 = 784 pixels to a vector of length 32. That is a ~25x information compression rate.\n", + "Then the decoder brings back this compressed information to the original shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "l2_loss = gluon.loss.L2Loss()\n", + "l1_loss = gluon.loss.L1Loss()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.001, 'wd':0.001})\n", + "net.hybridize(static_shape=True, static_alloc=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training loop" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch [0], Loss 0.2246280246310764\n", + "Epoch [1], Loss 0.14493223337026742\n", + "Epoch [2], Loss 0.13147933666522688\n", + "Epoch [3], Loss 0.12138325943906084\n", + "Epoch [4], Loss 0.11291297684367906\n", + "Epoch [5], Loss 0.10611823453741559\n", + "Epoch [6], Loss 0.09942417470817892\n", + "Epoch [7], Loss 0.09408332955124032\n", + "Epoch [8], Loss 0.08883619716024807\n", + "Epoch [9], Loss 0.08491455795418502\n", + "Epoch [10], Loss 0.0809355994402352\n", + "Epoch [11], Loss 0.07784551636785524\n", + "Epoch [12], Loss 0.07570812029716296\n", + "Epoch [13], Loss 0.07417513366438384\n", + "Epoch [14], Loss 0.07218785571236895\n", + "Epoch [15], Loss 0.07093704352944584\n", + "Epoch [16], Loss 0.0700181406787318\n", + "Epoch [17], Loss 0.0689836893326197\n", + "Epoch [18], Loss 0.06782063459738708\n", + "Epoch [19], Loss 0.06713279088338216\n" + ] + } + ], + "source": [ + "epochs = 20\n", + "for e in range(epochs):\n", + " curr_loss = 0.\n", + " for i, (data, _) in enumerate(train_data):\n", + " data = data.as_in_context(ctx)\n", + " with autograd.record():\n", + " output = net(data)\n", + " # Compute the L2 and L1 losses between the original and the generated image\n", + " l2 = l2_loss(output.flatten(), data.flatten())\n", + " l1 = l1_loss(output.flatten(), data.flatten())\n", + " l = l2 + l1 \n", + " l.backward()\n", + " trainer.step(data.shape[0])\n", + " \n", + " curr_loss += l.mean()\n", + "\n", + " print(\"Epoch [{}], Loss {}\".format(e, curr_loss.asscalar()/(i+1)))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing reconstruction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot 10 images and their reconstruction by the autoencoder. The results are pretty good for a ~25x compression rate!" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIEAAAD4CAYAAAB7VPbbAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzsvWe8XVXVvj1QsQKBkJCQQiqBhEQ6gRBaIIAoIE1BimAB8UcVBfUvIGIBFSygPHZ/IihFRVSU3psGQpASEtJIJw0CKPqIvB98mc8975w13Qmn7LPXdX0aO3Oetdeefa2Me4y1Xn311QAAAAAAAAAAgNbmDV19AwAAAAAAAAAA0PHwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAbwEggAAAAAAAAAoAa8qTO/bK211nq1M79POfLII5N92mmnJfuHP/xhVm/lypXJnj59erI33njjrF6fPn2SPWbMmKxsyy23TPZ5552X7DvuuCOr9+qrndccr7766lrtcZ2u7MO60159GNE8/divX79kjx8/Pit77LHHkv32t7892f/617+yeltssUWy586dm5XdddddDd3HWmv9X9N29LxkLnZ/WnEudgfe+MY3JvuVV15Zo2u84Q3/939fr7zySrebi7pWRVSvV/Pnz88+L1iwINl/+9vfkv2Pf/wjq6efn3vuuaxs3XXXTfbChQuTrWtwRMRJJ52U7EcffbTN+2svutNc1L5be+21szLd1/79738nW8d8RETv3r2TvWTJkjb/xv9u2LBhWdm0adOSrePnTW+qfiTwfbe990z2xe5Pd5qLjbLJJpsk+5lnnmn36x911FHJ/vnPf97u118TuuNcHDhwYPZZn8MnTpyY7P/93//N6ukzwwMPPJBs3xe1nq+1hx56aLL33XffZP/ud7/L6l133XXJ9r21vWm0D/EEAgAAAAAAAACoAWt1pjdKe78V3H333bPPxx57bLJHjx6dlen/mLzwwgvJ7tu3b1ZP/+6ll15Ktv9vzDrrrJPsp556KivT/4HTt8j+P3jLli1L9g9+8IOs7Kqrror2pDu+2YWcVvlflhNPPDHZu+yyS7KPOOKIrN7y5cuTvf766yfb59vIkSOTfeONN2Zl6n333e9+N9nq8dfZMBe7P60yFzsS9fKLyL1xf/Ob3yR71qxZWb019fBZE7rjXCx5Au22227Jvuyyy7J6zz77bLLVG+otb3lLVk/XXf8fz/XWWy/Z8+bNS3aPHj2yep/73OeS/fDDD7fxK9qPZpiLVV4x2s4Rq7ZnI3j/aB/sueeeyfb/We7fv3+y58yZk5XdfvvtyV7T+YYnEDjNMBfXBPdCv/TSS5OtHiVTpkzJ6v3yl79M9uOPP55sn/ejRo1K9sEHH5yVbbXVVslWT7xPf/rTWb0LL7yw+ge0M91lLg4dOjTZv/rVr7Iy9Xz91re+lexPfvKTWb0f//jHyX7zm9+cbH/m12eLW265JStTb1f1kN1uu+2yei+++GKy3/Wud0VHgicQAAAAAAAAAAAkeAkEAAAAAAAAAFADeAkEAAAAAAAAAFADul1MoBNOOCHZHhNINdEeAVx1z2qvWLEiq/fWt7412arZ00xFERFTp05N9qabbpqVaeYN1Ya+4x3vqLwnzfgQEXH99dcn2zOYrQndReMJ1TSz3lpjZEVEXHzxxcl+3/vel5Wp7lbj++iYj4g4//zzk62ZSTxTn8YC8zVBP2tcBM9Yc/TRRyf773//e3QkzMXuTzPPxY5G4xZ4xqDjjjsu2R77YJtttkm27s8DBgzI6l1++eXJvvbaa7Myjbf3la98JdkaFywiYubMmcn2+F8af+/3v/99S83Fn/70p8nWeAkRES+//HKy3/a2tyX7n//8Z1Zv8eLFye7Zs2dWpnU1pqGemyIiLrjggmR7HI32ptnmYqOxcnbeeefss8ZR0nH+kY98pPIahx12WLJ/8YtfZGUXXXRRss8666zKa+g+u/nmm2dlH//4x5OtcTWd9ogPxL7Y/Wm2uahnTc38FJHvO/78pei66etce9Do9TWWrcYfiojYaaed2vWeustcHDt2bLL9ueDpp59Otmbz/sIXvpDV0/g+9957b7JvvvnmrN4OO+zQ5t9ERPzxj39M9pNPPpnsrbfeOqune+tHP/rR6EiICQQAAAAAAAAAAAleAgEAAAAAAAAA1IBuIQdTl74vfvGLyVbZVUQuvXr7299eeQ2VYXkaOE3hpvKyDTbYIKunbtKe9lPd5fWe/H7/8Y9/VF5D7+vMM89s895Xh+7i3tfeaDt2Zurh1UFdqUeMGJGVqVyq2VxtlUWLFmWfdb752NY1Z+211062pxnW/jrwwAOT/bvf/S6rp27qer2IfC6qlMHT7qr7/T777JOVeXrd10td52Ir0cxzsaMpycG+/e1vJ/vwww/PylSi9dJLLyXbzyBbbLFFsktu+s8880yyXdKk87tXr15Z2de+9rVkn3feeS01F1WmPnfu3KxM21zlYN7+Kjvwc5TK+DSVvKYtj4i45pprkn3VVVc1dO9rSjPPxQ033DD7rFJETQ0dkfeP8r3vfS/7rOmrlTvvvDP7fPzxxydbzxERuYzsnHPOSbbLUfRs8oc//CErO/XUU9u8D/2biMblYeyL3Z9mm4sPPvhgslXOE5E/f/mY9f2k6t/X5PnZnzkVXV9Lc9HDL+ga8bGPfWy178npLnNxxx13TLaHZfnTn/6U7FKbf/nLX062yurmzZuX1dPnAD1DROQp6KdPn57sfv36ZfV0zH3gAx+ovKf2ADkYAAAAAAAAAAAkeAkEAAAAAAAAAFADeAkEAAAAAAAAAFAD3vTfq3Q9qvtTbZ/rOKvif0Tk8UX07zQ9n5dpWjm9dkRZT6pxEqriAzmlWCb6+zWFHfx3tF09JtChhx6abE0RHhFx3XXXJXvWrFnJ1rSDEfk487hRrsOvYrPNNku2pheMWHVsNROagtbHtsaVeMc73pGVVcXT8rmomuif/OQnyfY20ev5PNXr69rhaeAHDhyY7Lvuui
srGzRoULTFmsY+AOjOeIwvRXX0ukdG5PFlnn/++WTfeuutWb3+/fsne/LkyVmZrjMah8bjiekerLH9InL9/nnnndfGr+he6Nqlsdn8DKTrqbajrtVOo+up4/EP68oNN9yQfdb4VMuWLcvK9Hyi/eMpjTVuyNe//vVkL126NKt39dVXJ9v3po022ijZOmZ8zuo80jhCXlfjVrIPQrOgcYA03mtEvj7edNNNWZnGlNPxXIr/qvV8bdT5rDEsI/LnTI1NqXE1IyIOPvjgZOsaEBHxwQ9+MNntEROou6B7+957752V/fWvf022xun5+Mc/ntXTNVnjFnqcu4ceeijZ/mx3wAEHtHm9TTbZJKvn12wG8AQCAAAAAAAAAKgBvAQCAAAAAAAAAKgB3UIOpjIblfe4hEpdzD2Fnko3qlz9InKXXHV3dTdZde/zdNPqxucpdKvuyd0M9bd5iltonJJr8pAhQ5I9cuTIrGz99ddPtqZtdzdMdQf1lI5nnHFGsseNG5fsYcOGZfXULfzEE0+svN9m44gjjki2j9/S/Pj5z3+ebHWXHD9+fFZPJVvqfukulXp9d+t94oknkq0pGX190LVD5WUREbvuumuyXSoGHc9nPvOZZKtk8De/+U1Wb/bs2Z11S1DB0KFDk+3zXl23VQLraVTVbd8ll5raXNNvuxxV57e75utcbwXUDb4kl9f9Sc8o3nZ61vE+1POMXt9lY88++2xD996K6P7kEnGdA94/VedBl4hrSneVpz/66KNZPT37TJw4MStT2UkpTIGOkxUrVmRlBx54YLJVDgYdg0tLVAJ0/vnnd+h3l8ZISR7cFey2227J9vO6omvln//856xMn7m23nrrZF9++eVZvcMPPzzZjzzySLI9bIfOv0mTJmVl+pzw1a9+Ndkf+tCHsnoaVsElvLrf6bz87W9/G63M4sWLk33ppZdmZdpGeqb3dfL2229P9sknn5xsl3zttddeyXaZr6KyXJf0Kc0STgJPIAAAAAAAAACAGsBLIAAAAAAAAACAGtAt5GAqzVEXvnXXXTert3Dhwjb/xlF3Z5dr6fU1erxnllI3L3eZVvdIdfF66aWXsnr6ecCAAVmZ/p1nPoEy6mbnMj7l3e9+d7I9m4n2jbpB+zgoZR97//vfn+xNN9002e72p9lwTj/99Mr7bQbU7VTbSDP/ROSuqytXrszKNGq/zjHNgheRSwx0XrqLpfbxtGnTsjKdi3ofvXv3zuqpW6mvK2eddVayVQ5GFpSO4bjjjss+a/YaHXPXXHNNVm/ChAnJLmU8gtdHadyrG73X0zXhlltuSbbLHIYPH55sX1d0vS1lY1FJzk9/+tPK+20FdNyX+kb3J5UlDR48OKtXksvpGlpq/zqj8nE/u2nbeoZK3e/0XOr73dSpU5OtMgfPYqnnYd1zI1bN2Pka3o863zzbnKJnLvbF1aPRttt4442zz7pOasbD+++/P6v3y1/+Mtl+ttH9tCTrajbJVwnNmKfj158FdI/41Kc+lZXpnjFjxoxka/aoiIjly5cne+7cucn254n77rsv2T5PVeKs8/mYY47J6un67eEMdN4ee+yxyW51OZj24VFHHZWV6TjQ9e6xxx7L6um6q5nVXOKqz+hz5szJyt73vvclW88s/l3av82yTuIJBAAAAAAAAABQA3gJBAAAAAAAAABQA3gJBAAAAAAAAABQA7pFTCDV/ak2daONNsrqaapt1WpG5LEKVNtcShGvWmyP96KfPf6Eagw1XpCmvo3If5frfVVz2Ldv34DG0XSW3m+K6oU9bo3qrRtNu+vxpVSDrKkqfbzouF2wYEHl/TYDmsZd9eWeilO15prKOSLXOuvfeRpS/axt69+l/fOe97wnK9M5p33q6W71+n4f++23X8DrR+eb9+HAgQOTrTE1IvKxpOlfL7zwwqyeplTVGAnQvpRiWJRipOk81b3b55uuxZMnT87KNOX26NGjk63rcEQ+vi655JI2fkXrMGzYsGRr23nMGV0LdW/y9Vnb1dOTax9qX3ssGY9/WCd22WWXyjId63369MnKtH+0PT3mpM45PQ/7HNA9bejQoVnZyy+/nGw9D/sZRmNpeEwVjW8xZsyYZHsKZihTig2y3XbbJfuggw7Kyn784x8n+9xzz032DjvskNXTuCQamzIijw2mMW38PKzPNF7mzzVdzUc+8pFkf//730/2yJEjs3q6j2macP+85ZZbJnv69OlZPW0/nX+bb755Vk/jeGk7+33pPNX4kxERu+66a7KfffbZrOywww6r/LtWZqeddkq2xwHWVO26p3mcNn3+0jJ/h/CNb3wj2d6/Dz74YLJnz56dbE9b/73vfS/Z/lyvsS87EzyBAAAAAAAAAABqAC+BAAAAAAAAAABqQLeQg6kLuOJuXdtss02yPTWepoRW98tGU6qWUqD6NdR9V91/3b1s//33ryxTKZqndYT2QV2plyxZkpWpq7u6Pbubu0r61P3QP6vLp48lTRHZLGkDq9h6662TrW3k7aJpEtXd0su0D/y3V7m9uyTPPyvqxqwu8GpH5K6Znt5Tv1vTe3qaSChTkmYeeOCByb7qqquyMk3futdeeyX7yiuvzOqpLOaiiy7Kym677bZkq1zB5/2pp56abHcv/sxnPlN5/3WilC5Y3aRdjqTrgK4djspOdH2NyNdRXZddqqJng1mzZlV+Vyug5yMdzy6z03VSx7amEo/IUyLrfPO6ekbxNbjZJCKdyTvf+c5k+zj31M6KngN0rfSxrfuiSlBcZqJ7nO/Pmqq+JM10iZmie6tKZuosB2s03XujqDxvwoQJWZk+72g/qQQqIg+F4Xua/p1Ksn0c6DPNvHnzsrI//OEP1T+gC5gyZUqyx44dm2x/jtSQAJ4O/Ac/+EGyf/Ob3yT7vPPOy+rpfFap+tlnn53V0/mx7777ZmU6ZjStuUq8/PrwH/QZS2VYEfna269fv2T78/Sdd96ZbJW/ujxL19cvf/nLWdndd9+d7FNOOSXZKueMiNhss82S/a53vSsr+9rXvhZdAZ5AAAAAAAAAAAA1gJdAAAAAAAAAAAA1gJdAAAAAAAAAAAA1oFvEBNK4AKqz9dgqmqbSU/5pfALVCnpKTNVnauwD/y797OmOVSuteKpPTc3qKauVko4cVqUqZoXqgyPy2DSeQlVjW6i23mMfaLp31ZNG5KliNX6Cj6XBgwcnW7XDERGPPPJINBMaE0jnomvIdQ54fAidz5qe1mPx6LjXtKQeW6YUQ0S1wDrHPM6C9qvfr17jve99b7JJQ/7f0XGh89JTbE6aNCnZqsePyOO76Ty64IILsnqaTtzn4vjx45OtcWW8nsZMqHNsizXlpz/9abJPPvnkrEzn0bXXXpvsHXfcMaune7KnNNY9dP78+cnWmH8R+drkc72747EQ/VxRhbaJ7jkamzAi4plnnkm2r8m6rpfitOk8rRul2CpVe1pEdcw0P0/qeVP3T1/LNN6Lx8HwM2tb14vIx4yfQ/Ucrevr5Zdf3ua160BVHCCfH1X1fC6edNJJyd5jjz2ysp/85CfJ1nOZx
zLRuCkadyoi71Od66X5rPGkmpGquEylZyxfQx9//PFk6xzWlPAR+TlG4wV5nDV9FijFS7vnnnuSTSzY/46eD+64446srE+fPsnWFO6leIS61o4bNy4r0zHi19C1/P3vf3+yPR6hnoFOPPHErIyYQAAAAAAAAAAA0GHwEggAAAAAAAAAoAZ0OzmYutK59OOuu+5Ktkp9InIX81JqU6VRl2ZPo6nXVNvdfe+9995kjxgxovL6/jshx12u1U29f//+yX7Pe96T1dO0qe4GvWDBgmSrnEAlfP5dPubU1fbZZ59Nto8Ddb9vNvmXo+lv1aXc+6BqvkXk8jot87TtKs3R67ncT/vO3TTVdVkloX5P6gbq0hIt0zGEHOy/UyXNdEmk1vO1UFO7nnPOOcnWORWRy7w8Fa5LJV7DUyfrmPZ7rDNVsj7nmGOOSbaniNe1U6ViDz30UFavtMerZEHXW+8rd8dvJXRPi8h/u/aNS911Dqg04qmnnqr8Lu9DX+fbuoeIesvBNtlkk2S77ErlOL7f6T5WGtvaj/o3ngJ72bJlldfQs4neo0tVdC/0PtVzjMt7IafRdPHe/nvvvXey58yZk5VpKnQtO+SQQ7J6mgLb126ViulY8r7WMaJn4+6E/yb9vcuXL8/KtB9U3nn//fdn9aZOnZps3Ztcdqf7kaYJj8j3RZ2zLj1T/JxbJe9sdXQt9DOe7nHaXt52Oje13zS8QEQu89J549fUOfb0009n9bRM31d0JXgCAQAAAAAAAADUAF4CAQAAAAAAAADUgG4hB1M3PnXd6tu3b1bvyiuvTLZHgtco4uoG6O7N6tZVlSksInev9ewj6sKu1/AsD+pa6BIIjUTucjPIcfc+7Q918zzjjDOyeuqq52646l7bq1evZJdcYX2MeGaV1/Co/6Vo9c2GShFK2bbUNdMlBTqH9be7TE4/l2SVJalKVaYIl//pGuPXV1nFzjvvHHWkUffjkjRz2223TbZm34jIM594Fj8t23jjjZPtshhdd5988smsTNdQ/TuXd+q63sqSotWlSjbtLtgqC3nssceyMp1HDz/8cLJdfrnRRhsluyrTZkQu9fT7aGX3eJUnRORzTG2XP+h56eqrr27ou3xfrJK6l+SddUPPC74v6rnOx2xVe3pbar/quuZrtMr8VCrr19D7KJ1NfA3Q+azrMqwen//855OtWfsiInr27JlslXVF5JJn7Sc/o0yYMKHyu6syVc2ePTv7rH3vMhmVaDcDVdI7l8cqvs9o1l/NMrXFFltk9bbaaqtkX3XVVcn250+dY0cddVRWpmchPevouHCqZLl147bbbku2j3PNvnXRRRcl2zOHH3744cnWTKW33nprVk/nm0uo99xzz2R/97vfTbY/A+p3aYbUroSRBAAAAAAAAABQA3gJBAAAAAAAAABQA3gJBAAAAAAAAABQA7pdTCDF08eqLnbJkiVZmWo3NTWnxw7Qz/q9Xq8q1khErgPUtJ3+O2bMmJFs1f5G5BrVqvTG8B9cd68ccMAByX7ggQcq/87jS6nWXtvfv0tjy2gsgIg83WO/fv2S7fFKFI2HEbFqGuyuRttC55jHI1DNsmuxNd6B2h5zoCqe1urEm9C/03nqmmqdp752qD580KBBDX93K9FojJVS3yxatCjZH/nIR7IyHffjxo3LylSTr+nEPb27zpWtt946K9M4T43OKU/h3IpU7WNVMYAc7wPF98Wq+ez7osZT8FgNus5ommv/Lp2nnpK3lBK9O6BxZSLy9VXXtVK8w4svvrih7/JUx0pVbLeIiJUrVzZ0/VZE+8fXEI2PVzp76r7o8ev0DKJzwFOI61rs81nvQ/vRYzvpPWna+oj8nOtxvbqS0tm8q/A5q+ndd9hhh2T/8Ic/zOpdd911ydb9M2LVeHav4Wuyxv0p7c+6Xvg5VM+28+bNq7xGM+P7jM5NP6Pq/qHt4nPAU4W/hrez/t0RRxyRlelc1LKqmKIRq8atrCv6nOZr17Rp05KtMcu8D/V8r9cYM2ZMVk9jHGo8woj8+XHo0KHJHjJkSFavR48eyZ45c2Y0A3gCAQAAAAAAAADUAF4CAQAAAAAAAADUgKaUg3lKdHXjK6VyVtdYd7+skgy5u6i6g6krtLv1qpus32+j7tnqtunX0M96H/obI6pTPNYJdwMcP358si+44IJku/udurD7eNFxoS58mnI+Ipd5eepMdeecPn16sl2Oou7Am2yySVbW1XKwRtPY+tjWOeBtq591DpfSXuq8d9d2db31NUH/TvvbXfFVvuDuolX35VIJdeuF/+Ayyyp0nPsc0JSbe+21V7J9TR42bFjl986fPz/Z6v7raeBHjBiR7P322y8r85ShrUZJAlYlI9D+iIhYsGBBsp977rnKslGjRiV71qxZWT2dV7r2+mdN2eoSCB0bBx54YFb21a9+Nboz3k96ttG1yvtM28QlRlX4fqfrqc4jXzNdUtvq6FhUfO/Qc53vF9q22le+p1VJnF2Sp39XOt/o3PH9Xvd4l8zo+KraZyMalxK3FyUJmJ8VFf09JWlsoxKzgw46KNl+Tv/617+e7K985SvJVslJRC758mcY3dO033wuat/479ffrOPK91ad6/3794/uSGl/87ZVOfmvfvWrZLtU/d5770229pXL6QYMGJDsY489NivT/tJ21/0S2kZle4cddlhWdv/99ydbU7j7vnjllVcmW+ffqaeemtXTvtH+jMjXWpWA6XkyIuKb3/xmss8666ys7IMf/GB0BXgCAQAAAAAAAADUAF4CAQAAAAAAAADUgKaUg2k2GEddG92tS12Q11133aysKiNRyS1av6skVfEMJuouqq6w7jJcirKvbsPqxui/q5XlYO6+WeWafPfdd2f1tI3uuuuuZA8fPjyrp33qUf41mry6inof/vnPf062ZgPz79O+dpdSzbzgmWwmTZoUXYlHt1fU7dv7qiSD1PbUueP1qlyyfd7rNdzFXutqBjCXqpQyilTJvDxT2NSpU9us113w9l+dLGxVqCxLZZruJrvFFlsk+8wzz8zKbrnllmRfcsklyVZ3+Ig826JLM6uy/XmGFZ3DI0eOjFanKmNeo33/qU99qrLM93GVFt1+++3JHjx4cOXfuZRE70vnpY9dleLqWt4K+NxRV3Q9z7i0R93jG8VldrrWujxIKZW1IipFVVzKpX3iGdR0X9S/K8lYdNy7BFb3I58fVfup97euj6V9V8u8LZppX1yTrEol+ZdLl48++uhkT5w4Mdm6N0VEPPzww8nWM5aGF4jI79elsRoaoirzYkS+TpYyTulaq/Ivv4+BAwdmZd4GzUqpH32vUvmkPl/4XFRZ8w033JBsP1doKAqX/Kn0TPfCRiW7dead73xnsl2ipRm3n3nmmWTvvPPOWT2dfxo+RCWbERGf/OQnk+3riM4rHUt33HFHVk/X4T/96U/RDOAJBAAAAAAAAABQA3gJBAAAAAAAAABQA3gJBAAAAAAAAABQA5oyJpCnmFSNq2o1XXf++OOPJ9tjqyj6d6X0m3ofnnZR9ZquwdVrqlbQ9b5PPfVU
sl0fXpX+1rWmnuK4K9GYPaW0miVtbkn/rv22+eabJ9vjImm6aY3LU0pjrvFiIvIYIto3rqnW8aga7YiI7bbbLtkaj8HvV+9rwoQJWdkVV1wRXYnHvVGqYvv4Z48lUNXHjcYh8XSeHvtC0bmo/V0aCx6HxPv1NZopJlApZlmj7Vqq5+ufzgNNGauxfSKqU8RrrK6IiFNOOSXZW221VVZ2zTXXJFtjOXm6W+3r9dZbr7Ks0ZTIugZERPTu3TtajTWJA6RxC/bee++sTGOkafrciIhddtkl2RojYdNNN83q6Vz0dUVjVWgf+z6unzU9bCvgsSKqYrP5+H3yySdX+7tmzZqVfdb4dVWxD72sDmhcFB17PqdK64vGtNT1ttHzUimmnl9D6+p9eEygUkxLPRfp9Zs5JpDGi9xtt92yMv19b3vb25KtsUEj8n1/9OjRWdmiRYuSrfPNx8GRRx6ZbO1Df6bRseRxep5//vlka7/5uNLf4veh/aZnYL+G/p3H1vQ1ulkpxSnz33TSSSclW5+xnnjiiayexpO5/vrrk63PJxH586ifffSZTvvYnxdnz56d7NJzUp3Qc8ScOXOyMn1O0P596KGHsnpjxoxJ9pe+9KU27YiIW2+9NdnHHntsVqYxvnSNP/3007N6Bx544Ko/oovBEwgAAAAAAAAAoAbwEggAAAAAAAAAoAY0pR+fSmwictdVdbVyF3B189K0e/53jboql9wH1SXbZQnqfqnyBU1Z53ja17/85S/J1hSP7qbZTKxJ+s015eyzz062u+tqKk1tO+8nHVc+JrTf1CXa01LvuOOOyb755puzsq997WvJHjp0aLLdlVPHkstpupqNNtqosqwkBys4YHNIAAAgAElEQVSldNf+atSdXWU7Jdd2v4aOSZVNrE469Kpx7e7xXUl7pHNXV/mI3MVcpY0ReVuqG7mn+9bxM2nSpDavHZG78n7+85/Pym666aZkq+zC71ddqX2drEq36muC/p2uARGrrjOtgM6l0t6q/O53v0v24sWLszJde7fffvusbPLkycmePn16sl1uqWugnwU0/ar2v6aAjcjHncvNujuXX3559llT1+r49bFdkshXsXz58uzzxhtvnGxdQ3Wfjcj3gzqkOva02a9ROmsuW7Ys+6x7YWkuapnac+fOzeq5RE/R/aIk49drlORrOherJMBdgUu2Tz311GS7xFnbRPc0fw7Q8fz0009nZdpXej7wNOorVqxI9ssvv9ymHZGfe3yd1OcMLfP5pmuyjyX9bfp3fg0dF6U0881M6XlOZXwR+bzVs46eRSJyOZjie47uWx7SQ89WjzzySLI95TlysFV53/vel2yXg6lcUsfzuHHjsnq//vWvk61nPj+/6LnHn9d1bF188cXJ9jAvKqceNWpUVvbHP/4xugI8gQAAAAAAAAAAagAvgQAAAAAAAAAAagAvgQAAAAAAAAAAakBTxgTydN2qY9W4DK5b1ZgQrvdVDaXartnWzxqPwL+rKk5RRK6H1/v1elX37tfQ+23mdIwaL8BjyUyZMqWha+jfud5W0zZqPY9HoG3pcUMU1YC7Pr8qFbGnftS4OEcccURWpmkDVb/t8St0zLnuvqv19aX20znmKbm1T0oxfEoxE7RtS2lOS/G5tG4pda9ew+eYj6/XUK14V+PxiVR37mnQta7a3nal9Ur/Tsesz6PHH3882RoHxtN2a7wY/y0610vxE7Q/fL2uGge+7moMjAULFmRlpbnQXdH5V4oDdOWVVyZ75MiRydZ+i8g18B5HSuMd6BrgcdBUi++xo3RN0NS9PmdVo9/McfTWBI/To/uJnjd8fmy11Var/V1+DR0jpThtdYgDpPTv3z/ZutaU9jffg6riKHkssqpr+vwtxYnT+ad/V5pHpe/T7/KzQFdywQUXZJ+17Xyv0nVC44n4HND9yc9n2l7ali+++GJWT9tfr+d9rc8gpZiW2v7eZ7ovev9qLKFS3B9tN98X/XmtO+JnPI2vtffeeyf7iiuuyOo9++yzydY05HpejYiYOnVqsn/yk59kZZrm/Jhjjkn2tttum9W75557ku0xLevKsGHDku19o3NTn1X8zHfuuecme6+99kq2jwk9p9x2221Zmcb6US666KLssz4jN8u5hJEEAAAAAAAAAFADeAkEAAAAAAAAAFADmlJb5KkQq1y7Ne32f7uGuudpesaSu666UZakH56uryr9Zgm9p4hqV7GSe25nc/TRR2ef1W3SJR077LBDsjUNn7uWqvurt4m6wWs7eJv069cv2erK6/2k7uwuH9Rr6lhyN0y9J3eB1zGj7rWeLlS/y126PRVhZ1Mav1rmfVBq9yqpo0o9IvI5W0o5XHKr1PYspdUsyWKq2qDRud1RqKSqJKF1SU3VWuYp0Xv37p1sT2uq15wxY0blPep3vfvd7668p9///vfJ9v50OVsjeB/qNXQsuXu83pfvIep63Oqcfvrp2Wd1k54/f36yda2NyNNGq/wvInd7171j7NixWT2V0Xr/6N6qfeoyCu3/KjlnqzBt2rRk65nI51hVum+Xuyje/lUyr1aUSq4OPXv2TLaO0VLK9ZL8Vtu9JOvSv9FU4BGrrtlVlM65ui/6OKkKq6DpsLsal07q2WvgwIFZmf4GXTN8zGsfeptov2lblmReWs/3Pt2DXGKkn/U+fN6XQmjomUsla75/Vq0dEc2drrxKMheRzxeVf0Xk7aJydH/m1OcXTQvvc0/PVjvttFNW9sQTTyT7+uuvT7bvrUozPQd2JV/5yleSfeyxx2Zld955Z7JViu7yc53rKuvylPNab/HixVmZjoMBAwYk2+e9zqPdd989K9PQCSof7GjwBAIAAAAAAAAAqAG8BAIAAAAAAAAAqAFNKQdTl6mI3I1PXSDdJUspuSyq66S7v1Zl4vLrVUlaInIXxEZdJdVVPiLPNqFuaM2UHewTn/hE9lld2K666qqs7Etf+lKyNbvM6NGjs3raN+66qm6Z6m7pEq1Zs2YlW2UyLvnStvQyHXNqe5YHvYa7aOrflaL5V7kjRqyapaGz0Wj2jvaVZwRRd2d3T9a/0zYrZdkrZfTTttWsHl5X+7g0j/w+qvqgq+fi7Nmzk/3QQw9lZTo/3LVb+0rXNZddaTvcfvvtWZn+nY4RlZD5d+l89u/Se9S1LyLfD3xvqLqGs2jRomSXMjuWso+5XK6rWRM3/JL8WTOAeabDefPmtfm9KoOJiBg/fnyyTznllKzskEMOSbZKwNRtOyLir3/9a7J33HHHrKwqW473Y5WUtBVRN/JRo0Yl28er7oXbbLNNsu+7777Ka7sERdtf52Jp7tUBPZvonubnCt3rNbNQRL4mljI6VUm0POuNzgnPhOXS67bu3ev5WaBKDu2hADoblXt4Jib9fXoOjchlOpoFVmU+EXk/eRvoPNAylclG5HuQ7tV+vlSZksqGInIJiu5vvm/pmPMxomu53rufV/Usq2tAW9/XTJTkYDo3XaK12267JVvn22GHHZbV099+7733Jtufa/SspnKhiFwypHPHnwkVn6el83Ero5kxTzj
hhKxMZet6DtV+isgloxpiwfc0nZsf/vCHszK9/q9//es2/yYifzYdM2ZMVtZV/YYnEAAAAAAAAABADeAlEAAAAAAAAABADeAlEAAAAAAAAABADegWYnnVP6oeVbXwjmvZFdVbu75VP6uetKTXK6WKLsWdUTTNa0SeirAqJlJXoBpr1zuqjnaPPfbIyjTOg/5W1x6rtt7T62k7a+wXb1eNS1KKaaOxalx/q/FLVBvq+nxNjev3q7+tlJZaf5en2l2xYkXl/XcGHqNI54GORY+ZpXF0tE8j8vGsmmqP97J06dI2v9fjGeg49D6oigm0cuXKrJ6Oye222y4rq0qLXJrPnYGOlVJ8GNfse4ym1/C2K81TjS2g8Sb8Go3GrdE54eNA21+vX4pDUirT+ebjVtvK1/WuSIVbil9Uio/UKB7L6zVU1x6Rr+0zZsxI9pNPPpnV0xgWvvZqHJobbrgh2b4Ha7wg7x/9zaUUzPpZ4zG0Io899liyjzzyyGT7PqOfhwwZkuxSTCBH1zwdj6U05nWgKqW7nxt1vr3zne/MynSuazwnX8uqzpseS02v7/eh8aH0fjUluX+XxyGpiofoZ5jORmPleNwN/d16vojIY6LpOuPrU58+fZLdt2/frMzXq9copVzXmCR+ptZ29bO/pjjXPVJjAPnfef+WzqWK7ov+W6riSzUDpT1b2+Xwww/PyjT+0uWXX55sXTcj8vVws802S7amJI+ImDhxYrK33nrrrGz69OnJ1vN+1bmzLeoaE0ifLfxcq/GVnn766WRfc801WT09p+he6jGHNYabPy8OHz482Z/73OeSPWXKlKyexlD0awwaNCjZTz31VHQWeAIBAAAAAAAAANQAXgIBAAAAAAAAANSAppSDuQufuiKqq2lJDuYpiFWeom527uqp362usO4qqW6a7n5XlXaxhP+WAw88MNnqrlvlbtpZaF94G6t8RF31I/KUieoO6S582pYlWUhJIqdlJVdVreeprXWclSQZmvrT+0Z/m44DHy9a5ik8Nd1pV+AyIL3XUhrbu+66K9meRrpKguLtp+61+r3e3/p3jz76aFam/aXX8HGnY1elLxGrSsdeo6vlYDo/3M1b5TADBw7MyrQddE65bEw/l6QA+l0+XtYEl/aoW29JetsoVWl8I/I1zde3rkiFq2tUe0huXOqo+4y2y4QJE7J62q86L3/2s59l9W6//fZkb7nlllnZ97///WRr6vdNNtmkoXuPyPtE572PT+2rVnePV1f3KrlcRN4OmgK7hKbZjYjYfvvtk61joiukks2EniO1D3wN0ZTGKt2LiDjggAOSrdJJl3mpDEjH+Z///Oesnu5P3j9z5sxJtsoe/Pyh+6LPdR1fuo90tRxMJTV6DonIzyIuU9eU8SqD97ONziNvryrZVGnf0vOQn5VUFqhS24iISZMmtXmPO++8c1av9Dyic7iUIl5/s/+WZpaCltYllUtqW0bk81klf34W/OY3v5nsffbZJ9kq7YmIeN/73pfs8ePHZ2Ua+kPn23HHHZfVGzVqVLJVrhaR91cz90d7o/u+/25dk1Su5VK9H/3oR8nWsf3BD34wq6eyMZeYL1y4MNkqF917772zero3eB/uv//+yb7pppuis8ATCAAAAAAAAACgBvASCAAAAAAAAACgBvASCAAAAAAAAACgBjRlTCDXo6pOT+NDTJ06tfIarktW7WApno9qeqvSxUfkGlwv08+qm3YtrcY28ZhAVekaPV5GZ3PiiScm+2Mf+1hWphpK1a1H5HELtM09boHGNvEYMfq5So8ekafIrErTHpG3v8cO0rSdP//5z5M9bty4rJ7qhfVvIvK4M3p9H5uluB8aO6MZ0L4rpRQ944wzkj1v3rysTOOSqB5+wIABWT2NhfCb3/wm2f369cvqqZZfx1lEPoc1VoCntlYt/sknn5yVVWmsPTZRZ1OKiabomhmRj1nVOW+44YZZvUZj4JQ06BpnoNH4aL5O6vxWTXV7xAfy31iKG9XVqcZPOOGE7PPuu++ebN3TPHaEjnufOwcddFCyNX2y72kam0LjnHjsA53bHutn2223bfMavgfrbynFtdF4HB4/TvtR43u0IrrmaexDX591njYaE0jTF0fkMTZ0jPj+WbeUxfr7td09JomuIQ8++GBW5p+bDR8LGqNEf7/P2c6mNPb0jObnNX2e0LHt+7zun16m5xmdi75H6hqnqad1rY5ofM9UPI25jkePYaRUxdyMyPdJL1u6dOlq32Nn4bHiFI0TM3ny5KxMx8a+++6b7MGDB2f19tprr2RrjCndLyPyc5afc6+//vpkP/DAA8l+//vfn9XT/dTjyehZqPSbuzt+ftGYhv48qv2ra9L/+3//L6unzw+nn356sm+44Yasnl7jAx/4QFb2/PPPJ1tjsfm5RGP97LnnnlnZpZdeGl0BnkAAAAAAAAAAADWAl0AAAAAAAAAAADWgKeVgLiVS9051yXrqqacqr+FSoqqUsaUUglrm6aBLf6cunCXZmEpQPG1dldt1V6elVv7nf/6n8rNLajTdobrpeQrVWbNmJVvTr0fkbVLlfh2Ru2Wq6+Vvf/vbrN61116bbHXJjFhVYvYamk4wIk+DvGzZsqxMUynr/XqK9JLMpKslR/796t5YmgOaCtfxVLari7vC3nLLLa/rehG5S7ajrrbq4q1u4c2Muybr55KkFroeld+ef/75WZnOP5UJl2THJdSN3NdelRGo/MjdszXd7Ysvvlh5vyrZ9fVQJWYLFizIytTtWu/X9/vevXsn29MutzK69/l+pOcel35WUZJ5lc4ldZCAKbqmapv5uCxJWDtTQqd9V/pelf64zKQq9XhX9317fL/+bpcFz5gx43VfvyO5/fbbu/oWmobSWFCJ1oc//OGs7Lzzzkv2T3/602RvtdVWWT1db++4445kP/TQQ1m9nXbaKdm+JowePTrZp5xySrJd/n3rrbeu8hteo3QWbyX02dE/H3LIIVnZlClTkq1rsvZZRD4OlMMOOyz7rGciDVsSkYe80L3Qz96aBn7atGlZmf+2zgJPIAAAAAAAAACAGsBLIAAAAAAAAACAGtCUcjCXCFVl21q0aFHD16jKKuOZDPS7ShkC1M2wFI1d3fQ90r9miZo/f35WpvdYdU9dgX5/KSvQJZdcUvlZXRc9QrpGe99+++2zMu1TzXjksgOVuBx++OHJdhfNNWHSpEnZZ5UZPvroo1mZyhA064NK3iJyt32VG0Xk0qfTTjttDe749eF9rHIMnRMl+Zej8r3SGKpyUy+5vvr1tK7OKY/arzz22GPZZ5VO6PVL2dEA1gSVMUVEHH300cl2iZauFTo/XF6q657vM+oavd9++1Xeh8oldf3S7HsReQYTl9TqetinT59ku/xbJbfvec97sjLN5KH7qcupf/KTnyT729/+dtQFlSSMHTs2K1u5cmWyVbbnfaiu86XsqXrucdlY3aiSQ3n7+VlFWRMZU6MykJLMq1Eefvjh7LNKY+oiR4HWQc91Lu/54he/mOxSRmadR6V6em52Saj+XSl8iGcVriNHHn
lk9lnPDhMnTszK9PlRz0Qe+kP3TA3x4O0/cODAZC9cuDAr02ch3U99b/3hD3+YbH9+OPfcc5N94YUXRmeBJxAAAAAAAAAAQA3gJRAAAAAAAAAAQA3gJRAAAAAAAAAAQA1oyqAWJX25aq9LMYE85oemedRrqMY9ItdnVsVciMg10P5dek2NXeO/S8scTaHbaAyVzqA9vl/b0tN7t0e6747ksssu6+pb6FQ8fWKV3tXjf5TozHSypfgMjaJ6cV0fPG4KwOvl7LPPzj5rCnZPw63xfKr2nIg8Ba3HC9KYY9ddd12yBwwYkNXTdaBXr16V36Vx2wYPHpyVqX5f146tt946q+fxfRS95uLFi5Ot+2Wr4/FXdF2bPn16snfZZZesnq7RGtPA2/+BBx5Itseh0jhAmurY4x3WDT0D6lz0dNCeMlhZkxTx7b1/erwS7VeNXejo/Kt7fChoHvS85uNS9yMf977XNkLp2Wi99darLNM5rPPNY9JsueWWyf7FL36x2vfXCniMHe0336s0bfugQYOS7X2rZxjdC5966qms3pVXXpls7YuIfCzpuURjAEXk8RS9TGPZdiZ4AgEAAAAAAAAA1ABeAgEAAAAAAAAA1ICmlIOpm3tEnrZNXZrV3SsiYsSIEcnWFLQRuaRDr+futJrqTV3x/Z7U/dVTxKtb7wsvvNDm/flndcGOyOVrVal1ATqaj3/849lndUPVcXnXXXc1fM2OloC93u9VWUxExCGHHJJsTfF71llntc+NAfz/nHLKKdnnM844I9k6DiMiDjrooGRvvvnmyXbXc3Wh3muvvbKyffbZJ9nqiu4SZ507KnEppYbWvS8id6/W9K2rg8vZqlA3cXer99/W3SjJwe6///5kn3DCCVk9dYPXtfvggw/O6ulZpCRx0PtwOUXd0HbS1NMuX/BU1Mraa6+d7NWRV7cnpfns96RSG7X9rAzQVZRkqrfddluyhwwZkpXpnrHBBhskW+XOEbkMyPeZKly6rPc4d+7cNu2IiB//+MeV1/Rn0Fbl5ptvzj7vv//+yfazzcUXX5zsmTNnJtv3f12jVY6v7wIiIiZMmJDsfffdNyt7+umnk33DDTcke911183qqax+m222ycp+9atfRVeAJxAAAAAAAAAAQA3gJRAAAAAAAAAAQA3gJRAAAAAAAAAAQA1YqzPjc6y11loNfZlqqiPytG2qa7/++uuzeqq323vvvbOyGTNmtHkN17yr5lM1gap5jsjjHXiaeU3pru3reutSKlaN1aBaU9X8R5RTjiqvvvpqtdh7NWi0D6H9aa8+jFjzfrzooouSrdr/73znO1k9HduulS7FmegsSnE1tt1226zsmGOOSfa0adOS7b+5UZiL3Z9mmIuNonuXp23X2DDDhg1r828ich29xjCYM2dOVm/58uXJ7oiUp3pfuo6saUyE7jgXS+uplt19991ZPe2b++67L9mXXHJJVk/jnh1xxBFZ2ZlnnpnsZ555JtkazyYiYr/99qv+Ae1MM8xFnVc77rhjspcuXZrV0znhMS2177pqjyyNrbFjx2ZlX/jCF5Kt8YLuueeerN6FF17Y0Hd3x7kIOc0wF+H1013m4uWXX55sj6E0f/78ZN96663J1mfriDzOrqaB9zh3GlfI4x0qAwcOTPaoUaOyMv07j4v4rW99q/Kaa0KjfYgnEAAAAAAAAABADeAlEAAAAAAAAABADehUORgAAAAAAAAAAHQNeAIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANQAXgIBAAAAAAAAANSAN3Xml73hDW949TX71VdfLVVtiDe/+c3Z5/XXXz/Zw4cPz8pGjhyZ7P79+1dec8mSJcmeOXNmsv/5z39m9Xr37p3sIUOGZGUbbLBBshcuXJjs+++/P6s3derUZL/00ktZ2SuvvFJ5j2vCq6++ulZ7XGettdZ6/R0HDfOGN/zfe9pXXnmlXfowIuKNb3xj6sd///vf7XXZ1eZNb/q/Jeitb31rVrb22msnW9eLtdbKm0Gv8b//+79Z2Ysvvpjsf/3rX6/vZtuJ9pqL7b2eQhkbZ+02F1lTOxddP/797393+31Rf8/uu++e7NNOOy2rp2cWXSd97dDr+Zqpn5977rlk33XXXVm9K664os16HUF7racRzbMv1oWOmIvsi53LG9/4xmT/61//aom5qL+p6hwaka+jpXvUca7Xi4jo0aNHsvU584UXXsjqdWYbNOsZVZ+HIvJnhk022SQrGzNmTLJ79eqV7EWLFmX1ZsyYkeznn38+2ToGIiI22mijZOv7hIiIESNGJPttb3tbsqdMmZLV+/3vf5/spUuXZmXt3T6NPi/iCQQAAAAAAAAAUAN4CQQAAAAAAAAAUAM6VQ7WHu5O66yzTrIPPvjgrGz77bdPtrpnReTuYOpCphKyiIg+ffokW93B/N7VLdrd9qokKMcdd1xWb/r06clWN7GIiF//+teV14f60FEuoB3tWqrurz7HJk6cmOwjjjgi2UOHDs3q6TzV67lLqH52+YK6el5wwQXJvueee7J67S2/7Axwde9cmkVO2MzoXHS3d/38jne8I9k+jnW/cxl2o+tWaW602rzR37PNNtskW93hI/L2d1d3xSQ6WZl+Vnmku+LfeOONye5oOVh70owSMJc/++eqf9e5qH3lZbquuZy6o9ujI+Ziq83vZqejzk7tPfY83MC2226b7H333Tcr22mnnZI9YMCAZPscU2nR5MmTk7148eKs3qBBg5K93XbbZWUbb7xxsl9++eVk33vvvVm9L33pS8nWZ8eI5ly3Itp/Lvq+1bdv32QfdNBBWZnuhdr3K1asyOotW7Ys2Xre8OeWgQMHtvm9Efl7g379+iVb+zMiYr311kv2d77znaysPc6XazIO8AQCAAAAAAAAAKgBvAQCAAAAAAAAAKgBvAQCAAAAAAAAAKgBa3WmfnZN06iqnlnTnu6xxx5ZPdX2/eMf/8jKquL7aDr3iFz3p6n7XPf697//Pdmeck7TzOm9axyEiDy+ketVVQ/62c9+ts3vXR1IEd/9ac9UuO3djz179sw+f+9730v2jjvumJXpPFANrscj0LJG1ynXDGvsA51js2fPzup9/vOfT/af/vSnrKy910jmYvenmediZ+LxuTSu1+mnn56VDRs2LNkaQ2bdddfN6mlMII998MQTTyR7wYIFyfYYDI8++miyZ86cmZVZWt+Wmovf+MY3kj127NisrCp2h8cR0DOLr
8kat0BT4Xrsg3PPPTfZHn+tvelOc/Htb397sj0G3vDhw5Ot6YgHDx6c1dMzq8Z58hhcOq98jr3lLW9JtsY58bPsddddl2yNUxmx5mfRKtgXuz/NNhd1z7n66quzMo3TUzrjlWIe6XlT90K/nn722C9V51yPP6Tr8rRp07Ky/fbbL9ntMS+baS6W4ouOHz8+2bvssktWpnuXxun1vnnzm9+cbI3Zo/tbRL5mOnoNPdtsuOGGWb2HHnoo2X4+8lhFr5dG+xBPIAAAAAAAAACAGsBLIAAAAAAAAACAGtCpKeLXFHWbVbe3hQsXZvVUhlWShahbnbvmaTrTv/3tb5X11EXQpWdVLn16vYjcbU/dySIiNt9882SPGjUq2epOBq8P7RtPoar926zpF7salYDdeeedWZmmUFR5R0TEs88+22H3VEqzqPOtV69eWZnK1/y3HHPMMckm/WzXonOWvuh6fJ/dfvvtk73DDjtkZTo31fb1VaUro0ePzspGjhyZ7JLr/DPPPJPsT3/601nZI488Eq2KShxcoqVtpPubS77UDV6lCl5X3eN9HPj6WlcmTpyYff7Wt76VbJWGReT9o2dbH9tV495lK9pXXqbzT8MSbLbZZlm98847L9knnHBCVnbAAQckW8/NAF2Jrks/+9nPkq2p2CMiXnrppWT7uVHnmO5PPhd13fNniEau52VVdkS+nus+GBFx4403JnvXXXetvI/ujsvP9bNLwnX9K4Wd0P7QUBUuB1OpmId20bVcx5X3tY4Xl+i2txysUfAEAgAAAAAAAACoAbwEAgAAAAAAAACoAbwEAgAAAAAAAACoAd0iJpCmftM0mK4P1DLXQGtd/ztFtYOqE3UdYSk2RVX8oZIW0cv0GiNGjEj2ww8/nNUjLsaao5rPAw88MCvr3bt3srXNPZ6Eph6sG5/97GeT7TEgli1b1tm3s1r4fNP14r3vfW9WtuWWWya7leOJNCuqo+7Ro0eyPRVqe6cshv+Oa95VN1/aZ3Vf9D1M912PSaNlut/7Nfr165fsIUOGZGWtNoe1HTQlssdp0TgG2q4rV67M6ml8Qu9fXSff+ta3JttjZXi68jqhMUm+/OUvZ2UaB6IUs0nxmJP6WftqdWIC6T1qmdfT7ydfrigAACAASURBVNKxFRHxiU98ItnnnHNOm/cO0NnstNNOyR48eHCyfZ3TOC4+7vUsoWtgKZ6PzkWfy6W9UMtKz7d6TV87xo0bl2x9rmmFM1EpnpL+Po/rVBUHqFRPzyUe564UR6/qu/x+NS5waUx0Js1xFwAAAAAAAAAA0KHwEggAAAAAAAAAoAZ0CznYwIEDk11Kq65ucJ6OvcodrCQLUfc7d0svpZmvSilecpn2dHT62zTddildKKwe22yzTbLdnVnlYNoX7oY5a9asZJ9//vlZ2R133NHm37mbofa9y8vcTbWr0TG73377Jfv555/vittpN0rz6LTTTkv2scce2wl30/qUUq326dMnK1M58Pvf//5kX3311Vm9q666KtlVazB0LOp+r3KhiHyf1P3Tx4J+LrlM65z1PVg/u5ym1caGSt/WX3/9ZLskV+VgKgfyelWypIi8XbWe72mt1sarwwYbbJBsX8tK8gwd61V95dcoSQ/0s19Dr6+yGE9zrf3qfawSej37+Jka2gfvmyo5rJ9R68amm26abD0/+/jVzyXZjrbnCy+8kNXTdOCKPy/qHPN9UefiOuus06Ydkc9hf16sSj3eCnKwErrW+LivCu1SkvSVpH+lM4Wibe5jrkp61pXgCQQAAAAAAAAAUAN4CQQAAAAAAAAAUAOawx/JcHdkzQijZS4HUze7kruWugi6y5e6b5XczUsRy6vquWtYyb1PXYPVLbDkqg1lvO323HPPZOsYi8hd+tR2eYJmzPjZz36WlanrqMql3M1T3RjPOOOMrOyWW26JZkLd29XFtbu7IOs8dQmeuhdD++CusIMGDUr2Rz/60axs7Nixyd5www2TrXLEiIg//vGPyfbMSNAx+Jq6+eabJ9v3NN2TdS8sSb7cdV73fL2G788qq3XX/FaTUKus2WXwip43dA/y9U5lJt6uSqkPS3/X6mimTB97JbQfdO74eNWxXZIe6FxRyYl/LskSSudNHUMqQ1yyZEnl38DqoXN23333zcq23XbbZOs4uOKKK7J6zzzzTAfdXXMyd+7cZOvc8Tmg88PnqT7T6Vlizpw5Wb0FCxYku5Q9auONN052z549szKVj+pc9Hmpe6GevSPyvXXFihVRRxqVIPtepX9XyiKun9f0DKF7qz9zdhV4AgEAAAAAAAAA1ABeAgEAAAAAAAAA1ABeAgEAAAAAAAAA1IBuERNINdallJWq2Suly1Ttpuv+qtJbum7ev7sK/V7XEarm0+MnaBtoHAy/32ZLId7M+LgaPnx4shttRx9XpRSMqjnWlPOuBdXvHjNmTFZ22223NXRfncX222+f7FaLrfEaHt9I016X1hhoHF/H+vbtm+ytttoqK9too42Srfp5jYkQEbH//vsn2+Mi0FcdQ2mfVf17RB7HoNH9s6TL1+v7/lmnfXHcuHHJ1tgWHntO20tjjXgMF52L3ofazmr7ualV94ZGGD16dLLXdN3Rv/O5ovF3tB99rujfeWytRuef9qPHTdH70JhuxARac0pn1JNPPjkrq0ot7vvnpz71qWR3RHygZotT+uCDDyZ72rRpyR4xYkRWT+eLj22NuzVz5sxkT5kyJaun8YcUj1erZ0qPV6t71XrrrZdsj6uma6zOvYiIu+++u816rUZpPS2tf6X9SJ/Dtd90bfXP/l26T5Zi5emcHTJkSFY2efLkZHfmeRVPIAAAAAAAAACAGsBLIAAAAAAAAACAGtCUcjDHXVlfo5Ra1t2/1OVOba+nLnilFPGlVHKKuhm6i7q6cLr7YJUczN14W9n1r73xceTSq/ZGx4y6HLrcSD+7e2mzpdrdY489ku1ura2CS/zURVclLeoyDKuHpz/t379/slX+5Z/VJddTvn72s59N9vTp07MydePWNdld2UvpsZGUrYrvn/rZ90WVD5X2zEbR/nA5mO6TrTZPve122GGHZFdJCyLytUvHtkomInI3dZ9jOl/0zOLjYHVSo7ca++yzT7tez8+NugbqOuprWXvMMaUkB1Np7qRJk9r1e+uEn+/32muvZKvkztG5uNNOO2Vl5557bpt2RMSzzz6bbJ3Dvp6qDFv/JqL5zoGaIv3UU09N9tlnn53V22yzzZLte7uOdb3eCy+8kNVbuXJlsnX++bPG888/n2xv2ypZ7UsvvZTV02cDT1Wvv7OV8TVt3XXXTbY/Q2tbluRgOuf0/OdS6NJaq9fQvbV0vtx4440rr4EcDAAAAAAAAAAA2hVeAgEAAAAAAAAA1ABeAgEAAAAAAAAA1ICmjAnkOjrV+qkm0ONKqN7O9YGqtyuliNdraj3X6OnfldJtllJK6/X9t+hn1V57PWgcTW8eETFw4MBk/+1vf+vQ79Yx7bFG9LsXLFiQlTVbHBJPy92KeP/oujJ27Nhk33rrrZ12T62AzoHevXtn
... [base64 PNG data elided: figure showing ten random test images (top row) and their reconstructions by the network (bottom row)] ...\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(20,4))\n", + "for i in range(10):\n", + " idx = random.randint(0, len(test_dataset))\n", + " img, _ = test_dataset[idx]\n", + " x, _ = test_dataset_t[idx]\n", + "\n", + " data = x.as_in_context(ctx).expand_dims(axis=0)\n", + " output = net(data)\n", + " \n", + " ax = plt.subplot(2, 10, i+1)\n", + " ax.imshow(img.squeeze().asnumpy(), cmap='gray')\n", + " ax.axis('off')\n", + " ax = plt.subplot(2, 10, 10+i+1)\n", + " ax.imshow((output[0].asnumpy() * 255.).transpose((1,2,0)).squeeze(), cmap='gray')\n", + " _ = ax.axis('off')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manipulating latent space" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now use separately the **encoder** that takes an image to a latent vector and the **decoder** that transform a latent vector into images" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get two images from the testing set" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJIAAACPCAYAAAARM4LLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAACsxJREFUeJztnduLFdkVxr9le7/ftdXWUdFRCUJkCMYEEaOo8zIP4hWCoOBLAgkEzEzyByiCeRCDIEYnD9EYiKAEYYjaAwbjoNHBqENPa7z1qPF+v7buPHR5sven59Q5fbbn1LG/HzRdX+06Vbu7V++9au1Vq8w5ByHKpVO1OyDeD2RIIgoyJBEFGZKIggxJREGGJKIgQxJRkCGJKJRlSGY238yazOysmX0aq1Oi9rD2RrbNrA7AtwDmAmgBcBTAMufcmQKfURi99rjpnBuSdlA5I9IPAJx1zv3HOfccwJ8BfFLG+UQ2uVjMQeUY0kgAlz3dkuwLMLPVZnbMzI6VcS2RcTq/6ws457YA2AJoanufKWdE+g5Ag6dHJftEB6QcQzoKYIKZjTWzrgCWAtgbp1ui1mj31OacazWznwP4AkAdgG3OudPReiZqinbf/rfrYvKRapF/Oec+SjtIkW0RBRmSiIIMSURBhiSiIEMSUZAhiSjIkEQUZEgiCu980bZWMLNApwVqDx48mNvu1q1b0Pb06dNAnzx5MtAHDhwIdGNjY6AfPXpU8Nr9+vUL9JgxYwLds2fP3HZ9fX3QNmnSpECvXbu24LWKRSOSiIIMSURBa20JXbt2DfTz588DPWvWrEDv3fv/RId79+4FbUOHDi147jRevXoV6E6dSvt/f/nyZd7PPnjwINAjRowI9FumVa21icohQxJRkCGJKOj2P4Fv/5m+ffsWfa779+8H+vHjxwWP92/XAaBXr16B9n0e4E3/jdt9unfvHujm5uZAp4UaikUjkoiCDElEQYYkoiAfKSEtnnbixIlA9+nTJ7fNPgvHbgYOHBjoFy9eBLpLly6BfvjwYaA7dw7/THz+urq6QLe2tua2e/fuXfBcsdCIJKIgQxJRkCGJKHRYH4l9BfZbmMuXLwfa92PYZ+G4DvtAHNvh41mzD8TnY+2vp3F8bNeuXXgXaEQSUZAhiSjIkEQU3hsfif0UTn9NW68qlSNHjuS258yZE7TxWtuVK1cCPW7cuECzj9OjR49As4/07NmzQPPP5uczca5UrLU1RiOSiEKqIZnZNjO7bmanvH0DzezvZtacfB/wbrspsk4xI9LnAObTvk8BHHDOTQBwINGiA1NUzraZfQDgb8657yW6CcAs59xVM6sH8KVz7sMizhMtZ5t9Is5zToPzqJcsWRLoTZs2BdpfWwOAQ4cO5bZnzpwZtN26dSvQhw8fDnRa/hE/QnThwoVADxs2LNCjRo0K9J07d3Lb7E9xXGnixImBfsvv8Z3mbA9zzl1Ntq8BGFboYPH+U/Zdm3POFRppzGw1gNXlXkdkm/aOSP9NpjQk36/nO9A5t8U591Exw6OoXdo7Iu0FsALAuuT7nmg9KpI0n2jx4sWBXrhwYaA5B/v48eOB5ljQmTPhmzFmz56d2/bzf4A3/al58+YFmuNCHNthv5V9Ks4x4vP5xw8fPjxo27x5c6BL9S3zUczt/04A/wTwoZm1mNkqtBnQXDNrBjAn0aIDkzoiOeeW5Wn6SeS+iBpGkW0RhUw/+89zv7+mtGDBgqBt3bpwdr17926g/Wf1AWDDhg2ldKUgHKvhdTxu5xgW/w04RsZrb3x+/rz/sw8ePDhou349vC9qaGhACnr2X1QOGZKIggxJRCFTPlIp5fd27NgR6D17wlBWqbnJpZb+K1SD6Pbt24FO84nYh+Lzpa0rcj6SfzznpnPZQPa/uGwh5COJSiJDElHIVKptKdPs8uXLSzp32vSQdm1O5fDP19LSErT1798/0KVOXZx6y33lsAh/3p9K+fFvZtmyMN68ffv2gsfnQyOSiIIMSURBhiSikCkfKQ3/Fp39CL59T0sxTWPr1q2B5ur6/iNGnJLCaSV87bRHutkn4vPxo1b8uPmTJ0/yXstvA4CVK1cGWj6SqCoyJBEFGZKIQs36SGmPXKeVmmHWr18f6FWrVgWaywr76a7sd6SlynI7+0Dc93KWb9Ie954xY0bBcxWLRiQRBRmSiIIMSUShqj4Sz/08n5e6Hlbos8zOnTsDvXTp0kBfvHgx0IMGDcrbl7RyxWml/RjuO/+eOG7EPpYP+28csyr1FV750IgkoiBDElGQIYkoVNVHSounlMOKFSsCvWbNmkBPmTIl0JcuXQo0+zmc1+P7FhwnYvjnZ
M3X4va0V5OyD+X3lUvgcOot51K1F41IIgoyJBEFGZKIQqbW2qZPnx7oRYsWBdr3a4YMGRK0TZ06NdAcL2FfgF+bxY82p8WGfL+EfZi0OBDrtOPTcrz5+LFjx+a2OebEeVYcL+OSPPx693xoRBJRKKY+UoOZNZrZGTM7bWa/SParRLLIUcyI1ArgV865KQCmA/iZmU2BSiQLj2IKbV0FcDXZfmBm3wAYCeATALOSw/4I4EsAvy7l4ufOnQu0P7cDb5bf89eoeL3q2rVrgeZyevzY9OjRowPNfgo/ysyvXC/kI6W9sovhuBG/hov7wv4fc/78+dz2xo0bg7YbN24EmvOR+Pdy+vTpgtd6TUk+UlJv+/sAvoJKJAuPou/azKw3gL8C+KVz7r7/H1moRLLKI3cMihqRzKwL2ozoT8653cnuokokqzxyxyB1RLK2oecPAL5xzv3Oayq5RHJdXV3wDFhTU1PQzjELXsPyfQX2E/h5e/4s5+Vw7nLaq604H4nbyyEtz4p9MPZz9u/fH+h9+/bltvlV8fzKiFLzwfNRzNT2IwA/BfBvM/s62fcbtBnQX5JyyRcBLM7zedEBKOau7R8A8j2mqhLJAoAi2yISFV1r69y5M4YOHZrTu3fvDtr9+Afwpl/irwPxq6X4+fvJkycHmsspMxz7YX+N/RL/2X+Od928eTPQXAqQSzdzjIpjYIXyjd7G+PHjc9scP+Ofi+sIlFoj4TUakUQUZEgiCjIkEYWK+kitra3BKwz4teXTpk0LdKHYD+d3c8415x+x5rU69h3YD0mrA1mojf0QhnOrfD8SeDNmxWtxfD3f5+J424ABYZIGv/ZUPpKoKjIkEYVMVf7nYZjflOinOPCSCA/ZPPXxLTi/JYiP57I5PM360w0vYXD1/LRHtDnMwbfsrNPSgv23bPPvcNKkSYFml6CxsZG7p8r/onLIkEQUZEgiCpnykUQmkY8kKocMSURBhiSiIEMSUZAhiSjIkEQUZEgiCjIkEQUZkoiCDElEQYYkolDp0n830fZU7uBkO4tktW/V6teY9EMqvGibu6jZsawWlchq37Lar9doahNRkCGJKFTLkLZU6brFkNW+ZbVfAKrkI4n3D01tIgoVNSQzm29mTWZ21syqWk7ZzLaZ2XUzO+Xty0Tt8FqsbV4xQzKzOgC/B7AAwBQAy5J63dXicwDzaV9WaofXXm1z51xFvgD8EMAXnv4MwGeVun6ePn0A4JSnmwDUJ9v1AJqq2T+vX3sAzM1q/5xzFZ3aRgK47OmWZF+WyFzt8FqpbS5nOw+u7d++qre0XNvcb8tC/3wqaUjfAWjw9KhkX5YoqnZ4JSintnk1qKQhHQUwwczGmllXAEvRVqs7S7yuHQ4UWTv8XVBEbXOgiv17KxV2Gj8G8C2AcwB+W2UHdifaXtbzAm3+2ioAg9B2N9QMYD+AgVXq24/RNm2dBPB18vVxVvr3ti9FtkUU5GyLKMiQRBRkSCIKMiQRBRmSiIIMSURBhiSiIEMSUfgfIl7sIAGpIRsAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJIAAACPCAYAAAARM4LLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAACZRJREFUeJztnUtsVdcVhv+Feb8JD2Nsg4OwKjFAqhRVoFYC0SJoJmFUBUHEIBKTVmqlSCRph0zKpLNOkEDpoHJVqZWSQSSrRNSoUIE9iKgJAkwRD2Owzdvmadgd3Bv37D/xvde+y/eew/k/yeL851zfsxP93nudvddex0IIEKJaZtS7AeLNQEYSLshIwgUZSbggIwkXZCThgowkXJCRhAtVGcnMdpnZRTPrM7NPvBolsodNdWbbzBoAXAKwA8BNAN0A9oQQvinxO6mdRm9tbS15fWxsLNIzZsz43mMAGB0djfS8efMi/eLFi0i/fv060g0NDZEeGRmJ9JMnT0q21ZnhEMLKch+aWcUNfgSgL4TwXwAws78AeA/AhEZKMwcPHix5/e7du5GeP3/++PHs2bOjaz09PZHetGlTpK9fvx5pNt6iRYsiffr06ZLfP81cq+RD1QxtzQBuJPTN4rkIMztgZj1mVtP/elFbqumRKiKEcATAESDdQ5uojmqM1A8gGVi0FM9lkm3btkV6+fLlkX7w4EGkk0Pdhg0bomtbt26NdHIYBIDjx49H+vnz55HmmOvZs2eRrvHQVhHVDG3dANrN7G0zmw3gfQBf+DRLZI0p90ghhDEz+xWATgANAI6FEM67tUxkiqpipBDClwC+dGqLyDDTHmynldWrV0d61qxZke7s7Iz0nDlzIp2cG1qyZEl0bebM+H/rrVu3Iv348eNI8/QAx0SnTp1C2tESiXBBRhIuyEjChdzGSOvWrYs0L0usXbs20q9evYr03Llzx4+vXLkSXbt06VKkt2/fHuktW7ZE+uHDh5E+fz5++OW1uTSiHkm4ICMJF2Qk4UJuY6SdO3dGenBwMNKcA5SMiQDAzMaPFy9eHF1btmxZpDlNhOeROB9pwYIFkX706BHSjnok4YKMJFzI7dDW3Bzn4JVLf+VlkGRWJA+DTU1NkeahiofJCxculLxXFlCPJFyQkYQLMpJwIbcxUltbW8nrHPcMDQ1FesWKFePHLS0t0TVOreXf5S1gCxcujDTvSuG0lDSiHkm4ICMJF2Qk4UL6B99pgtNIrl69GulycUkyhuIt2cnlE+C7KSp37tyJNC/PlJvjSiPqkYQLMpJwQUYSLuQ2RuL1rHKpHJz6sXLl/yu9HD58OLrGKSr79u2LNFc2Sc5JAd+Nsa5dq6ggSF1RjyRckJGECzKScCE3MRJvuWa4tAxvEeLfb2xsHD/u6OiIrvHa2f79+yPNa228tsaaY6o0oh5JuFDWSGZ2zMwGzaw3ce4tM/uHmV0u/rus1HeIN59KeqTPAOyic58A+CqE0A7gq6IWOaZsjBRCOGlmbXT6PQDbisd/AvBPAB87tssdzj/icsdc2o+vc/nkM2fOTHivrq6uSHNMxPNE9+7dizTHWDyHlUamGiM1hhAGise3ATSW+rB486n6qS2EEEpVqzWzAwAOVHsfkW6m2iPdMbMmACj+OzjRB0MIR0II74QQ3pnivUQGmGqP9AWA/QB+X/z3c7cWTRPr16+PNM8bseZ8pOS8EQAcOnRowntxWRuGXxHB5ZA5PssClTz+dwD4N4AfmNlNM/sQBQPtMLPLAH5W1CLHVPLUtmeCSz91bovIMJrZFi7kZq1t6dKlkeZXVXHpGJ534vJ+PFdUCs7RZjge43mmLKAeSbggIwkXZCThQm5ipDVr1kSa543K1UPifW+T4fbt25HmfCNeiyv1ugoAePr06ZTbMl2oRxIuyEjChdwMbfxGSE7dePnyZcnfP3fu3JTvPTAwEGlebuG3CvAW7TQOZYx6JOGCjCRckJGEC7mJkTg1g+MSrtbPb5Q8ceLEhN/NSxr8OM/LK7xFm5dI+A2SWUA9knBBRhIuyEjChdzESJw2wjESlzTm10Dwax+ScIzDc1Ld3d2R5jdKcgmd+/fvT3ivtKIeSbggIwkXZCThQm5iJJ4X4jQR3rLNMRW/Cmsy8JZrfuMkx0hcTjkLqEcSLshIwgUZSbiQmxhpeHg40rw9abL5SUnKbR/i7+K0Xp7TyiLqkYQLMpJwQUYSLuQmRuItQTyXw1uAOH+Jty8l4ZiH4e3g/F2cC6W1NpFbKqmP1GpmJ8zsGzM7b2a/Lp5XiWQxTiU90hiAj0IIGwFsBvBLM9sIlUgWCSoptDUAYKB4/NjMLgBoRsZKJN+4cSPSnH/E5fjKvXIiyWTL0PC9OJ8pi/NKk4qRivW2fwjgDFQiWSSo+KnNzBYC+BuA34QQHiX/CkuVSFZ55HxQUY9kZrNQMNGfQwh/L56uqESyyiPng7I9khW6nqMALoQQ/pC4lKkSyb29vZHmksQct/DcD6/NJfOXysVI5fa98fVS+eFppZKh7ccAPgDwHzP7unjutygY6K/FcsnXAPxiepooskAlT23/AjDRn5xKJAsAmtkWTuRmrY3303NONud08/oZ79fnHO9S8LwQfzdrLg2YBdQjCRdkJOGCjCRcyE2MxHAON5cgHh0djXR7e3uk+/r6xo/LzSONjIxEmj/PmuewsoB6JOGCjCRcyO3Q1t/fH+lVq1ZFmt90XWobNS95MFzumKcDeOqBpxqygHok4YKMJFyQkYQLuY2R+JUQu3fvjjQvqZSKkcptR+LtRxwj8eM/l8HJAuqRhAsyknBBRhIu5DZGOnnyZKT37t0baV4i2bx5c6SPHj06flxu+xDHSBxv8TyTSv+J3CIjCRdkJOFCbmMk3p7U1dUV6aGhoUgn00aYcmttnLJy9uzZkterKcVcL9QjCRdkJOGCjCRcsHLju+vNzIZQ2JW7AsBwmY/Xi7S2rV7tWhdCWFnuQzU10vhNzXrSWlQirW1La7u+RUObcEFGEi7Uy0hH6nTfSkhr29LaLgB1ipHEm4eGNuFCTY1kZrvM7KKZ9ZlZXcspm9kxMxs0s97EuVTUDs9ibfOaGcnMGgD8EcDPAWwEsKdYr7tefAZgF51LS+3w7NU2DyHU5AfAFgCdCf0pgE9rdf8J2tQGoDehLwJoKh43AbhYz/Yl2vU5gB1pbV8IoaZDWzOAZNX0m8VzaSJ1tcOzUttcwfYEhMKffV0fabm2efJaGtqXpJZG6gfQmtAtxXNpoqLa4bWgmtrm9aCWRuoG0G5mb5vZbADvo1CrO018WzscqGP
t8ApqmwNpq21e46DxXQCXAFwB8Ls6B7AdKLys5yUK8dqHAJaj8DR0GcBxAG/VqW0/QWHYOgfg6+LPu2lp3/f9aGZbuKBgW7ggIwkXZCThgowkXJCRhAsyknBBRhIuyEjChf8BBBgORVqd9YYAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "idx = random.randint(0, len(test_dataset))\n", + "img1, _ = test_dataset[idx]\n", + "x, _ = test_dataset_t[idx]\n", + "data1 = x.as_in_context(ctx).expand_dims(axis=0)\n", + "\n", + "idx = random.randint(0, len(test_dataset))\n", + "img2, _ = test_dataset[idx]\n", + "x, _ = test_dataset_t[idx]\n", + "data2 = x.as_in_context(ctx).expand_dims(axis=0)\n", + "\n", + "plt.figure(figsize=(2,2))\n", + "plt.imshow(img1.squeeze().asnumpy(), cmap='gray')\n", + "plt.show()\n", + "plt.figure(figsize=(2,2))\n", + "plt.imshow(img2.squeeze().asnumpy(), cmap='gray')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get the latent representations of the images by passing them through the network" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "latent1 = encoder(data1)\n", + "latent2 = encoder(data2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the latent vector is made of 32 components" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1, 32, 1, 1)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "latent1.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We interpolate the two latent representations, vectors of 32 values, to get a new intermediate latent representation, pass it through the decoder and plot the resulting decoded image" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABIEAAACBCAYAAABXearSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3WmMXlUdx/GDC/vSZVraaaEtpS1d6AIFoYSqBRUqpCAISCIkShBBUZREjb4QQnjhQmKMJmCihqAoCCIKmoJhL1vL0tZSoHTfpmVaWmhR2XyBHH7nN3MOT6fzzDzz3O/n1X9679zn9p57zr3z5Pz/Z4933nknAAAAAAAAoLl9qLdPAAAAAAAAAPXHl0AAAAAAAAAVwJdAAAAAAAAAFcCXQAAAAAAAABXAl0AAAAAAAAAVwJdAAAAAAAAAFcCXQAAAAAAAABXAl0AAAAAAAAAVwJdAAAAAAAAAFfCRnvywPfbY452e/Lyq+9CH3v+O76233tqjO45JG/asD3/4wzF+8803u6UNQ6Ade9pHPvL+UPvGG2/QF/ugerRhCLRjT6vHmNoX2lDfB/bYY49O4xBCeOed9/8rb7/9dvZ4ul9Pq8e7TQiN2Y467oQQwt577x3jj370o9n93nrrrRj/97//Tba9+eabncb6OyHUv4313nv77bebqi9qe/Tv3z/Z1traGuO99torxnvuuWeyn/a/1157Ldm2Y8eOGG/btq3TOIQQ3njjjV057V1Wjzb8/3F7rR31/zRkyJAYH3fcccl+I0aMiPG///3vGHs/0j6m+/m2tra2GC9ZsiTZb8uWLTEujcvd4Z133unzfVGf8zNnzozxnDlzkv0OOuigGL/66qsx9jbUe8LHRd1369atMX7ggQeS/ebPnx9jvw+6W61tyEwgAAAAAACACuBLIAAAAAAAgAro0XQw9Kx6TxlE/fmURPRNOuUXfRNt+C6dZu3pC7qtlILSm8+mqoypnuY1aNCgGI8ZMybGmtIQQtpuq1atSrZt2rQpxjrt3VNV6n2Nq/Ru069fv+TnCy+8MMZTp06NsaathJD2t/Xr1yfb1q1bF+Nnn302xvfff3+yn7ZxPa55b6YU1ts+++wT44suuijZdu6558ZY01H23XffZD+9Pjt37ky2aTrYypUrY3zdddcl+z388MMxrkdqWDO3YQghnHLKKTG+8sork20tLS0x1mefpqv6z6XrpW184403JtuuvfbaGGvaEjqnz79LLrkkxjNmzEj20/TaUpq0pT0m23Ip1EcffXSy31e/+tUY1zsdrFbMBAIAAAAAAKgAvgQCAAAAAACoAL4EAgAAAAAAqABqAqGySkv+oXFp7rX/7LUoqlL/o1FpH9t///1jrEvkhpAuofvyyy8n2zZs2BDj0jKs6D6eDz9s2LAYn3baaTE+4YQTkv0OPvjgGHs9Ga1NsWjRohgvW7Ys2U/ry1Sp9kt30xpAIYRw1VVXxXjKlCkx1tolIaTLjntdJ61ZsXHjxhj//ve/T/a79957Y7x9+/ZkG8/aD6Zt8LnPfS7Z9sUvfjHGWpPLx0Ndonz48OHJNu1XZ555Zoznzp2b7PeLX/wixlp3xo+BjsaOHRtjr0Oi107rLmkcQtqGXpNO+9Hhhx8e48suuyzZT5eMf+aZZ7Lngffp82/atGkx1mXaQ0hr8/h7qdL+XKono/VpJk+enOw3ceLEGD/++OPZY+Bdes31fdPHMd1Pazf5O5C2r/dF/VnHZK/xpe9RbW1txfPvKcwEAgAAAA
AAqAC+BAIAAAAAAKgA0sEKDjjggBiff/75yTadXuvLaj7yyCMxbpRl4KpEp/HptPevf/3ryX66pKoukxpCCL/+9a9jvHz58hgzfbbnHHjggTG+4oorYnzyyScn++kSq7qEcQgh3HLLLTG+7bbbYtze3t5t54n36fT1EEL45je/GeOzzz47xp6Coj/7NFxND5s/f36Mf/rTnyb7rV69Osb0090zevTo5Odf/epXMdaUr9dffz3ZT6dWDx06NNl25JFHdvpZDz30UPKzpqBom4ZAu34QnYquaUMhpKl7On3dlxvOTY8PIb3++g50zjnnJPtpn503b16yrR7LVDebcePGxfiMM87I7qepk/6uuddee8XY00W0DXTs1RSmEEKYNGlSjHVZ+RBC+M9//pM9r6rS/qJpfJoKHUI6bmoan6f0aXpQKQVFf8+frdqGixcvTrZ5uifepddQ07A8HUzfUfW9xdtRj+ftqPvq+KrjcAhpyt+TTz6ZPQbepe8p+jfC2rVrk/00ZatUIkTHU+832qaaNuZtOHjw4JrOvScxEwgAAAAAAKAC+BIIAAAAAACgAvgSCAAAAAAAoAKoCWQ0d1eXPfUlHjV38Mtf/nKybeHChTG+5pprYvzoo48m+5HHWR8nnnhijH/729/GuKWlJfs7M2fOTH7+whe+EOObbropxj/+8Y+T/Xz5W3Sd1rMIIa0NctJJJ8XYa0poHrXmAYcQwvjx42N8+umnx/jb3/52st+LL74YY5bb3DU6Fn7yk59MtmkdLq2X4HVISrnwuny8Lk/urr322hjr8tUh0Ka10LoFl156abJN6xFoHRJ/hmkbe30obVeNjzvuuGS/p556KsZe40uXKEdHEyZMiPEpp5ySbNP20P7mfUPb0Jc9zm3zWgdjxoyJ8RNPPJFsoyZQ5/TaXnTRRTHu169fsl+u5oTXBPL+p7QN9Hha4ySEdEljPx41gTrSvx+0BpfXTttvv/1irNff21Db12vlaR0S3U+PHUIIAwcO7HS/EKgJlKPPuwEDBsR4x44dyX5aJ0Z539Dr7uOtPkN1P68jpTVq/V5AR9OnT4+xXi+/dlp3S3nf0Lb22oTahrpfrZ/Vm5gJBAAAAAAAUAF8CQQAAAAAAFABlU8H8+laF154YYyPOeaYGPs0QOVTpqdNmxbj22+/PcYrVqxI9ps7d26M77rrrmSbLrGqS9jp1E4/ZlWX0/UpzD/60Y9irClgPtW2RKdg6zLX5513XrLfX/7ylxhr+mAI6ZKqvtSueuWVV7LnWKU0Fk/J+8QnPhFjnb7uKSg+xVnpFHY9/p133pnsd/PNN8f4d7/7XbKtra2t0/Pw/qXTs6vS90JI07U8NVbv39JyxjrGlejxZs+enWzTa/6b3/wm2bZ06dIY61TtKvWvzujz79Of/nSMNZUhhPSa6XX25W51KrQ/F/WzStPe58yZE2N/Zi5YsCDGpBW9S6+lpvHlUhVCSJ9H3gf0eJ4ClJv27mPylClTYnzvvfcm21atWpX97CrTVOZjjz02xu3t7cl++m6ifUzH1xDS9Gp//8gtKe7teNhhh8XY37NK78RVNXny5Bhr+s62bduS/WpNQdE2rLUv+rF1eWxPuacNOzd16tQYeyqf0tQ7Hcu8HUvvqLl0MG+rUnonOho3blyMfWxU+u6p7zbehjrW+ruNtn0pndrflxoBM4EAAAAAAAAqgC+BAAAAAAAAKqDy88u8kv7nP//5GNc6Vdmn0OrPOgV+5MiRyX6XXXZZjC+//PLs8XWKmk8vW7RoUYzPPvvsZJumsTQznYIbQgijRo2K8a6kgClNNdB40KBByX4XX3xxjHVFjxDSdtP7QNO/QgjhhhtuiPHPfvazZFtpKmoz0Cmvs2bNSrbpdddpsp7mUFpBQ2m/bG1tTbZdeeWVMdaV4UJIV7LasGFDjFeuXJnsd+ONN8b46aefTrY1W3qYXmdNsxsxYkSyX60ryOiU3NLU6VL76v1zyCGHJNvmzZsX47vvvjvGa9asSfbT6fHN1mad0et+1llnxdinQusU51rTL306u/Y/3c/TFzSF159pW7ZsifGyZctiXIW2ytE+p1PgPQVFxzxtm61btyb76TYfa/U6l9LB9Dnpz2cdQ1lh6n26Aq1eT08h0PQt7Zfejto+pVQi7X+eYqmpZ77ypr5fVrn/qaOOOirGOlb53xL+d8d7vA1L7zbaVtrWPj7rz424OlEj0jRITSXyNtCUP21jb0f9u81TM7Uddez1vqjPZNJoP9iQIUNirKuKltpQx1p/fmo/8ued/p2pbe1jdyP+PcdMIAAAAAAAgArgSyAAAAAAAIAK4EsgAAAAAACACqAmkOXmag50dyxBq7mbnv/elXx4r7Ogy3Z6/n4z541qXufhhx+ebNP8d8/d3F1eK6NEz1HzRP2eGzt2bIw9n7sRc0i7k9Y3GDBgQHa/WpeerpXn6uoxdAwIIe1jgwcPjvGECROS/VavXh3jxYsXJ9uarfaF3qeTJk2Ksee763hVGgv1PvD21J/1+F7nYvv27TH2+mua4//xj388xnfccUey36233trp8ZqV1k7Smh8+bmrtIG1HX75a+6mPc7m6FV5PRGvGjB8/Ptk2e/bsGN90003Z82jmZ587/vjjY6z9w+t/aO0DHUM3btyY7Kdto/0yhLQvavt6v9ff8zZ84oknYqx1ZarUZiF0HOe0now+n/bff/9kP38+vcfbQMdH7b9Oj+/9Xvvs0KFDk21LliyJ8a68FzUTb0O913WbX39dtl2vuY+F2qal+lzaF70OZqmeop5j1fqf8r6j7wt6Xbwd9d1Q39X9/aa0vLj2dR2zvR2b7R2yu3kbaq087Sv+XNS+qPVafVl5fff3fqTH1NjHxUb8e46ZQAAAAAAAABXAl0AAAAAAAAAVUPl0MJ8a5lP1Go1PF9Upgl1dDr2v69+/f/Jzo0xrzZ1HaTqoT8dulP9LvWiKwsCBA5Nt2jf1mnUl/WtXePvoNNPS9Gk9X++nzdaOmmqQWyY1hHSaul4TT7XS45WWG9ZtpRQUP76eo071Pvroo5P9/vrXv2aP0YyGDRvW6b97H/C0oPd4WmVpKXltYz2+T61Wfi+MGTMmxjrd+9VXX032a+ap837fa+qjTj/366+pDH4Mpdfc7wNtw9LS4np8nW4fQggtLS0x3rx5c4y7O3W70fm11dRMfc74O6qmb2lbeV/R6+ntnet/Xm5Az8O36b1Q1XQw72O58dTHT21DHau8H2kblt5L9Dw8Tbp0/GZ7L+kqv7c19VGfcd6O+l6h19mXiG9tbY2x3zParr5NeXoSUj5O6nNG3+U8nVb74qZNm2K8Zs2aZL9DDz00xt4Xa23DRhwnmQkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABla8JpHUFQkiX/OsLSvmHzZzvq3nPWicihDQ/s1RfpLd4zrbmFde73k0j0Fx2XRZ33LhxyX65PPd6144o9Sn9bO9fmlvc7O2o96zmSntuvebQl+pXdMf10toW3sdyNTBGjx6d7JerVxJCc4ynpXoypdog2
if0OnR1fC0t2eptp3SsP/XUU2PsNRhWrly52+fYqGqtQ+J1C3Jt6PWTtLZg6dqVlj3W3/MlzkeNGhXj5cuXx3jnzp3Zz2pGfl10HFWl2lradl7DTI/vn6Xtr2OCjw+leoXN/oyrhdcX0Voy2h4+nurPet+3t7cn+2mdRF+ePNeGTo+5Y8eO7H5Vps/9ENLrrnVi/FmlfVPbe9myZcl+WocwV18vhLRP+dirz7hmeBfpbrqEewhpLTqtp1Rqw/Xr18d4wYIFyX5Tp07t9Ngh5OuG+pjZiHV7mQkEAAAAAABQAXwJBAAAAAAAUAGVTAfT6VpHHHFEsk2nxDficm5Op5X6tOFmnq6rU2N9aXGdKqnXxJfH7C3eLrqMdhXaUKezH3nkkZ3+ewhpO+p18P1KaUa1yn2Wb1O+n6aSenpEs7VjLtXAp7/mrmtpSela0/28rUtpDfrZus2ngWt62PPPP59sa4YlrP0+1OeHtlWpHZUvEV9KUcilUpaWr/Z+NGDAgBjPnDkzxi+99FKyn07rbsQp2LujtGy7KqUM6Dbvi/qz3we5PuBtqH1T36lCSNOedMz0dmq2ND7nKVq5d89Sqo+mg23bti3ZNmjQoBj7vaDtWHo26X6N8v7USPr375/8rH1Rr2vpvU7bbe3atcl+mjpZei/Rbb7fq6++GmNP/cS79LkSQv5dotSO69ati/HSpUuT/fRZVerPpb6oqXykg3Xk73L6/NDr6qmZatGiRTF++umnk22a7uefpW3qz2fViO8izAQCAAAAAACoAL4EAgAAAAAAqIBKpoMpTcUJofGnvPpUwlqnDTcbnXb7yiuvJNt0+qtPue4ttU65Lq2M04z0/n355ZeTbTr1Vqcx+yoZpemXXeFTbXVaqY4PpVSY7j6nRqP3rLaNrsIQQjrVXa/Jli1buv2cap0irX3R20lXe2m2FL4QOl4jHTu1TX0VGZ0Krf3B21FXqqq1PXw/fY6VVnnTFTqGDx+e7KdjRyNOwd4dnib1+uuvx1ivpaezaxvqNWlra0v209VrvG1yfaKU+ldK28ylz1RBKf1ZeQqetqOuruYpKLpSlR+j1lS7UrqZP/+qyFMdcyla3rb6nHz44Ydj/NBDDyX7TZgwIcZDhgxJtuX6WGmMb/S/b3qLr9il16k0Lmk73nHHHTF+7LHHkv1OOeWUGLe2tibbcivweR/VzyIdrCPvizp2lVZd078Xb7zxxhh7OQBdLcxXcsz9XeDPYH1WN4rqfGsAAAAAAABQYXwJBAAAAAAAUAF8CQQAAAAAAFAB1SpA8n+55YJDaPxcS88jr2qeqOZ1eh0S/dnrx/QWvc88xzhXb6NZad6z1gHyfNmdO3fGWGtY+H3e3XWfvH20DklpmXDNvW/2vqjtobnXvgSttqFeV6//1JXrVapXUqolo/t5zraeVzO2of+fNB9er4vX+9B21Hjz5s3Jfno9a71+PuZpPR9fklfz/vX3dAwNoblrX5SeEaX6Fdpn169fH+MVK1Yk+2ktGa+Zlat15u2k94/Xklm1alWMq/bsK8ktS+1jmfaxxYsXx/jZZ59N9psyZUqMDzvssGSbtpd+ltfV0NpgPmZTE6jj3w96D2u7ef/QNnzggQdivHDhwmQ/rfk0ceLE4mfn6Hhd9T6W4+2Tq9Ppy4vrO+s999wT47Vr1yb7zZs3L8bHHnts9jz0menPz0asJ9NI9t577+RnfRZq+3obbt26NcZLliyJsdcSnDt3bow/+9nPJttytZz87wV/P24EzAQCAAAAAACoAL4EAgAAAAAAqIBKpoMpTyUpLevYCHw6tk7nr5Jap7U2Snvq1F0/d00jqsIUa/3/6//dp0rq9Fe9ft5nc8uydpUfQz9bp5X69FydAuzpKI04luwOneaq18v/37nluT2FsyvT1P2a6rTtUhqLnq+nM7300ku7dU6Nzq9Zrh39/67pC5rCs2nTpuzxurpEvKak+JigbaypKs8991z2fJuNj0+a8lx63uk1WblyZYxfeOGFZL9p06bF2FORcsf3qfj6e97XdZzMLW9fBX5t9Wftf572o2PqU089FWPvA21tbTH2FAi91jo2evq89mftb36OVeXXNcf7rF7LRx99NMbaZiGEsGjRohjPmTMn2aZ9Tt9LPKVP3ymr1sdqVfo7sFTKQVMkV69eHWN//ujy4qX051IKYakUAUIYMGBA8nMuvdbfDbXdSmUnNG3a2yL3fuljdyO2ITOBAAAAAAAAKoAvgQAAAAAAACqgkulgpdQSTWfw6bq9RaeX+XREnV62zz779Ng59bbSFE2d0tebU5ZzK3z4tOrSFMRmp9MoSytL5VYzCSFt49zqNbuidHzl59ve3v6Bv9MstD10WnopHay0Kld3rA7WlbbXdET/uRn7oo+VBx10UKf7eVqq3uuagqwra3SVX2e9t/x89Wc9j40bNxaP2Uw8TeDAAw+s6fe0b65ZsybGulpXCGmKlvcpva56j3gakaakeHqKjq/N3E4fpKvvlzp2PvPMMzH2PvD8889nPyuXNu37dffKm83G+2Kt97M+Z7Qv+juFrv5WGic1Lc3PqdaUtSor3eelFbs2bNgQY20730/7pr8j5f6+87G3Uf4ebVT9+vXr0u9p/yv1X02D9/fX3Ep9/u/dUa6iuzETCAAAAAAAoAL4EggAAAAAAKAC+BIIAAAAAACgAiqZZDho0KAYT58+vRfPZNf5EnNaD2DEiBHJtmXLlvXIOfWGUaNGxVjbM4T0Gmn+redK9yStEeNL5pbqJzQjzXWePHlyjD1/VmtO7LfffjH2duzuPFvPxc7VbPLaTrnltpuR9j/Nxda86RDS3GnNaffr05XaIH6MUk0gz8N/z7Zt22rar1n4NdLlwEvjo9bn0vFL68eEkI5zXW3jUk0g7WNag8HPo5lrzfhSuEOGDOl0v1KNNa2DoMsch9CxTlZOqX6F1thobW1Ntg0bNizGCxcurOmzmpHWUgshv0S815fT9tJ3PH8e6ZLxpfpp2o6+X0tLS4y97hM6tmFuKWq/rmvXro2x9ktva60l43XatNZPaQnsKtUK7aoDDjgg+VmvoV5bb5/ly5dntymtX6ftHUJ6D5Wen9QEKvO6TtqGGns76dLvJfp3gL/n6v2jfd3/pmnENmQmEAAAAAAAQAXwJRAAAAAAAEAFNN7cpDrRKebXXnttjMeOHZvs51MuG42nKwwcODDGxxxzTLLtvvvu65Fz6ik6HfnSSy+N8ciRI5P9dFpgb6bl6GfrFESdVhhCCAcffHCMdfp1CCFs3ry5TmfXeyZOnBjjmTNnxtjTF/T6adv7lMrc8oy7IrdMZwj5McH/XdOiumOp+kbiy8yed955Mdb+Vms6lR9P+0et7Vnq275Nz6uU/pBLk+hs377IU4dmzJgRY00b8NQSTesrpT360qm18GNo+5fasbRfqY37utGjRyc/61R0HUO9L2qbasqXP498qntOaal3Td/1dBRND2u2cXJXePqCXk99xnkJgC1btsRYUzN9P00b82dVLlWiVG7Az7eZ+1itPEUu125+fdra2mJc
SiPSPutpRHq/aPv6WFha/hzv8jIM2idKadLt7e01HV+fi5oa5scvPVurUCpid5SWY9d+6X1Rx9MS7WP+O/q80zb051sjPu+YCQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAFNWxPooIMOSn7+5S9/GeOTTjopxl7DoK8t7ay5iGPGjOnFM+l+WisnhBCuv/76GB911FEx9loypfoxjcDvsaFDh8ZYl88NIYSlS5f2yDnV05QpU5KftR215oDnSmvfrHedp9ISkrmaQJ5bPGjQoBh7HrkvYd0XaI71V77ylWTbZz7zmRhr//O+mFu6VsetrirVbvI20zxtHRO8NpEv+dsMtA2+9rWvJdt0vNH6IqWaTVqbwJ+zXampV2pHfz7nltFuxHG+O+k10jpOIaRtpWOSjznaN7X/eV2TUg0MpfeV1x/S8/A6M9pW3VHPra/S2kghpNddr5/3Kb3W+jt+LWut1ab9yPtbldunFv37909+1ntb283fKUpLged422gtL+3b/ll97W+a3uDPu9wS8d4Xa62Bp/3Zf0fbq1RPBmV+XWttQ39nzSm1Ya7+lz9LS/W/egsjPAAAAAAAQAXwJRAAAAAAAEAF9Ik51DqtS6db+nK3F198cYzPPffcZJsupe5pJ42uNCVXt3kqQyNNA8214YABA5L9Lrjgghh/6UtfSrZp2pROdW/UFA5tm9IS2DoVVdOjGlEuvcfTQs4666wYX3HFFck2TUXQqZg+tVqXR9U2rscUdb0nS1PiS9O4NU2mkdNT9Lw19qWcTzvttBh/4xvfSLblprz6crQ6TVavoy5r7dtq5VNtdQldP15uar4vu6qpMX11iXg/77Fjx8b49NNPT7Zp++g45ClC2k/1PmlpaUn2Ky3bXpomrXRcKaWKKZ/O32zLV+s1nzlzZrJN72H9f3s6mKYE6buTp13rWF7ql3rNdyWtvkppfE6vy/Tp05Nt2ue0D/j9q+2t70/+Xuvvx0rvBW0P7196vl1J9WxGek0OO+ywZJu+p+h19TSQXFt76mSub4eQTxkspZw0w1hYD/369Ut+zrVjqS+WaNuV0vX0+D720nZl/izR51NuvAuha38nez/Npe55G/rvNQJmAgEAAAAAAFQAXwIBAAAAAABUQI/Oxc2lBIUQwsknnxzjyy+/PNmWm57s6QuDBw/OfnZfSwFTtaZK+BT+eqTNlKb7H3nkkTG+6qqrkm2e9tXZ8UIIYfTo0TH2KXY6vb2RUt3e4/d07hz933Xqqd/DPfH/9M/QaeTf+973km2TJ0+OcWm6s06T9vtSp0RqapindGj/1mm3fl/oNNlaq+97Oorehz5lU6da59LhQkhT+Xxs6un7Va/X7Nmzk20nnHBCjDUty1O0pk6dGuNSepC2m0+P1mm4el19+rW2oR9DU0203fz6d2UavKYJh5D2vxUrVtR8/HrR+8bHBn0uanto2mwI6fPUj6Fjaqkf6Uo02gc8hVPvhdLKGNrG3t9qbUe9X4cPH57s99JLL9V0vJ5WGgf0/+b39sc+9rEY6xgcQvr/098rpWjpGO8rUvo4kDtHvQ+8f+ln+TNeV1Espek3Urt1F+2zs2bNSrZpKm0uBdn30/eqjRs3JvvVugJjKc1Ez8PH5UZ8B+sJ+p7nfdGf++/xFYi0DUvpr6UVifyZnKP3QV9Nca4Hvbc9JTb3fPJnVS593K+r/t4rr7ySbNPxN5eq7+eLjvy5pe8fpbQ9/7sjR8fhzZs3J9smTJgQ49KzrxGfadxVAAAAAAAAFcCXQAAAAAAAABXAl0AAAAAAAAAV0KM1gbQewRlnnJFs+9a3vhVjrxehueKaW+k1BzzvthFobmhXc6g1r9CX6dTr4cfPLVu3OzTnedKkScm2n//85zE+4ogjkm2rVq2Kca3Lu5fqDCjPu8z9vz33s7Rsu36WXuPSNfVj5OqhlJY99uPXKw9Y89q9dsBPfvKTGGutrhBC2LRpU4y1HT3XtbTUpd5Deo38XtDf0/P1vPtal6XO1c7wbX7fafvoefhnlZY+rkf9hFL+uC4FfvXVVyfbXnvttU5jP2e9Dr4tV7OiNAbp+Oz52/p7fj/m8u5L17/UhqW20M8ujQk9RWumXHPNNck2fZ62t7fH2O/tKVOmxNj/D/oM1WvkY2VuadNSny3dC/q5vpS51r4o1SbSWGuthJDerzrG9Aa9Dlo3zOn/29+BtD97/8iNw/6OJavRAAAOaUlEQVR+pD+XxsLS+0auDpzfH6VaJvo+p/dZX6if8J5anx++37hx42LsdUh031L9TL1Opb6i19nbUY9ZGjdzy5BXmdYX8edYrg1LS0XrNfbrr89nr0OiY0Spr/j9g45K9dNK9SL92ZWjfcdrd/nfUTm0Y9mWLVuy23J/z4UQws6dO2s6vvax5cuXJ9u0vlupbmgj1nVqvDMCAAAAAABAt+NLIAAAAAAAgArotXSwT33qU8k2neLsU+x0Gly9p8Tljq9LOrrS0pk6DdCnoekURN+mU9h1GrdPW9y+fXuMn3vuuWRbPaZT67T7E088Mdk2YsSI7GfrtFm9xj5dLjdN2ek19tSFXNqGpxHlrrEfo5RmWFo+Uqfy6v/T93vhhRdivGjRomRbvZbw1OusS/aGkC4N7vRe13Pz/5O2SSl9qzTFXNtAUz/8nimllGm75lJaQkj7kbZbCOm0ev2/6O+EEMKTTz4Z461bt2Y/q7vo9fH7d8aMGTH2Ket6LXNL1YaQXstS39HjeZ/Va67juqfo6BRpP49cyogfQ6+5L8OqU371eMuWLUv206XFGyH9Qaf8H3roock2TZPTNCN/hun/17fp/1Fjv7b6s6aeaew/+/TsXDqSP++3bdsW47a2tuzxN2zYEGN/9jVC271H+6kv65xLHerfv3+yn94HpRStXAqyb1u5cmWncQghrFmzJsbej5TeV96G2vaPPPJIsu3ee++Nsfb70vjcl5RSVrW9/Tmj4622sb8D6Dug9g8dh0NI3ys0jTuEdGl5/T1/11mxYkWM/T5plvbaVdqPtK+EkC4VnUv5CiHtL6XrqPv5s2rkyJExzo3jIYSwfv367HlUmY6bCxcuTLadeeaZne7n7zfaPqVrq7+3bt267LZaPwsd+d9OuTG0lPJaovtpnwqhsVOXPwgzgQAAAAAAACqAL4EAAAAAAAAqgC+BAAAAAAAAKqBHawJp7rHnRWqerdff0ToQteb21Zr35/V8NM9df8drYmguttaRCCHNc9ec8DFjxiT76fK/vnSsXis9xx07diT7Pf744zH+4x//mGyrx9K4mo+uNRlCSGukeP0KbUPNhfecea2ZUFoytlQTSP/fnnevnnnmmRh77QO95ppbOmzYsGQ/zeP3uixaF0FrOnitjLlz58bYc5PrlWuq97bfU0uXLo3x8ccfn2zT2l2lejvaX7zv6GdrW5WW4NZz9NzoBQsWxNhrbowdO7bT43ltFK2Z4PV89D7XWgr6uSGEMG/evE7PN4T65+L79df7yNtQ+4vev96PtH39euWWFfbz0J+11ovWWQohHTO9tkVLS0uMc0v
Th5DmaXutBt2m/5d//vOf2f0aIc9bx1QfG4YMGRJjve/1eoVQrjmRWwLanx16DL22XqdC7yGvRaX15PR4XodEn63z589Ptj3xxBOdnoe/TzRSTSC9xv6cyd1j/tzSZ4b/jv5caw1CrRezatWqZD+9z4466qhkm47/Oq57Gz7//PMxvuuuu5Jt2r6N0Me6otZ6OL6fji+rV69Otmn/0P7n10jHBB1Tvc+uXbs2xkuWLEm2HX300TEu1QTSOneld6kq0T6m93kIIZx00kkxLtUE0udfqQ9om/qy1NqfSzWBfGxER35tSzW5VK11evQYXucuV8fNxw5/p0Sq9J1CqU5bV55BOu76Z5W+h2jE5x0zgQAAAAAAACqAL4EAAAAAAAAqoEfTwXT669VXX51s01SK2bNnJ9t02rum6ZSWUfW0EJ0CVprKq1NjdYqgpl2FEMIPf/jDGL/88svJNp0CpukLmkoSQgjTp0+P8axZs5Jteo5Dhw6NsS+D96c//SnGPh2uHkt46jS4P//5z8k2nep8zjnnJNu0DXUpXG2zENJlL32btmEuNSyEdKqtpnz94x//SPa74YYbYqxLSPs5aopDa2trsp+2jS+1rlNFDznkkBj78oK6hK5PM6xXGpFOd/XlY7/zne/EeM6cOck2Ta/S+9lTUPT4nnKpbadTXH0as057f/TRR2OsSwyHkKYb+PWaNm1ajDWVz9MvNa1FU5NCSO8n/T1P69Npvt73SqluXVVKw7rzzjtj7Oltmqo5fPjwGHvqnPZFTQMJIW0r7TueyqXbNH3OxzG9B0ePHp1smzlzZox1XPep2JpW6al62qY6Jus9FkKaDuH3Uj3a8IPoPXb99dcn23Rc0mfhMccck+ynKQqachJCOo1Z285TP3LtqONrCGkKil8vXT5Z7x8f83Q8LD1jSsuhl8bNnm5HPRdPt8mdp48tL774YoxLqaZ6Tbx/6Dig6WCbN29O9tN3nRkzZiTbDj/88BjrmODn9OCDD8bY348acUp8vZTSgLSvhJCOe5rqo3EIaXqy9ln/LB3bnn766WSbjvWatun3jD4Xq9RuJXod/F0u9z7j7zb6PCpdV93mKUs6Juu44veLj6/oyN+R9PmXG19D6DhO5+gx/O807XP6HPN2pP+V+TNIU6/1vdHb0EsT1MLvl9x7o/f73niH/CDMBAIAAAAAAKgAvgQCAAAAAACogB5NB9Npbz7t9Oabb47xH/7wh2SbTqHSNB1d2SaEEA4++OAYH3HEEdltOv118ODByX46Ve/222+PsVd0r3Vqnk7v82nXf//732PsqUpK//+1rnpWLzqVzqfV3XfffTG+//77k225NvSVozSlavz48ck2TcXSVDFt2xDSVWNuvfXWGPtqZqU2zE3L1Gn0/nOtU/26ozr97iqlCuoU8Ouuuy7Zpqk0unqUrwSkqVea9hhC2l66UpOnd+oqZbfddluM29vbk/1K109Tx3y1OVVayUOV2rj0e/Xop6Vj6jRlTQ0LIf0/6DXxFLlJkybF+Pzzz0+2aVqIjgOe4qArDel46mOhXn8fEzQVUM/RV9dZuXJljH0qtV6rrrZhb9DpxL4KZe7/pCv6hJD2gR/84AfJtokTJ3Z6PJ/urKvg6Ipq/lzUscOnyl9wwQUxHjFiRPZ89V3Ax+x6pDj3pFrHep/aru8Kp512WrJN0+z0+D7NXVeI0r7jqdArVqyIsabph5CO6/oO5/1+8eLFMW6kldp6m14zX1lPU+9KaX3aPqX+kFvRL4R0fNSUMn+2ev9GylMdcyuoeRqo/15OaUzOvYt7inOtK1hVmY+B2j76PuLp7r7aYy08PS+3EpnfM3392Vdvfn30HVP/5vD+0JVV1/x3tA31nXpX0tR7CzOBAAAAAAAAKoAvgQAAAAAAACqAL4EAAAAAAAAqoEdrAtWqVPdGc95LS715vnWj6+l6IvXW1TbUfGavgZHTKDU+av2svtSefq6a46qx50prPq4vI63tVWvbdfWa6b3WHbWX+lLbvafWvug1XHSZZ63LE0Jam0frOnlNBL0vas1p92NozRi9X7w9693/ervta31GeD0krc9yySWXJNtOPfXUGGudHv2dENI+rHXz/Jz0s3Wp8RDStjv00ENjrDWGQkjrAPV0La1G4eeoNV2+//3vJ9u0zpPWyivVWsotgRxCWh/joYceSrZp3Zq99torxvfcc0+y36pVq7LHrzJ93/FrNmvWrBhrO3otmEWLFsW41uXF/Ri5Z4DWVQuBmkCd0Wun9ZlCSGvJDBw4MMbeTrXW6Sk977T2iO7nNYG6UvOkarx+mj6D9Lno7zClOpM5WhM1hLS25p577hnj3q7/2tf4e4+OXS0tLTH26+g1KGuh77whpM9CPZ6/y3blfqm3xjsjAAAAAAAAdDu+BAIAAAAAAKiAhkwHA3YF0yT7lu5I80LP8rRNX0q4nrojjQ/v8iVtb7nllhjXurRprX3Wl7h97LHHYqypSp5KWu8xoZSC2qj0mvzrX/9Ktn33u9+Ncf/+/WOsKVkhpG1fa2qhLy1+6623xnjkyJEx1iXsQ+jY9t2tL7ZhCOm1XbJkSbLt+uuvj/H48eNj/OKLLyb7LViwoNPjlfiS5Js2bYqxpqN4CifLi5etX78++VnHuGnTpsXY00I2btwYY72XvT11m6cRafqLLlHtfdbHV3TkKXOaFqlpfZ5ypM/JWttRU75CSN+tNPY+q22MjrxttB/os8rfZXNplaU23HvvvZNtuTb09+RGfJdlJhAAAAAAAEAF8CUQAAAAAABABfAlEAAAAAAAQAX0aE2gUr4dup/nEHcHzZ9sxPzGZqP5+t2piu1Y63L09fgszyHuDlVpQ72WpWdIV9pwV2qL7Lvvvrt8/FroOO21eOpN75vSPdQd1za3LLUvm1rvdtxvv/12+fgfRJeMrXftBr8+69at6zT2e6nWMUKvpR9DlydftmxZjH2J5dI9redfarfStnr1xXq3Y+nazps3L8almllaX6Y0Hmq/2rZtW7Ltb3/7W4yHDBkS44ULFyb7aTuW+nNJqR3r8VzsyTb0a/Dggw/GWGs5eRsuX748xnqN/Xi6zeu5aR2u1tbWGGvNqBDyNU86+7yuqNc7ar2fi3pd/G8lvYZbtmzJnocuJa/XwevO6D3p9aHuvvvuGA8fPjzGzz33XPbc69GOvux5d+jNNpw/f36Mt2/fHmO/Vlo7qNSGenyvlabj6ahRo2Ksz8ue0JW/+ZkJBAAAAAAAUAF8CQQAAAAAAFABe5CWBQAAAAAA0PyYCQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBAAAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAXwJBA
AAAAAAUAF8CQQAAAAAAFABfAkEAAAAAABQAf8DK5G1n+VBYMIAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "num = 10\n", + "plt.figure(figsize=(20, 5))\n", + "\n", + "for i in range(int(num)):\n", + " \n", + " new_latent = latent2*(i+1)/num + latent1*(num-i)/num\n", + " output = decoder(new_latent)\n", + " \n", + " #plot result\n", + " ax = plt.subplot(1, num, i+1)\n", + " ax.imshow((output[0].asnumpy() * 255.).transpose((1,2,0)).squeeze(), cmap='gray')\n", + " _ = ax.axis('off')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the latent space learnt by the autoencoder is fairly smooth, there is no sudden jump from one shape to another" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example/autoencoder/mnist_sae.py b/example/autoencoder/mnist_sae.py deleted file mode 100644 index 886f2a16a..000000000 --- a/example/autoencoder/mnist_sae.py +++ /dev/null @@ -1,100 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=missing-docstring -from __future__ import print_function - -import argparse -import logging - -import mxnet as mx -import numpy as np -import data -from autoencoder import AutoEncoderModel - -parser = argparse.ArgumentParser(description='Train an auto-encoder model for mnist dataset.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--print-every', type=int, default=1000, - help='interval of printing during training.') -parser.add_argument('--batch-size', type=int, default=256, - help='batch size used for training.') -parser.add_argument('--pretrain-num-iter', type=int, default=50000, - help='number of iterations for pretraining.') -parser.add_argument('--finetune-num-iter', type=int, default=100000, - help='number of iterations for fine-tuning.') -parser.add_argument('--visualize', action='store_true', - help='whether to visualize the original image and the reconstructed one.') -parser.add_argument('--num-units', type=str, default="784,500,500,2000,10", - help='number of hidden units for the layers of the encoder.' - 'The decoder layers are created in the reverse order. 
First dimension ' - 'must be 784 (28x28) to match mnist image dimension.') -parser.add_argument('--gpu', action='store_true', - help='whether to start training on GPU.') - -# set to INFO to see less information during training -logging.basicConfig(level=logging.INFO) -opt = parser.parse_args() -logging.info(opt) -print_every = opt.print_every -batch_size = opt.batch_size -pretrain_num_iter = opt.pretrain_num_iter -finetune_num_iter = opt.finetune_num_iter -visualize = opt.visualize -gpu = opt.gpu -layers = [int(i) for i in opt.num_units.split(',')] - - -if __name__ == '__main__': - xpu = mx.gpu() if gpu else mx.cpu() - print("Training on {}".format("GPU" if gpu else "CPU")) - - ae_model = AutoEncoderModel(xpu, layers, pt_dropout=0.2, internal_act='relu', - output_act='relu') - - X, _ = data.get_mnist() - train_X = X[:60000] - val_X = X[60000:] - - ae_model.layerwise_pretrain(train_X, batch_size, pretrain_num_iter, 'sgd', l_rate=0.1, - decay=0.0, lr_scheduler=mx.lr_scheduler.FactorScheduler(20000, 0.1), - print_every=print_every) - ae_model.finetune(train_X, batch_size, finetune_num_iter, 'sgd', l_rate=0.1, decay=0.0, - lr_scheduler=mx.lr_scheduler.FactorScheduler(20000, 0.1), print_every=print_every) - ae_model.save('mnist_pt.arg') - ae_model.load('mnist_pt.arg') - print("Training error:", ae_model.eval(train_X)) - print("Validation error:", ae_model.eval(val_X)) - if visualize: - try: - from matplotlib import pyplot as plt - from model import extract_feature - # sample a random image - original_image = X[np.random.choice(X.shape[0]), :].reshape(1, 784) - data_iter = mx.io.NDArrayIter({'data': original_image}, batch_size=1, shuffle=False, - last_batch_handle='pad') - # reconstruct the image - reconstructed_image = extract_feature(ae_model.decoder, ae_model.args, - ae_model.auxs, data_iter, 1, - ae_model.xpu).values()[0] - print("original image") - plt.imshow(original_image.reshape((28, 28))) - plt.show() - print("reconstructed image") - plt.imshow(reconstructed_image.reshape((28, 28))) - plt.show() - except ImportError: - logging.info("matplotlib is required for visualization") diff --git a/example/autoencoder/model.py b/example/autoencoder/model.py deleted file mode 100644 index 9b6185c9f..000000000 --- a/example/autoencoder/model.py +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
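The deleted mnist_sae.py above trained a symbolic AutoEncoderModel with layer-wise pretraining followed by fine-tuning. Under the Gluon API the same dense MNIST autoencoder can be trained end to end; the sketch below assumes the old --num-units default of 784,500,500,2000,10, and every other name and hyperparameter in it is illustrative rather than part of this change.

import mxnet as mx
from mxnet import autograd, gluon

# Dense autoencoder mirroring the deleted script's default layout; the
# decoder reverses the encoder's layer sizes, as the old help text describes.
units = [784, 500, 500, 2000, 10]

encoder = gluon.nn.HybridSequential()
for n in units[1:]:
    encoder.add(gluon.nn.Dense(n, activation='relu'))

decoder = gluon.nn.HybridSequential()
for n in reversed(units[:-1]):
    decoder.add(gluon.nn.Dense(n, activation='relu'))

net = gluon.nn.HybridSequential()
net.add(encoder, decoder)
net.initialize(mx.init.Xavier())

loss_fn = gluon.loss.L2Loss()
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})

def train_step(batch):
    # batch: (batch_size, 784) NDArray of flattened MNIST images
    with autograd.record():
        loss = loss_fn(net(batch), batch)  # reconstruction loss against the input
    loss.backward()
    trainer.step(batch.shape[0])
    return loss.mean().asscalar()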
- -# pylint: disable=missing-docstring -from __future__ import print_function - -import mxnet as mx -import numpy as np -try: - import cPickle as pickle -except ImportError: - import pickle - - -def extract_feature(sym, args, auxs, data_iter, N, xpu=mx.cpu()): - input_buffs = [mx.nd.empty(shape, ctx=xpu) for k, shape in data_iter.provide_data] - input_names = [k for k, shape in data_iter.provide_data] - args = dict(args, **dict(zip(input_names, input_buffs))) - exe = sym.bind(xpu, args=args, aux_states=auxs) - outputs = [[] for _ in exe.outputs] - output_buffs = None - - data_iter.hard_reset() - for batch in data_iter: - for data, buff in zip(batch.data, input_buffs): - data.copyto(buff) - exe.forward(is_train=False) - if output_buffs is None: - output_buffs = [mx.nd.empty(i.shape, ctx=mx.cpu()) for i in exe.outputs] - else: - for out, buff in zip(outputs, output_buffs): - out.append(buff.asnumpy()) - for out, buff in zip(exe.outputs, output_buffs): - out.copyto(buff) - for out, buff in zip(outputs, output_buffs): - out.append(buff.asnumpy()) - outputs = [np.concatenate(i, axis=0)[:N] for i in outputs] - return dict(zip(sym.list_outputs(), outputs)) - - -class MXModel(object): - def __init__(self, xpu=mx.cpu(), *args, **kwargs): - self.xpu = xpu - self.loss = None - self.args = {} - self.args_grad = {} - self.args_mult = {} - self.auxs = {} - self.setup(*args, **kwargs) - - def save(self, fname): - args_save = {key: v.asnumpy() for key, v in self.args.items()} - with open(fname, 'wb') as fout: - pickle.dump(args_save, fout) - - def load(self, fname): - with open(fname, 'rb') as fin: - args_save = pickle.load(fin) - for key, v in args_save.items(): - if key in self.args: - self.args[key][:] = v - - def setup(self, *args, **kwargs): - raise NotImplementedError("must override this") diff --git a/example/autoencoder/solver.py b/example/autoencoder/solver.py deleted file mode 100644 index 0c990ce74..000000000 --- a/example/autoencoder/solver.py +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
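For comparison, the deleted extract_feature helper above (manually binding an executor, copying each batch into input buffers, and draining output buffers) reduces to a short loop in Gluon. This is a minimal sketch assuming the network is an initialized gluon.Block fed by a DataLoader of (data, label) batches; the function and argument names are illustrative, not part of this change.

import mxnet as mx
import numpy as np

def extract_features(net, data_loader, num, ctx=mx.cpu()):
    # Run the network over every batch and stack the outputs,
    # keeping only the first `num` rows like the old helper's N argument.
    outputs = []
    for data, _ in data_loader:
        outputs.append(net(data.as_in_context(ctx)).asnumpy())
    return np.concatenate(outputs, axis=0)[:num]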
- -# pylint: disable=missing-docstring -from __future__ import print_function - -import logging - -import mxnet as mx -import numpy as np - - -class Monitor(object): - def __init__(self, interval, level=logging.DEBUG, stat=None): - self.interval = interval - self.level = level - if stat is None: - def mean_abs(x): - return np.fabs(x).mean() - self.stat = mean_abs - else: - self.stat = stat - - def forward_end(self, i, internals): - if i % self.interval == 0 and logging.getLogger().isEnabledFor(self.level): - for key in sorted(internals.keys()): - arr = internals[key] - logging.log(self.level, 'Iter:%d param:%s\t\tstat(%s):%s', - i, key, self.stat.__name__, str(self.stat(arr.asnumpy()))) - - def backward_end(self, i, weights, grads, metric=None): - if i % self.interval == 0 and logging.getLogger().isEnabledFor(self.level): - for key in sorted(grads.keys()): - arr = grads[key] - logging.log(self.level, 'Iter:%d param:%s\t\tstat(%s):%s\t\tgrad_stat:%s', - i, key, self.stat.__name__, - str(self.stat(weights[key].asnumpy())), str(self.stat(arr.asnumpy()))) - if i % self.interval == 0 and metric is not None: - logging.log(logging.INFO, 'Iter:%d metric:%f', i, metric.get()[1]) - metric.reset() - - -class Solver(object): - def __init__(self, optimizer, **kwargs): - if isinstance(optimizer, str): - self.optimizer = mx.optimizer.create(optimizer, **kwargs) - else: - self.optimizer = optimizer - self.updater = mx.optimizer.get_updater(self.optimizer) - self.monitor = None - self.metric = None - self.iter_end_callback = None - self.iter_start_callback = None - - def set_metric(self, metric): - self.metric = metric - - def set_monitor(self, monitor): - self.monitor = monitor - - def set_iter_end_callback(self, callback): - self.iter_end_callback = callback - - def set_iter_start_callback(self, callback): - self.iter_start_callback = callback - - def solve(self, xpu, sym, args, args_grad, auxs, - data_iter, begin_iter, end_iter, args_lrmult=None, debug=False): - if args_lrmult is None: - args_lrmult = dict() - input_desc = data_iter.provide_data + data_iter.provide_label - input_names = [k for k, shape in input_desc] - input_buffs = [mx.nd.empty(shape, ctx=xpu) for k, shape in input_desc] - args = dict(args, **dict(zip(input_names, input_buffs))) - - output_names = sym.list_outputs() - if debug: - sym_group = [] - for x in sym.get_internals(): - if x.name not in args: - if x.name not in output_names: - x = mx.symbol.BlockGrad(x, name=x.name) - sym_group.append(x) - sym = mx.symbol.Group(sym_group) - exe = sym.bind(xpu, args=args, args_grad=args_grad, aux_states=auxs) - - assert len(sym.list_arguments()) == len(exe.grad_arrays) - update_dict = { - name: nd for name, nd in zip(sym.list_arguments(), exe.grad_arrays) if nd is not None - } - batch_size = input_buffs[0].shape[0] - self.optimizer.rescale_grad = 1.0/batch_size - self.optimizer.set_lr_mult(args_lrmult) - - output_dict = {} - output_buff = {} - internal_dict = dict(zip(input_names, input_buffs)) - for key, arr in zip(sym.list_outputs(), exe.outputs): - if key in output_names: - output_dict[key] = arr - output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) - else: - internal_dict[key] = arr - - data_iter.reset() - for i in range(begin_iter, end_iter): - if self.iter_start_callback is not None: - if self.iter_start_callback(i): - return - try: - batch = data_iter.next() - except StopIteration: - data_iter.reset() - batch = data_iter.next() - for data, buff in zip(batch.data+batch.label, input_buffs): - data.copyto(buff) - 
exe.forward(is_train=True) - if self.monitor is not None: - self.monitor.forward_end(i, internal_dict) - for key in output_dict: - output_dict[key].copyto(output_buff[key]) - - exe.backward() - for key, arr in update_dict.items(): - self.updater(key, arr, args[key]) - - if self.metric is not None: - self.metric.update([input_buffs[-1]], - [output_buff[output_names[0]]]) - - if self.monitor is not None: - self.monitor.backward_end(i, args, update_dict, self.metric) - - if self.iter_end_callback is not None: - if self.iter_end_callback(i): - return - exe.outputs[0].wait_to_read() diff --git a/example/bayesian-methods/algos.py b/example/bayesian-methods/algos.py index f7b362070..29ba3ec97 100644 --- a/example/bayesian-methods/algos.py +++ b/example/bayesian-methods/algos.py @@ -14,13 +14,13 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create implementation of algorithms of HMC, stepHMC, SGD, SGLD and DistilledSGLD""" from __future__ import print_function +import time +import numpy import mxnet as mx import mxnet.ndarray as nd -import time -import logging -from utils import * +from utils import copy_param, get_executor, sample_test_regression, sample_test_acc def calc_potential(exe, params, label_name, noise_precision, prior_precision): @@ -35,6 +35,7 @@ def calc_potential(exe, params, label_name, noise_precision, prior_precision): def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): + """Calculate gradient""" exe.copy_params_from(params) exe.arg_dict['data'][:] = X if outgrad_f is None: @@ -48,8 +49,8 @@ def calc_grad(exe, exe_grads, params, X, Y, label_name=None, outgrad_f=None): v.wait_to_read() -def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, - eps=1E-6): +def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_precision, L=10, eps=1E-6): + """Generate the implementation of step HMC""" init_params = {k: v.copyto(v.context) for k, v in exe_params.items()} end_params = {k: v.copyto(v.context) for k, v in exe_params.items()} init_momentums = {k: mx.random.normal(0, 1, v.shape) for k, v in init_params.items()} @@ -102,6 +103,7 @@ def step_HMC(exe, exe_params, exe_grads, label_key, noise_precision, prior_preci def HMC(sym, data_inputs, X, Y, X_test, Y_test, sample_num, initializer=None, noise_precision=1 / 9.0, prior_precision=0.1, learning_rate=1E-6, L=10, dev=mx.gpu()): + """Generate the implementation of HMC""" label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, exe_params, exe_grads, _ = get_executor(sym, dev, data_inputs, initializer) exe.arg_dict['data'][:] = X @@ -134,6 +136,7 @@ def SGD(sym, data_inputs, X, Y, X_test, Y_test, total_iter_num, out_grad_f=None, initializer=None, minibatch_size=100, dev=mx.gpu()): + """Generate the implementation of SGD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -173,6 +176,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, initializer=None, minibatch_size=100, thin_interval=100, burn_in_iter_num=1000, task='classification', dev=mx.gpu()): + """Generate the implementation of SGLD""" if out_grad_f is None: label_key = list(set(data_inputs.keys()) - set(['data']))[0] exe, params, params_grad, _ = get_executor(sym, dev, data_inputs, initializer) @@ -200,7 +204,7 @@ def SGLD(sym, X, Y, X_test, Y_test, total_iter_num, if i < 
burn_in_iter_num: continue else: - if 0 == (i - burn_in_iter_num) % thin_interval: + if (i - burn_in_iter_num) % thin_interval == 0: if optimizer.lr_scheduler is not None: lr = optimizer.lr_scheduler(optimizer.num_update) else: @@ -238,6 +242,7 @@ def DistilledSGLD(teacher_sym, student_sym, minibatch_size=100, task='classification', dev=mx.gpu()): + """Generate the implementation of DistilledSGLD""" teacher_exe, teacher_params, teacher_params_grad, _ = \ get_executor(teacher_sym, dev, teacher_data_inputs, teacher_initializer) student_exe, student_params, student_params_grad, _ = \ @@ -323,13 +328,14 @@ def DistilledSGLD(teacher_sym, student_sym, sample_test_acc(teacher_exe, X=X, Y=Y, label_num=10, minibatch_size=minibatch_size) print("Student: Test ACC %d/%d=%f, Train ACC %d/%d=%f" % (test_correct, test_total, - test_acc, train_correct, train_total, train_acc)) + test_acc, train_correct, + train_total, train_acc)) print("Teacher: Test ACC %d/%d=%f, Train ACC %d/%d=%f" \ % (teacher_test_correct, teacher_test_total, teacher_test_acc, teacher_train_correct, teacher_train_total, teacher_train_acc)) else: print("Current Iter Num: %d" % (i + 1), "Time Spent: %f" % (end - start), "MSE:", - sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, + sample_test_regression(exe=student_exe, X=X_test, Y=Y_test, minibatch_size=minibatch_size, save_path='regression_DSGLD.txt')) start = time.time() diff --git a/example/bayesian-methods/bdk_demo.py b/example/bayesian-methods/bdk_demo.py index cd39bfd2a..83a43192b 100644 --- a/example/bayesian-methods/bdk_demo.py +++ b/example/bayesian-methods/bdk_demo.py @@ -14,21 +14,21 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Run Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" from __future__ import print_function -import mxnet as mx -import mxnet.ndarray as nd +import argparse +import time import numpy -import logging import matplotlib.pyplot as plt -from scipy.stats import gaussian_kde -import argparse -from algos import * -from data_loader import * -from utils import * +import mxnet as mx +import mxnet.ndarray as nd +from algos import HMC, SGD, SGLD, DistilledSGLD +from data_loader import load_mnist, load_toy, load_synthetic +from utils import BiasXavier, SGLDScheduler class CrossEntropySoftmax(mx.operator.NumpyOp): + """Calculate CrossEntropy softmax function""" def __init__(self): super(CrossEntropySoftmax, self).__init__(False) @@ -58,6 +58,7 @@ def backward(self, out_grad, in_data, out_data, in_grad): class LogSoftmax(mx.operator.NumpyOp): + """Generate helper functions to evaluate softmax loss function""" def __init__(self): super(LogSoftmax, self).__init__(False) @@ -103,6 +104,7 @@ def regression_student_grad(student_outputs, teacher_pred, teacher_noise_precisi def get_mnist_sym(output_op=None, num_hidden=400): + """Get symbol of mnist""" net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='mnist_fc1', num_hidden=num_hidden) net = mx.symbol.Activation(data=net, name='mnist_relu1', act_type="relu") @@ -117,6 +119,7 @@ def get_mnist_sym(output_op=None, num_hidden=400): def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None): + """Get synthetic gradient value""" if grad is None: grad = nd.empty(theta.shape, theta.context) theta1 = theta.asnumpy()[0] @@ -128,17 +131,16 @@ def synthetic_grad(X, theta, sigma1, sigma2, sigmax, rescale_grad=1.0, grad=None -(X - theta1 - 
theta2) ** 2 / (2 * vx)) grad_npy = numpy.zeros(theta.shape) grad_npy[0] = -rescale_grad * ((numpy.exp(-(X - theta1) ** 2 / (2 * vx)) * (X - theta1) / vx - + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta1 / v1 - grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * ( - X - theta1 - theta2) / vx) / denominator).sum() \ - + theta2 / v2 + + numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta1 / v1 + grad_npy[1] = -rescale_grad * ((numpy.exp(-(X - theta1 - theta2) ** 2 / (2 * vx)) * + (X - theta1 - theta2) / vx) / denominator).sum() + theta2 / v2 grad[:] = grad_npy return grad def get_toy_sym(teacher=True, teacher_noise_precision=None): + """Get toy symbol""" if teacher: net = mx.symbol.Variable('data') net = mx.symbol.FullyConnected(data=net, name='teacher_fc1', num_hidden=100) @@ -160,8 +162,9 @@ def dev(gpu_id=None): return mx.gpu(gpu_id) if gpu_id else mx.cpu() -def run_mnist_SGD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) + +def run_mnist_SGD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -175,8 +178,8 @@ def run_mnist_SGD(training_num=50000, gpu_id=None): lr=5E-6, prior_precision=1.0, minibatch_size=100) -def run_mnist_SGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_SGLD(num_training=50000, gpu_id=None): + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 net = get_mnist_sym() data_shape = (minibatch_size,) + X.shape[1::] @@ -191,10 +194,11 @@ def run_mnist_SGLD(training_num=50000, gpu_id=None): thin_interval=100, burn_in_iter_num=1000) -def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): - X, Y, X_test, Y_test = load_mnist(training_num) +def run_mnist_DistilledSGLD(num_training=50000, gpu_id=None): + """Run DistilledSGLD on mnist dataset""" + X, Y, X_test, Y_test = load_mnist(num_training) minibatch_size = 100 - if training_num >= 10000: + if num_training >= 10000: num_hidden = 800 total_iter_num = 1000000 teacher_learning_rate = 1E-6 @@ -235,6 +239,7 @@ def run_mnist_DistilledSGLD(training_num=50000, gpu_id=None): def run_toy_SGLD(gpu_id=None): + """Run SGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 / 9.0 @@ -243,20 +248,26 @@ def run_toy_SGLD(gpu_id=None): data_inputs = {'data': nd.zeros(data_shape, ctx=dev(gpu_id)), 'teacher_output_label': nd.zeros((minibatch_size, 1), ctx=dev(gpu_id))} initializer = mx.init.Uniform(0.07) - exe, params, _ = \ - SGLD(sym=net, data_inputs=data_inputs, - X=X, Y=Y, X_test=X_test, Y_test=Y_test, total_iter_num=50000, - initializer=initializer, - learning_rate=1E-4, - # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), - prior_precision=0.1, - burn_in_iter_num=1000, - thin_interval=10, - task='regression', - minibatch_size=minibatch_size, dev=dev(gpu_id)) - - -def run_toy_DistilledSGLD(gpu_id=None): + exe, params, _ = SGLD(sym=net, + data_inputs=data_inputs, + X=X, + Y=Y, + X_test=X_test, + Y_test=Y_test, + total_iter_num=50000, + initializer=initializer, + learning_rate=1E-4, + # lr_scheduler=mx.lr_scheduler.FactorScheduler(100000, 0.5), + prior_precision=0.1, + burn_in_iter_num=1000, + thin_interval=10, + task='regression', + minibatch_size=minibatch_size, + dev=dev(gpu_id)) # 
pylint: disable=unbalanced-tuple-unpacking + + +def run_toy_DistilledSGLD(gpu_id=None): + """Run DistilledSGLD on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = 1 teacher_noise_precision = 1.0 @@ -288,6 +299,7 @@ def run_toy_DistilledSGLD(gpu_id=None): def run_toy_HMC(gpu_id=None): + """Run HMC on toy dataset""" X, Y, X_test, Y_test = load_toy() minibatch_size = Y.shape[0] noise_precision = 1 / 9.0 @@ -302,6 +314,7 @@ def run_toy_HMC(gpu_id=None): def run_synthetic_SGLD(): + """Run synthetic SGLD""" theta1 = 0 theta2 = 1 sigma1 = numpy.sqrt(10) @@ -322,14 +335,14 @@ def run_synthetic_SGLD(): grad = nd.empty((2,), mx.cpu()) samples = numpy.zeros((2, total_iter_num)) start = time.time() - for i in xrange(total_iter_num): + for i in range(total_iter_num): if (i + 1) % 100000 == 0: end = time.time() print("Iter:%d, Time spent: %f" % (i + 1, end - start)) start = time.time() ind = numpy.random.randint(0, X.shape[0]) - synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, rescale_grad= - X.shape[0] / float(minibatch_size), grad=grad) + synthetic_grad(X[ind], theta, sigma1, sigma2, sigmax, + rescale_grad=X.shape[0] / float(minibatch_size), grad=grad) updater('theta', grad, theta) samples[:, i] = theta.asnumpy() plt.hist2d(samples[0, :], samples[1, :], (200, 200), cmap=plt.cm.jet) @@ -354,18 +367,18 @@ def run_synthetic_SGLD(): args = parser.parse_args() training_num = args.training if args.dataset == 1: - if 0 == args.algorithm: + if args.algorithm == 0: run_mnist_SGD(training_num, gpu_id=args.gpu) - elif 1 == args.algorithm: + elif args.algorithm == 1: run_mnist_SGLD(training_num, gpu_id=args.gpu) else: run_mnist_DistilledSGLD(training_num, gpu_id=args.gpu) elif args.dataset == 0: - if 1 == args.algorithm: + if args.algorithm == 1: run_toy_SGLD(gpu_id=args.gpu) - elif 2 == args.algorithm: + elif args.algorithm == 2: run_toy_DistilledSGLD(gpu_id=args.gpu) - elif 3 == args.algorithm: + elif args.algorithm == 3: run_toy_HMC(gpu_id=args.gpu) else: run_synthetic_SGLD() diff --git a/example/bayesian-methods/data_loader.py b/example/bayesian-methods/data_loader.py index 92ca0cfb3..a0e71bb8d 100644 --- a/example/bayesian-methods/data_loader.py +++ b/example/bayesian-methods/data_loader.py @@ -14,14 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create helper functions to load mnist dataset and toy dataset""" from __future__ import print_function -import numpy import os import ssl +import numpy def load_mnist(training_num=50000): + """Load mnist dataset""" data_path = os.path.join(os.path.dirname(os.path.realpath('__file__')), 'mnist.npz') if not os.path.isfile(data_path): from six.moves import urllib diff --git a/example/bayesian-methods/utils.py b/example/bayesian-methods/utils.py index a2744373e..b0ea1f37e 100644 --- a/example/bayesian-methods/utils.py +++ b/example/bayesian-methods/utils.py @@ -14,11 +14,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
- +"""Generate helper functions to Stochastic Gradient Langevin Dynamics (SGLD) and Bayesian Dark Knowledge (BDK)""" +import numpy import mxnet as mx import mxnet.ndarray as nd -import numpy -import logging class BiasXavier(mx.initializer.Xavier): @@ -26,7 +25,9 @@ def _init_bias(self, _, arr): scale = numpy.sqrt(self.magnitude / arr.shape[0]) mx.random.uniform(-scale, scale, out=arr) + class SGLDScheduler(mx.lr_scheduler.LRScheduler): + """Create SGLDScheduler class""" def __init__(self, begin_rate, end_rate, total_iter_num, factor): super(SGLDScheduler, self).__init__() if factor >= 1.0: @@ -44,7 +45,9 @@ def __call__(self, num_update): self.count += 1 return self.base_lr + def get_executor(sym, ctx, data_inputs, initializer=None): + """Get executor to Stochastic Gradient Langevin Dynamics and/or Bayesian Dark Knowledge""" data_shapes = {k: v.shape for k, v in data_inputs.items()} arg_names = sym.list_arguments() aux_names = sym.list_auxiliary_states() @@ -62,14 +65,18 @@ def get_executor(sym, ctx, data_inputs, initializer=None): initializer(k, v) return exe, params, params_grad, aux_states + def copy_param(exe, new_param=None): + """Create copy of parameters""" if new_param is None: - new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k,v in exe.arg_dict.items()} + new_param = {k: nd.empty(v.shape, ctx=mx.cpu()) for k, v in exe.arg_dict.items()} for k, v in new_param.items(): exe.arg_dict[k].copyto(v) return new_param + def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size=100): + """Generate sample test to evaluate accuracy""" if label_num is None: pred = numpy.zeros((X.shape[0],)).astype('float32') else: @@ -89,12 +96,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= else: old_param = copy_param(exe) for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): denominator += sample[0] else: denominator += 1.0 for sample in sample_pool: - if type(sample) is list: + if isinstance(sample, list): ratio = sample[0]/denominator param = sample[1] else: @@ -118,11 +125,12 @@ def sample_test_acc(exe, X, Y, sample_pool=None, label_num=None, minibatch_size= def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save_path="regression.txt"): + """Generate a sample test regression""" old_param = copy_param(exe) if sample_pool is not None: pred = numpy.zeros(Y.shape + (len(sample_pool),)) ratio = numpy.zeros((len(sample_pool),)) - if type(sample_pool[0]) is list: + if isinstance(sample_pool[0], list): denominator = sum(sample[0] for sample in sample_pool) for i, sample in enumerate(sample_pool): ratio[i] = sample[0]/float(denominator) @@ -130,7 +138,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save ratio[:] = 1.0/ Y.shape[0] iterator = mx.io.NDArrayIter(data=X, label=Y, batch_size=minibatch_size, shuffle=False) for i, sample in enumerate(sample_pool): - if type(sample) is list: + if isinstance(sample, list): sample_param = sample[1] else: sample_param = sample @@ -146,7 +154,7 @@ def sample_test_regression(exe, X, Y, sample_pool=None, minibatch_size=100, save curr_instance += batch_len mean = pred.mean(axis=2) var = pred.std(axis=2)**2 - #print numpy.concatenate((Y, mean), axis=1) + # print numpy.concatenate((Y, mean), axis=1) mse = numpy.square(Y.reshape((Y.shape[0], )) - mean.reshape((mean.shape[0], ))).mean() numpy.savetxt(save_path, numpy.concatenate((mean, var), axis=1)) else: @@ -157,15 +165,19 @@ def sample_test_regression(exe, X, Y, 
sample_pool=None, minibatch_size=100, save for batch in iterator: exe.arg_dict['data'][:] = batch.data[0] exe.forward(is_train=False) - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] = exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() - mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 0] = \ + exe.outputs[0].asnumpy()[:minibatch_size - batch.pad].flatten() + mean_var[curr_instance:curr_instance + minibatch_size - batch.pad, 1] = \ + numpy.exp(exe.outputs[1].asnumpy())[:minibatch_size - batch.pad].flatten() curr_instance += minibatch_size - batch.pad mse = numpy.square(Y.reshape((Y.shape[0],)) - mean_var[:, 0]).mean() numpy.savetxt(save_path, mean_var) exe.copy_params_from(old_param) return mse + def pred_test(testing_data, exe, param_list=None, save_path=""): + """Generate predictions on the test set""" ret = numpy.zeros((testing_data.shape[0], 2)) if param_list is None: for i in range(testing_data.shape[0]): @@ -177,8 +189,8 @@ def pred_test(testing_data, exe, param_list=None, save_path=""): else: for i in range(testing_data.shape[0]): pred = numpy.zeros((len(param_list),)) - for j in range(len(param_list)): - exe.copy_params_from(param_list[j]) + for (j, param) in enumerate(param_list): + exe.copy_params_from(param) exe.arg_dict['data'][:] = testing_data[i, 0] exe.forward(is_train=False) pred[j] = exe.outputs[0].asnumpy() diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py index 0459c901e..6796fca5c 100644 --- a/example/caffe/caffe_net.py +++ b/example/caffe/caffe_net.py @@ -14,64 +14,80 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
- +"""Generate helper functions to load Caffe into MXNet""" +import argparse import mxnet as mx from data import get_iterator -import argparse import train_model + def get_mlp(): - """ - multi-layer perceptron - """ + """Get multi-layer perceptron""" data = mx.symbol.Variable('data') - fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") + fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") - fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") + fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") - fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") + fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') return mlp + def get_lenet(): - """ - LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick + """LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. "Gradient-based learning applied to document recognition." 
Proceedings of the IEEE (1998) """ data = mx.symbol.Variable('data') # first conv - conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") + conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}") - pool1 = mx.symbol.CaffeOp(data_0=act1, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool1 = mx.symbol.CaffeOp(data_0=act1, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") # second conv - conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, prototxt="layer{type:\"Convolution\" convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") + conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}") - pool2 = mx.symbol.CaffeOp(data_0=act2, prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + pool2 = mx.symbol.CaffeOp(data_0=act2, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") - fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") + fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") # second fullc - fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") + fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, + prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") if use_caffe_loss: label = mx.symbol.Variable('softmax_label') - lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}") + lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") else: lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') return lenet + def get_network_from_json_file(file_name): network = mx.sym.load(file_name) return network + def parse_args(): + """Parse the arguments + """ parser = argparse.ArgumentParser(description='train an image classifier on mnist') parser.add_argument('--network', type=str, default='lenet', help='the cnn to use (mlp | lenet | ') diff --git a/example/caffe/data.py b/example/caffe/data.py index 15276c423..f6bbc0f0d 100644 --- a/example/caffe/data.py +++ b/example/caffe/data.py @@ -14,42 +14,44 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- +"""Create the helper functions to mnist dataset for Caffe operators in MXNet""" import mxnet as mx from mxnet.test_utils import get_mnist_ubyte + def get_iterator(data_shape, use_caffe_data): + """Generate the iterator of mnist dataset""" def get_iterator_impl_mnist(args, kv): """return train and val iterators for mnist""" # download data get_mnist_ubyte() flat = False if len(data_shape) != 1 else True - train = mx.io.MNISTIter( - image = "data/train-images-idx3-ubyte", - label = "data/train-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - shuffle = True, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + train = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + shuffle=True, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) val = mx.io.MNISTIter( - image = "data/t10k-images-idx3-ubyte", - label = "data/t10k-labels-idx1-ubyte", - input_shape = data_shape, - batch_size = args.batch_size, - flat = flat, - num_parts = kv.num_workers, - part_index = kv.rank) + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + input_shape=data_shape, + batch_size=args.batch_size, + flat=flat, + num_parts=kv.num_workers, + part_index=kv.rank) return (train, val) def get_iterator_impl_caffe(args, kv): flat = False if len(data_shape) != 1 else True train = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -67,13 +69,13 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 60000 + flat=flat, + num_examples=60000 # float32 is the default, so left out here in order to illustrate ) val = mx.io.CaffeDataIter( - prototxt = + prototxt= 'layer { \ name: "mnist" \ type: "Data" \ @@ -91,9 +93,9 @@ def get_iterator_impl_caffe(args, kv): backend: LMDB \ } \ }', - flat = flat, - num_examples = 10000, - dtype = "float32" # float32 is the default + flat=flat, + num_examples=10000, + dtype="float32" # float32 is the default ) return train, val diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py index 4290e7106..16b18674f 100644 --- a/example/caffe/train_model.py +++ b/example/caffe/train_model.py @@ -14,12 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -import mxnet as mx -import logging +"""Train module using Caffe operators in MXNet""" import os +import logging +import mxnet as mx + def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): + """Train the model using Caffe operators in MXNet""" # kvstore kv = mx.kvstore.create(args.kv_store) @@ -74,8 +76,8 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): if 'lr_factor' in args and args.lr_factor < 1: model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( - step = max(int(epoch_size * args.lr_factor_epoch), 1), - factor = args.lr_factor) + step=max(int(epoch_size * args.lr_factor_epoch), 1), + factor=args.lr_factor) if 'clip_gradient' in args and args.clip_gradient is not None: model_args['clip_gradient'] = args.clip_gradient @@ -85,12 +87,11 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): args.gpus is None or len(args.gpus.split(',')) is 1): kv = None - mod = mx.mod.Module(network, context=devs) if eval_metrics is None: eval_metrics = ['accuracy'] - ## TopKAccuracy only allows top_k > 1 + # TopKAccuracy only allows top_k > 1 for top_k in [5, 10, 20]: eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=top_k)) @@ -102,8 +103,7 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50)) mod.fit(train_data=train, eval_metric=eval_metrics, eval_data=val, optimizer='sgd', - optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, - num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, - initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), - kvstore=kv, epoch_end_callback=checkpoint, **model_args) - + optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, + num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + kvstore=kv, epoch_end_callback=checkpoint, **model_args) diff --git a/example/capsnet/capsulelayers.py b/example/capsnet/capsulelayers.py index 5ac4fad49..077a4003f 100644 --- a/example/capsnet/capsulelayers.py +++ b/example/capsnet/capsulelayers.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +"""Create layers of capsule net""" import mxnet as mx @@ -41,8 +41,7 @@ def primary_caps(data, dim_vector, n_channels, kernel, strides, name=''): class CapsuleLayer: - """ - The capsule layer with dynamic routing. + """The capsule layer with dynamic routing. [batch_size, input_num_capsule, input_dim_vector] => [batch_size, num_capsule, dim_vector] """ @@ -98,7 +97,8 @@ def __call__(self, data): mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='broadcast_mul_' + str(i)), axis=1, keepdims=True, name='sum_' + str(i)), name='output_' + str(i), squash_axis=4) - bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, name='bias_broadcast_mul' + str(i)), + bias_ = bias_ + mx.sym.sum(mx.sym.broadcast_mul(c, inputs_hat_stopped, + name='bias_broadcast_mul' + str(i)), axis=4, keepdims=True, name='bias_' + str(i)) diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py index 67108757b..05df9cdc5 100644 --- a/example/capsnet/capsulenet.py +++ b/example/capsnet/capsulenet.py @@ -14,24 +14,27 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -import mxnet as mx -import numpy as np +"""Generate MXNet implementation of CapsNet""" import os import re import gzip import struct +import numpy as np import scipy.ndimage as ndi +import mxnet as mx from capsulelayers import primary_caps, CapsuleLayer from mxboard import SummaryWriter + def margin_loss(y_true, y_pred): loss = y_true * mx.sym.square(mx.sym.maximum(0., 0.9 - y_pred)) +\ 0.5 * (1 - y_true) * mx.sym.square(mx.sym.maximum(0., y_pred - 0.1)) return mx.sym.mean(data=mx.sym.sum(loss, 1)) -def capsnet(batch_size, n_class, num_routing,recon_loss_weight): +def capsnet(batch_size, n_class, num_routing, recon_loss_weight): + """Create CapsNet""" # data.shape = [batch_size, 1, 28, 28] data = mx.sym.Variable('data') @@ -107,7 +110,8 @@ def read_data(label_url, image_url): label = np.fromstring(flbl.read(), dtype=np.int8) with gzip.open(download_data(image_url), 'rb') as fimg: magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) - image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols) + image = np.fromstring(fimg.read(), dtype=np.uint8) + image = np.reshape(image, (len(label), rows, cols)) # assign back: np.reshape does not modify in place return label, image @@ -116,10 +120,11 @@ def to4d(img): class LossMetric(mx.metric.EvalMetric): - def __init__(self, batch_size, num_gpu): + """Evaluate the loss function""" + def __init__(self, batch_size, num_gpus): super(LossMetric, self).__init__('LossMetric') self.batch_size = batch_size - self.num_gpu = num_gpu + self.num_gpu = num_gpus self.sum_metric = 0 self.num_inst = 0 self.loss = 0.0 @@ -130,6 +135,7 @@ def __init__(self, batch_size, num_gpu): self.n_batch = 0 def update(self, labels, preds): + """Update the accuracy and loss metrics""" batch_sum_metric = 0 batch_num_inst = 0 for label, pred_outcaps in zip(labels[0], preds[0]): @@ -146,7 +152,7 @@ def update(self, labels, preds): self.batch_sum_metric = batch_sum_metric self.batch_num_inst = batch_num_inst self.batch_loss = batch_loss - self.n_batch += 1 + self.n_batch += 1 def get_name_value(self): acc = float(self.sum_metric)/float(self.num_inst) @@ -184,6 +190,7 @@ def __call__(self, num_update): def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay): + """Run training for CapsNet""" summary_writer = SummaryWriter(args.tblog_dir) lr_scheduler = SimpleLRScheduler(learning_rate) optimizer_params = {'lr_scheduler': lr_scheduler} @@ -218,7 +225,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca summary_writer.add_scalar('val_loss', val_loss, n_epoch) summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch) - print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, train_recon_err)) + print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, + train_recon_err)) print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err)) print('SAVE CHECKPOINT') @@ -227,10 +235,8 @@ def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, deca lr_scheduler.learning_rate = learning_rate * (decay ** n_epoch) -def apply_transform(x, - transform_matrix, - fill_mode='nearest', - cval=0.): +def apply_transform(x, transform_matrix, fill_mode='nearest', cval=0.): + """Apply transform on nd.array""" x = np.rollaxis(x, 0, 0) final_affine_matrix = transform_matrix[:2, :2] final_offset = transform_matrix[:2, 2] @@ 
-255,30 +261,45 @@ def random_shift(x, width_shift_fraction, height_shift_fraction): x = apply_transform(x, shift_matrix, 'nearest') return x + def _shuffle(data, idx): """Shuffle the data.""" shuffle_data = [] - for k, v in data: - shuffle_data.append((k, mx.ndarray.array(v.asnumpy()[idx], v.context))) + for idx_k, idx_v in data: + shuffle_data.append((idx_k, mx.ndarray.array(idx_v.asnumpy()[idx], idx_v.context))) return shuffle_data + class MNISTCustomIter(mx.io.NDArrayIter): - + """Create custom iterator of mnist dataset""" + def __init__(self, data, label, batch_size, shuffle=False): + self.is_train = False + # delegate setup to NDArrayIter so idx, num_data, cursor and last_batch_handle exist for reset() + super(MNISTCustomIter, self).__init__(data=data, label=label, batch_size=batch_size, shuffle=shuffle) + def reset(self): + """Reset the iterator, reshuffling the data when in training mode""" # shuffle data if self.is_train: np.random.shuffle(self.idx) self.data = _shuffle(self.data, self.idx) self.label = _shuffle(self.label, self.idx) + if self.last_batch_handle == 'roll_over' and self.cursor > self.num_data: - self.cursor = -self.batch_size + (self.cursor%self.num_data)%self.batch_size + self.cursor = -self.batch_size + (self.cursor % self.num_data) % self.batch_size else: self.cursor = -self.batch_size + def set_is_train(self, is_train): + """Set training flag""" self.is_train = is_train + def next(self): + """Return the next batch of the iterator""" if self.iter_next(): if self.is_train: data_raw_list = self.getdata() @@ -288,8 +309,7 @@ def next(self): return mx.io.DataBatch(data=[mx.nd.array(data_shifted)], label=self.getlabel(), pad=self.getpad(), index=None) else: - return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), \ - pad=self.getpad(), index=None) + return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), pad=self.getpad(), index=None) else: raise StopIteration @@ -298,10 +318,9 @@ def next(self): if __name__ == "__main__": # Read mnist data set path = 'http://yann.lecun.com/exdb/mnist/' - (train_lbl, train_img) = read_data( - path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz') - (val_lbl, val_img) = read_data( - path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz') + (train_lbl, train_img) = read_data(path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz') + (val_lbl, val_img) = read_data(path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz') + # set batch size import argparse parser = argparse.ArgumentParser() @@ -331,10 +350,13 @@ def next(self): # generate train_iter, val_iter train_iter = MNISTCustomIter(data=to4d(train_img), label=train_lbl, batch_size=int(args.batch_size), shuffle=True) train_iter.set_is_train(True) - val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size),) + val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size), shuffle=False) val_iter.set_is_train(False) # define capsnet - final_net = capsnet(batch_size=int(args.batch_size/num_gpu), n_class=10, num_routing=args.num_routing, recon_loss_weight=args.recon_loss_weight) + final_net = capsnet(batch_size=int(args.batch_size/num_gpu), + n_class=10, + num_routing=args.num_routing, + recon_loss_weight=args.recon_loss_weight) # set metric loss_metric = LossMetric(args.batch_size/num_gpu, 1) @@ -343,5 +365,6 @@ def next(self): module.bind(data_shapes=train_iter.provide_data, label_shapes=val_iter.provide_label, for_training=True) + do_training(num_epoch=args.num_epoch, optimizer='adam', kvstore='device', learning_rate=args.lr, model_prefix=args.model_prefix, 
decay=args.decay) diff --git a/example/cnn_chinese_text_classification/data_helpers.py b/example/cnn_chinese_text_classification/data_helpers.py index b3a13deec..49bb3d5dc 100644 --- a/example/cnn_chinese_text_classification/data_helpers.py +++ b/example/cnn_chinese_text_classification/data_helpers.py @@ -14,6 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +"""Helper functions for implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + import codecs import itertools import os @@ -27,8 +30,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from /~https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,27 +42,28 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def get_chinese_text(): + """Download the chinese_text dataset and unzip it""" if not os.path.isdir("data/"): os.system("mkdir data/") if (not os.path.exists('data/pos.txt')) or \ (not os.path.exists('data/neg')): - os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip -P data/") + os.system("wget -q https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/chinese_text.zip " + "-P data/") os.chdir("./data") os.system("unzip -u chinese_text.zip") os.chdir("..") def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # download dataset @@ -86,14 +89,14 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word="</s>"): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, sentence in enumerate(sentences): num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -101,8 +104,7 @@ def pad_sentences(sentences, padding_word="</s>"): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -115,45 +117,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. 
- """ + """Maps sentences and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): - """Map sentences and labels to vectors based on a pretrained word2vec""" +def build_input_data_with_word2vec(sentences, labels, word2vec_list): + """Map sentences and labels to vectors based on a pre-trained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -165,9 +163,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data) / batch_size) + 1 @@ -182,18 +178,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/cnn_chinese_text_classification/text_cnn.py b/example/cnn_chinese_text_classification/text_cnn.py index 4598a52e6..ce7068136 100644 --- a/example/cnn_chinese_text_classification/text_cnn.py +++ b/example/cnn_chinese_text_classification/text_cnn.py @@ -20,12 +20,14 @@ # -*- coding: utf-8 -*- -import sys, os -import mxnet as mx -import numpy as np -import argparse +"""Implementing CNN + Highway Network for Chinese Text Classification in MXNet""" + +import os +import sys import logging -import time +import argparse +import numpy as np +import mxnet as mx from mxnet import random from mxnet.initializer import Xavier, Initializer @@ -63,12 +65,28 @@ def save_model(): + """Save cnn model + + Returns + ---------- + callback: A callback function that can be passed as epoch_end_callback to fit + """ if not os.path.exists("checkpoint"): os.mkdir("checkpoint") return mx.callback.do_checkpoint("checkpoint/checkpoint", args.save_period) def highway(data): + """Construct highway net + + Parameters + ---------- + data: + + Returns + ---------- + Highway Networks + """ _data = data 
high_weight = mx.sym.Variable('high_weight') high_bias = mx.sym.Variable('high_bias') @@ -85,20 +103,41 @@ def highway(data): def data_iter(batch_size, num_embed, pre_trained_word2vec=False): + """Construct data iter + + Parameters + ---------- + batch_size: int + num_embed: int + pre_trained_word2vec: boolean + whether to use pre-trained word2vec embeddings or not + Returns + ---------- + train_set: DataIter + Train DataIter + valid: DataIter + Valid DataIter + sentences_size: int + maximum sentence length + embedded_size: int + word embedding dimension + vocabulary_size: int + vocabulary size + """ logger.info('Loading data...') if pre_trained_word2vec: word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec') x, y = data_helpers.load_data_with_word2vec(word2vec) - # reshpae for convolution input + # reshape for convolution input x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2])) - embed_size = x.shape[-1] - sentence_size = x.shape[2] - vocab_size = -1 + embedded_size = x.shape[-1] + sentences_size = x.shape[2] + vocabulary_size = -1 else: x, y, vocab, vocab_inv = data_helpers.load_data() - embed_size = num_embed - sentence_size = x.shape[1] - vocab_size = len(vocab) + embedded_size = num_embed + sentences_size = x.shape[1] + vocabulary_size = len(vocab) # randomly shuffle data np.random.seed(10) @@ -109,30 +148,55 @@ def data_iter(batch_size, num_embed, pre_trained_word2vec=False): # split train/valid set x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:] y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:] - logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev))) + logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev)) logger.info('train shape: %(shape)s', {'shape': x_train.shape}) logger.info('valid shape: %(shape)s', {'shape': x_dev.shape}) - logger.info('sentence max words: %(shape)s', {'shape': sentence_size}) - logger.info('embedding size: %(msg)s', {'msg': embed_size}) - logger.info('vocab size: %(msg)s', {'msg': vocab_size}) + logger.info('sentence max words: %(shape)s', {'shape': sentences_size}) + logger.info('embedding size: %(msg)s', {'msg': embedded_size}) + logger.info('vocab size: %(msg)s', {'msg': vocabulary_size}) - train = mx.io.NDArrayIter( + train_set = mx.io.NDArrayIter( x_train, y_train, batch_size, shuffle=True) valid = mx.io.NDArrayIter( x_dev, y_dev, batch_size) - return (train, valid, sentence_size, embed_size, vocab_size) + return train_set, valid, sentences_size, embedded_size, vocabulary_size -def sym_gen(batch_size, sentence_size, num_embed, vocab_size, - num_label=2, filter_list=[3, 4, 5], num_filter=100, +def sym_gen(batch_size, sentences_size, num_embed, vocabulary_size, + num_label=2, filter_list=None, num_filter=100, dropout=0.0, pre_trained_word2vec=False): + """Generate network symbol + + Parameters + ---------- + batch_size: int + sentences_size: int + num_embed: int + vocabulary_size: int + num_label: int + filter_list: list + num_filter: int + dropout: float + pre_trained_word2vec: boolean + whether to use pre-trained word2vec embeddings or not + Returns + ---------- + sm: symbol + data: list of str + data names + softmax_label: list of str + label names + """ + if not filter_list: + filter_list = [3, 4, 5] # restore the previous default; None replaces a mutable default argument input_x = mx.sym.Variable('data') input_y = mx.sym.Variable('softmax_label') # embedding layer if not pre_trained_word2vec: - embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed') - conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed)) + embed_layer = mx.sym.Embedding(data=input_x, +
input_dim=vocabulary_size, + output_dim=num_embed, + name='vocab_embed') + conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentences_size, num_embed)) else: conv_input = input_x @@ -141,7 +205,7 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size, for i, filter_size in enumerate(filter_list): convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter) relui = mx.sym.Activation(data=convi, act_type='relu') - pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1)) + pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentences_size - filter_size + 1, 1), stride=(1, 1)) pooled_outputs.append(pooli) # combine all pooled outputs @@ -170,10 +234,27 @@ def sym_gen(batch_size, sentence_size, num_embed, vocab_size, return sm, ('data',), ('softmax_label',) -def train(symbol, train_iter, valid_iter, data_names, label_names): - devs = mx.cpu() if args.gpus is None or args.gpus is '' else [ - mx.gpu(int(i)) for i in args.gpus.split(',')] - module = mx.mod.Module(symbol, data_names=data_names, label_names=label_names, context=devs) +def train(symbol_data, train_iterator, valid_iterator, data_column_names, target_names): + """Train cnn model + + Parameters + ---------- + symbol_data: symbol + train_iterator: DataIter + Train DataIter + valid_iterator: DataIter + Valid DataIter + data_column_names: list of str + Defaults to ('data') for a typical model used in image classification + target_names: list of str + Defaults to ('softmax_label') for a typical model used in image classification + """ + devs = mx.cpu() # default setting + if args.gpus is not None and args.gpus != '': + # build one context per GPU id listed in --gpus, e.g. "0,1" + devs = [mx.gpu(int(i)) for i in args.gpus.split(',')] + module = mx.mod.Module(symbol_data, data_names=data_column_names, label_names=target_names, context=devs) init_params = { 'vocab_embed_weight': {'uniform': 0.1}, @@ -185,7 +266,7 @@ def train(symbol, train_iter, valid_iter, data_names, label_names): 'cls_weight': {'uniform': 0.1}, 'cls_bias': {'costant': 0}, } # custom init_params - module.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + module.bind(data_shapes=train_iterator.provide_data, label_shapes=train_iterator.provide_label) module.init_params(CustomInit(init_params)) lr_sch = mx.lr_scheduler.FactorScheduler(step=25000, factor=0.999) module.init_optimizer( @@ -195,8 +276,8 @@ def norm_stat(d): return mx.nd.norm(d) / np.sqrt(d.size) mon = mx.mon.Monitor(25000, norm_stat) - module.fit(train_data=train_iter, - eval_data=valid_iter, + module.fit(train_data=train_iterator, + eval_data=valid_iterator, eval_metric='acc', kvstore=args.kv_store, monitor=mon, @@ -207,8 +288,7 @@ def norm_stat(d): @mx.init.register class CustomInit(Initializer): - """ - https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register + """https://mxnet.incubator.apache.org/api/python/optimization.html#mxnet.initializer.register Create and register a custom initializer that Initialize the weight and bias with custom requirements diff --git a/example/cnn_text_classification/data_helpers.py b/example/cnn_text_classification/data_helpers.py index b6fe1e691..093da7bf3 100644 --- a/example/cnn_text_classification/data_helpers.py +++ b/example/cnn_text_classification/data_helpers.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. 
+"""Help functions to support for implementing CNN + Highway Network for Text Classification in MXNet""" + import itertools import os import re @@ -27,8 +29,7 @@ def clean_str(string): - """ - Tokenization/string cleaning for all datasets except for SST. + """Tokenization/string cleaning for all datasets except for SST. Original taken from /~https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) @@ -40,16 +41,15 @@ def clean_str(string): string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\(", r" \( ", string) + string = re.sub(r"\)", r" \) ", string) + string = re.sub(r"\?", r" \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def load_data_and_labels(): - """ - Loads MR polarity data from files, splits the data into words and generates labels. + """Loads MR polarity data from files, splits the data into words and generates labels. Returns split sentences and labels. """ # Load data from files @@ -75,14 +75,12 @@ def load_data_and_labels(): def pad_sentences(sentences, padding_word=""): - """ - Pads all sentences to the same length. The length is defined by the longest sentence. + """Pads all sentences to the same length. The length is defined by the longest sentence. Returns padded sentences. """ sequence_length = max(len(x) for x in sentences) padded_sentences = [] - for i in range(len(sentences)): - sentence = sentences[i] + for i, sentence in enumerate(sentences): num_padding = sequence_length - len(sentence) new_sentence = sentence + [padding_word] * num_padding padded_sentences.append(new_sentence) @@ -90,8 +88,7 @@ def pad_sentences(sentences, padding_word=""): def build_vocab(sentences): - """ - Builds a vocabulary mapping from word to index based on the sentences. + """Builds a vocabulary mapping from word to index based on the sentences. Returns vocabulary mapping and inverse vocabulary mapping. """ # Build vocabulary @@ -104,44 +101,41 @@ def build_vocab(sentences): def build_input_data(sentences, labels, vocabulary): - """ - Maps sentencs and labels to vectors based on a vocabulary. - """ + """Maps sentencs and labels to vectors based on a vocabulary.""" x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences]) y = np.array(labels) return [x, y] -def build_input_data_with_word2vec(sentences, labels, word2vec): + +def build_input_data_with_word2vec(sentences, labels, word2vec_list): """Map sentences and labels to vectors based on a pretrained word2vec""" x_vec = [] for sent in sentences: vec = [] for word in sent: - if word in word2vec: - vec.append(word2vec[word]) + if word in word2vec_list: + vec.append(word2vec_list[word]) else: - vec.append(word2vec['']) + vec.append(word2vec_list['']) x_vec.append(vec) x_vec = np.array(x_vec) y_vec = np.array(labels) return [x_vec, y_vec] -def load_data_with_word2vec(word2vec): - """ - Loads and preprocessed data for the MR dataset. +def load_data_with_word2vec(word2vec_list): + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. 
""" # Load and preprocess data sentences, labels = load_data_and_labels() sentences_padded = pad_sentences(sentences) # vocabulary, vocabulary_inv = build_vocab(sentences_padded) - return build_input_data_with_word2vec(sentences_padded, labels, word2vec) + return build_input_data_with_word2vec(sentences_padded, labels, word2vec_list) def load_data(): - """ - Loads and preprocessed data for the MR dataset. + """Loads and preprocessed data for the MR dataset. Returns input vectors, labels, vocabulary, and inverse vocabulary. """ # Load and preprocess data @@ -153,9 +147,7 @@ def load_data(): def batch_iter(data, batch_size, num_epochs): - """ - Generates a batch iterator for a dataset. - """ + """Generates a batch iterator for a dataset.""" data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data)/batch_size) + 1 @@ -170,18 +162,19 @@ def batch_iter(data, batch_size, num_epochs): def load_pretrained_word2vec(infile): + """Load the pre-trained word2vec from file.""" if isinstance(infile, str): infile = open(infile) - word2vec = {} + word2vec_list = {} for idx, line in enumerate(infile): if idx == 0: vocab_size, dim = line.strip().split() else: tks = line.strip().split() - word2vec[tks[0]] = map(float, tks[1:]) + word2vec_list[tks[0]] = map(float, tks[1:]) - return word2vec + return word2vec_list def load_google_word2vec(path): diff --git a/example/deep-embedded-clustering/model.py b/example/deep-embedded-clustering/model.py index 9b6185c9f..b388c5513 100644 --- a/example/deep-embedded-clustering/model.py +++ b/example/deep-embedded-clustering/model.py @@ -18,8 +18,9 @@ # pylint: disable=missing-docstring from __future__ import print_function -import mxnet as mx import numpy as np +import mxnet as mx + try: import cPickle as pickle except ImportError: @@ -53,7 +54,7 @@ def extract_feature(sym, args, auxs, data_iter, N, xpu=mx.cpu()): class MXModel(object): - def __init__(self, xpu=mx.cpu(), *args, **kwargs): + def __init__(self, *args, xpu=mx.cpu(), **kwargs): self.xpu = xpu self.loss = None self.args = {} diff --git a/example/deep-embedded-clustering/solver.py b/example/deep-embedded-clustering/solver.py index 567c78eeb..79fe5c69a 100644 --- a/example/deep-embedded-clustering/solver.py +++ b/example/deep-embedded-clustering/solver.py @@ -19,9 +19,8 @@ from __future__ import print_function import logging - -import mxnet as mx import numpy as np +import mxnet as mx class Monitor(object): @@ -148,4 +147,4 @@ def solve(self, xpu, sym, args, args_grad, auxs, if self.iter_end_callback is not None: if self.iter_end_callback(i): return - exe.outputs[0].wait_to_read() \ No newline at end of file + exe.outputs[0].wait_to_read() diff --git a/example/gluon/sn_gan/model.py b/example/gluon/sn_gan/model.py index 6040adb4e..cfd7f93e8 100644 --- a/example/gluon/sn_gan/model.py +++ b/example/gluon/sn_gan/model.py @@ -21,7 +21,7 @@ import mxnet as mx from mxnet import nd -from mxnet import gluon +from mxnet import gluon, autograd from mxnet.gluon import Block @@ -68,7 +68,8 @@ def _spectral_norm(self): if sigma == 0.: sigma = EPSILON - self.params.setattr('u', _u) + with autograd.pause(): + self.u.set_data(_u) return w / sigma diff --git a/example/gluon/sn_gan/train.py b/example/gluon/sn_gan/train.py index 5faf3a2a0..46e44791c 100644 --- a/example/gluon/sn_gan/train.py +++ b/example/gluon/sn_gan/train.py @@ -50,7 +50,7 @@ help='use gpu for training.') parser.add_argument('--clip_gr', type=float, default=10.0, help='Clip the gradient by projecting onto the box. 
default is 10.0.') -parser.add_argument('--z-dim', type=int, default=10, +parser.add_argument('--z-dim', type=int, default=100, help='dimension of the latent z vector. default is 100.') opt = parser.parse_args() diff --git a/example/gluon/word_language_model/README.md b/example/gluon/word_language_model/README.md index 43d173b86..4a77950d0 100644 --- a/example/gluon/word_language_model/README.md +++ b/example/gluon/word_language_model/README.md @@ -28,7 +28,9 @@ python train.py --cuda --tied --nhid 650 --emsize 650 --epochs 40 --dropout 0.5 ``` python train.py --cuda --tied --nhid 1500 --emsize 1500 --epochs 60 --dropout 0.65 # Test ppl of 88.42 ``` - +``` +python train.py --export-model # hybridize and export model graph. See below for visualization options. +```
@@ -38,7 +40,8 @@ usage: train.py [-h] [--model MODEL] [--emsize EMSIZE] [--nhid NHID] [--nlayers NLAYERS] [--lr LR] [--clip CLIP] [--epochs EPOCHS] [--batch_size N] [--bptt BPTT] [--dropout DROPOUT] [--tied] [--cuda] [--log-interval N] [--save SAVE] [--gctype GCTYPE] - [--gcthreshold GCTHRESHOLD] + [--gcthreshold GCTHRESHOLD] [--hybridize] [--static-alloc] + [--static-shape] [--export-model] MXNet Autograd RNN/LSTM Language Model on Wikitext-2. @@ -62,4 +65,23 @@ optional arguments: `none` for now. --gcthreshold GCTHRESHOLD threshold for 2bit gradient compression + --hybridize whether to hybridize in mxnet>=1.3 (default=False) + --static-alloc whether to use static-alloc hybridize in mxnet>=1.3 + (default=False) + --static-shape whether to use static-shape hybridize in mxnet>=1.3 + (default=False) + --export-model export a symbol graph and exit (default=False) +``` + +You may visualize the graph with `mxnet.viz.plot_network` without any additional dependencies. Alternatively, if [mxboard](/~https://github.com/awslabs/mxboard) is installed, use the following approach for interactive visualization. +```python +#!python +import mxnet, mxboard +with mxboard.SummaryWriter(logdir='./model-graph') as sw: + sw.add_graph(mxnet.sym.load('./model-symbol.json')) +``` +```bash +#!/bin/bash +tensorboard --logdir=./model-graph/ ``` +![model graph](./model-graph.png?raw=true "rnn model graph") diff --git a/example/gluon/word_language_model/model-graph.png b/example/gluon/word_language_model/model-graph.png new file mode 100644 index 000000000..c621518c5 Binary files /dev/null and b/example/gluon/word_language_model/model-graph.png differ diff --git a/example/gluon/word_language_model/model.py b/example/gluon/word_language_model/model.py index a810c416d..ec6e700a8 100644 --- a/example/gluon/word_language_model/model.py +++ b/example/gluon/word_language_model/model.py @@ -19,7 +19,7 @@ from mxnet import gluon from mxnet.gluon import nn, rnn -class RNNModel(gluon.Block): +class RNNModel(gluon.HybridBlock): """A model with an encoder, recurrent layer, and a decoder.""" def __init__(self, mode, vocab_size, num_embed, num_hidden, @@ -53,7 +53,7 @@ def __init__(self, mode, vocab_size, num_embed, num_hidden, self.num_hidden = num_hidden - def forward(self, inputs, hidden): + def hybrid_forward(self, F, inputs, hidden): emb = self.drop(self.encoder(inputs)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) diff --git a/example/gluon/word_language_model/train.py b/example/gluon/word_language_model/train.py index 7f0a916b7..d08c07ec9 100644 --- a/example/gluon/word_language_model/train.py +++ b/example/gluon/word_language_model/train.py @@ -58,6 +58,14 @@ takes `2bit` or `none` for now.') parser.add_argument('--gcthreshold', type=float, default=0.5, help='threshold for 2bit gradient compression') +parser.add_argument('--hybridize', action='store_true', + help='whether to hybridize in mxnet>=1.3 (default=False)') +parser.add_argument('--static-alloc', action='store_true', + help='whether to use static-alloc hybridize in mxnet>=1.3 (default=False)') +parser.add_argument('--static-shape', action='store_true', + help='whether to use static-shape hybridize in mxnet>=1.3 (default=False)') +parser.add_argument('--export-model', action='store_true', + help='export a symbol graph and exit (default=False)') args = parser.parse_args() print(args) @@ -72,6 +80,15 @@ else: context = mx.cpu(0) +if args.export_model: + args.hybridize = True + +# optional parameters only for mxnet >= 1.3 +hybridize_optional = 
dict(filter(lambda kv:kv[1], + {'static_alloc':args.static_alloc, 'static_shape':args.static_shape}.items())) +if args.hybridize: + print('hybridize_optional', hybridize_optional) + dirname = './data' dirname = os.path.expanduser(dirname) if not os.path.exists(dirname): @@ -114,6 +131,8 @@ ntokens = len(vocab) model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied) +if args.hybridize: + model.hybridize(**hybridize_optional) model.initialize(mx.init.Xavier(), ctx=context) compression_params = None if args.gctype == 'none' else {'type': args.gctype, 'threshold': args.gcthreshold} @@ -123,6 +142,8 @@ 'wd': 0}, compression_params=compression_params) loss = gluon.loss.SoftmaxCrossEntropyLoss() +if args.hybridize: + loss.hybridize(**hybridize_optional) ############################################################################### # Training code @@ -177,6 +198,10 @@ def train(): epoch, i, cur_L, math.exp(cur_L))) total_L = 0.0 + if args.export_model: + model.export('model') + return + val_L = eval(val_data) print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%( @@ -193,6 +218,8 @@ def train(): if __name__ == '__main__': train() - model.load_parameters(args.save, context) - test_L = eval(test_data) - print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L))) + if not args.export_model: + model.load_parameters(args.save, context) + test_L = eval(test_data) + print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L))) + diff --git a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py index 196cfaf4e..2c206e1a4 100644 --- a/example/image-classification/train_mnist.py +++ b/example/image-classification/train_mnist.py @@ -73,7 +73,6 @@ def get_mnist_iter(args, kv): parser.add_argument('--add_stn', action="store_true", default=False, help='Add Spatial Transformer Network Layer (lenet only)') parser.add_argument('--image_shape', default='1, 28, 28', help='shape of training images') - parser.add_argument('--with-nnp', action="store_true", default=False, dest="is_nnp") fit.add_fit_args(parser) parser.set_defaults( diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py index 8a2818c4b..41713c3c3 100644 --- a/example/quantization/imagenet_gen_qsym.py +++ b/example/quantization/imagenet_gen_qsym.py @@ -155,6 +155,16 @@ def save_params(fname, arg_params, aux_params, logger=None): if args.ctx == 'gpu': calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1 or name.find('fc') != -1) + excluded_sym_names += ['ch_concat_3a_chconcat', + 'ch_concat_3b_chconcat', + 'ch_concat_3c_chconcat', + 'ch_concat_4a_chconcat', + 'ch_concat_4b_chconcat', + 'ch_concat_4c_chconcat', + 'ch_concat_4d_chconcat', + 'ch_concat_4e_chconcat', + 'ch_concat_5a_chconcat', + 'ch_concat_5b_chconcat'] else: calib_layer = lambda name: name.endswith('_output') and (name.find('conv') != -1) excluded_sym_names += ['flatten', 'fc1'] diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py index 938890bb7..d807e7f2d 100644 --- a/example/quantization/imagenet_gen_qsym_mkldnn.py +++ b/example/quantization/imagenet_gen_qsym_mkldnn.py @@ -55,24 +55,24 @@ def convert_from_gluon(model_name, image_shape, classes=1000, logger=None): symnet = mx.symbol.load_json(y.tojson()) params = net.collect_params() args = {} - auxs = {} + auxs = {} for param in params.values(): v = param._reduce() k = param.name if 
'running' in k: auxs[k] = v else: - args[k] = v + args[k] = v mod = mx.mod.Module(symbol=symnet, context=mx.cpu(), label_names = ['softmax_label']) - mod.bind(for_training=False, - data_shapes=[('data', (1,) + + mod.bind(for_training=False, + data_shapes=[('data', (1,) + tuple([int(i) for i in image_shape.split(',')]))]) mod.set_params(arg_params=args, aux_params=auxs) dst_dir = os.path.join(dir_path, 'model') prefix = os.path.join(dir_path, 'model', model_name) if not os.path.isdir(dst_dir): - os.mkdir(dst_dir) + os.mkdir(dst_dir) mod.save_checkpoint(prefix, 0) return prefix @@ -104,7 +104,7 @@ def save_params(fname, arg_params, aux_params, logger=None): 'you can set to custom to load your pre-trained model.') parser.add_argument('--use-gluon-model', type=bool, default=False, help='If enabled, will download pretrained model from Gluon-CV ' - 'and convert to symbolic model ') + 'and convert to symbolic model ') parser.add_argument('--batch-size', type=int, default=32) parser.add_argument('--label-name', type=str, default='softmax_label') parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec', @@ -114,7 +114,7 @@ def save_params(fname, arg_params, aux_params, logger=None): help='number of threads for data decoding') parser.add_argument('--num-calib-batches', type=int, default=10, help='number of batches for calibration') - parser.add_argument('--exclude-first-conv', action='store_true', default=True, + parser.add_argument('--exclude-first-conv', action='store_true', default=False, help='excluding quantizing the first conv layer since the' ' input data may have negative value which doesn\'t support at moment' ) parser.add_argument('--shuffle-dataset', action='store_true', default=True, @@ -140,8 +140,8 @@ def save_params(fname, arg_params, aux_params, logger=None): ' thresholds. 
This mode is expected to produce the best inference accuracy of all three' ' kinds of quantized models if the calibration dataset is representative enough of the' ' inference dataset.') - parser.add_argument('--quantized-dtype', type=str, default='uint8', - choices=['int8', 'uint8'], + parser.add_argument('--quantized-dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], help='quantization destination data type for input data') parser.add_argument('--enable-calib-quantize', type=bool, default=True, help='If enabled, the quantize op will ' @@ -198,40 +198,39 @@ def save_params(fname, arg_params, aux_params, logger=None): # get image shape image_shape = args.image_shape + calib_layer = lambda name: name.endswith('_output') or name == "data" exclude_first_conv = args.exclude_first_conv + if args.quantized_dtype == "uint8": + logger.info('quantized dtype is set to uint8, will exclude first conv.') + exclude_first_conv = True excluded_sym_names = [] if args.model == 'imagenet1k-resnet-152': rgb_mean = '0,0,0' rgb_std = '1,1,1' - calib_layer = lambda name: name.endswith('_output') - excluded_sym_names += ['flatten0', 'fc1', 'pooling0'] + excluded_sym_names += ['flatten0', 'fc1'] if exclude_first_conv: excluded_sym_names += ['conv0'] elif args.model == 'imagenet1k-inception-bn': rgb_mean = '123.68,116.779,103.939' rgb_std = '1,1,1' - calib_layer = lambda name: name.endswith('_output') excluded_sym_names += ['flatten', 'fc1'] if exclude_first_conv: excluded_sym_names += ['conv_1'] elif args.model in ['resnet50_v1', 'resnet101_v1']: rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' - calib_layer = lambda name: name.endswith('_output') - excluded_sym_names += ['resnetv10_dense0_fwd', 'resnetv10_pool0_fwd'] + excluded_sym_names += ['resnetv10_dense0_fwd'] if exclude_first_conv: excluded_sym_names += ['resnetv10_conv0_fwd'] elif args.model == 'squeezenet1.0': rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' - calib_layer = lambda name: name.endswith('_output') excluded_sym_names += ['squeezenet0_flatten0_flatten0'] if exclude_first_conv: excluded_sym_names += ['squeezenet0_conv0_fwd'] elif args.model == 'mobilenet1.0': rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' - calib_layer = lambda name: name.endswith('_output') excluded_sym_names += ['mobilenet0_flatten0_flatten0', 'mobilenet0_dense0_fwd', 'mobilenet0_pool0_fwd'] @@ -240,22 +239,15 @@ def save_params(fname, arg_params, aux_params, logger=None): elif args.model == 'inceptionv3': rgb_mean = '123.68,116.779,103.939' rgb_std = '58.393, 57.12, 57.375' - calib_layer = lambda name: name.endswith('_output') - excluded_sym_names += ['inception30_dense0_fwd', - 'inception30_pool0_fwd'] + excluded_sym_names += ['inception30_dense0_fwd'] if exclude_first_conv: excluded_sym_names += ['inception30_conv0_fwd'] elif args.model == 'custom': # add rgb mean/std of your model. rgb_mean = '0,0,0' rgb_std = '0,0,0' - calib_layer = lambda name: name.endswith('_output') # add layer names you do not want to quantize. - # add conv/pool layer names that has negative inputs - # since Intel MKL-DNN only support uint8 quantization temporary. - # add all fc layer names since Intel MKL-DNN does not support temporary. excluded_sym_names += ['layers'] - # add your first conv layer names since Intel MKL-DNN only support uint8 quantization temporary.
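A minimal standalone sketch of the calibration API these options feed, assuming a symbolic checkpoint saved under the hypothetical prefix model/resnet50_v1; the exclusion list and the skipped calibration are likewise illustrative, not part of the patch:

```python
# Sketch: quantize a saved symbolic model with the new 'auto' dtype choice.
# The checkpoint prefix and excluded layer names are placeholders.
import mxnet as mx
from mxnet.contrib.quantization import quantize_model

sym, arg_params, aux_params = mx.model.load_checkpoint('model/resnet50_v1', 0)
qsym, qarg_params, aux_params = quantize_model(
    sym=sym, arg_params=arg_params, aux_params=aux_params,
    ctx=mx.cpu(),
    excluded_sym_names=['resnetv10_dense0_fwd'],  # keep the classifier in FP32
    calib_mode='none',         # no calibration data in this sketch
    quantized_dtype='auto')    # backend decides between int8 and uint8
```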
if exclude_first_conv: excluded_sym_names += ['layers'] else: @@ -272,7 +264,7 @@ def save_params(fname, arg_params, aux_params, logger=None): mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]} logger.info('rgb_std = %s' % rgb_std) rgb_std = [float(i) for i in rgb_std.split(',')] - std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]} + std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]} combine_mean_std = {} combine_mean_std.update(mean_args) combine_mean_std.update(std_args) @@ -303,8 +295,7 @@ def save_params(fname, arg_params, aux_params, logger=None): calib_mode=calib_mode, calib_data=data, num_calib_examples=num_calib_batches * batch_size, calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, - label_names=(label_name,), calib_quantize_op = True, - logger=logger) + label_names=(label_name,), logger=logger) if calib_mode == 'entropy': suffix = '-quantized-%dbatches-entropy' % num_calib_batches elif calib_mode == 'naive': diff --git a/example/ssd/quantization.py b/example/ssd/quantization.py index 4e6e73996..4b111dfa1 100644 --- a/example/ssd/quantization.py +++ b/example/ssd/quantization.py @@ -51,7 +51,7 @@ def save_params(fname, arg_params, aux_params, logger=None): parser.add_argument('--batch-size', type=int, default=32) parser.add_argument('--num-calib-batches', type=int, default=5, help='number of batches for calibration') - parser.add_argument('--exclude-first-conv', action='store_true', default=True, + parser.add_argument('--exclude-first-conv', action='store_true', default=False, help='excluding quantizing the first conv layer since the' ' number of channels is usually not a multiple of 4 in that layer' ' which does not satisfy the requirement of cuDNN') @@ -78,8 +78,8 @@ def save_params(fname, arg_params, aux_params, logger=None): ' thresholds. 
This mode is expected to produce the best inference accuracy of all three' ' kinds of quantized models if the calibration dataset is representative enough of the' ' inference dataset.') - parser.add_argument('--quantized-dtype', type=str, default='uint8', - choices=['int8', 'uint8'], + parser.add_argument('--quantized-dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], help='quantization destination data type for input data') args = parser.parse_args() @@ -115,18 +115,19 @@ def save_params(fname, arg_params, aux_params, logger=None): # get image shape image_shape = '3,300,300' + def calib_layer(name): return not (name.endswith('_data') or + name.endswith('_weight') or + name.endswith('_bias') or + name.endswith('_workspace')) # Quantization layer configs exclude_first_conv = args.exclude_first_conv excluded_sym_names = [] rgb_mean = '123,117,104' for i in range(1,19): excluded_sym_names += ['flatten'+str(i)] - excluded_sym_names += ['relu4_3_cls_pred_conv', - 'relu7_cls_pred_conv', - 'relu4_3_loc_pred_conv', - 'multibox_loc_pred', - 'concat0', - 'concat1'] + excluded_sym_names += ['multibox_loc_pred', + 'concat0', + 'concat1'] if exclude_first_conv: excluded_sym_names += ['conv1_1'] @@ -158,10 +159,8 @@ def save_params(fname, arg_params, aux_params, logger=None): ctx=ctx, excluded_sym_names=excluded_sym_names, calib_mode=calib_mode, calib_data=eval_iter, num_calib_examples=num_calib_batches * batch_size, - calib_layer=None, quantized_dtype=args.quantized_dtype, - label_names=(label_name,), - calib_quantize_op=True, - logger=logger) + calib_layer=calib_layer, quantized_dtype=args.quantized_dtype, + label_names=(label_name,), logger=logger) sym_name = '%s-symbol.json' % ('./model/cqssd_vgg16_reduced_300') param_name = '%s-%04d.params' % ('./model/cqssd_vgg16_reduced_300', epoch) qsym = qsym.get_backend_symbol('MKLDNN_POST_QUANTIZE') diff --git a/include/mxnet/base.h b/include/mxnet/base.h index 3b226d757..d6d186a5f 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -25,47 +25,18 @@ #ifndef MXNET_BASE_H_ #define MXNET_BASE_H_ -#include <dmlc/base.h> -#include <dmlc/io.h> -#include <dmlc/type_traits.h> -#include <dmlc/parameter.h> -#include <mshadow/tensor.h> -// nnvm headers for symbolic construction. -#include <nnvm/op.h> -#include <nnvm/tuple.h> -#include <nnvm/symbolic.h> +#include "dmlc/base.h" #include <string> +#include "dmlc/io.h" +#include "dmlc/type_traits.h" +#include "dmlc/parameter.h" +#include "mshadow/tensor.h" +// nnvm headers for symbolic construction. +#include "nnvm/op.h" +#include "nnvm/tuple.h" +#include "nnvm/symbolic.h" +#include "mxfeatures.h" -/*! - *\brief whether to use opencv support - */ -#ifndef MXNET_USE_OPENCV -#define MXNET_USE_OPENCV 1 -#endif - -/*! - *\brief whether to use cuda support - */ -#ifndef MXNET_USE_CUDA -#define MXNET_USE_CUDA MSHADOW_USE_CUDA -#endif - -/*! - *\brief whether to use cudnn library for convolution - */ -#ifndef MXNET_USE_CUDNN -#define MXNET_USE_CUDNN MSHADOW_USE_CUDNN -#endif - -/*! - *\brief whether to use cusolver library - */ -#ifndef MXNET_USE_CUSOLVER -#define MXNET_USE_CUSOLVER MSHADOW_USE_CUSOLVER -#endif - -/*! \brief Error message for using gpu when MXNET_USE_CUDA==0 */ -#define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" /*! * \brief define compatible keywords in g++ @@ -268,6 +239,10 @@ struct RunContext { * \brief the stream of the device, can be NULL or Stream<gpu>* in GPU mode */ void *stream; + /*! + * \brief indicator of whether this execution is run in bulk mode + */ + bool is_bulk; /*!
* \brief get mshadow stream from Context * \return the mshadow stream @@ -427,6 +402,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { #define MXNET_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" STRINGIZE(__LINE__)) #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__) + #if MXNET_USE_MKLDNN == 1 constexpr size_t kMKLDNNAlign = 64; #endif diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index ce773b8ad..1cf849e0f 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -208,6 +208,15 @@ MXNET_DLL const char *MXGetLastError(); //------------------------------------- // Part 0: Global State setups //------------------------------------- + +/*! + * \brief Check if a feature is enabled in the library + * \param feature the feature to check, as defined in mxfeatures.h + * \param out set to true if the feature is enabled, false otherwise + * \return 0 when success, -1 when failure happens. + */ +MXNET_DLL int MXHasFeature(const mx_uint feature, bool* out); + /*! * \brief Seed all global random number generators in mxnet. * \param seed the random number seed. @@ -465,6 +474,7 @@ MXNET_DLL int MXGetGPUMemoryInformation64(int dev, uint64_t *free_mem, uint64_t */ MXNET_DLL int MXGetVersion(int *out); + //------------------------------------- // Part 1: NDArray creation and deletion //------------------------------------- @@ -1556,7 +1566,7 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym, * \param num_offline number of parameters that are quantized offline * \param offline_params array of c strings representing the names of params quantized offline * \param quantized_dtype the quantized destination type for input data. - * \param calib_quantize whether calibrate quantize op with offline calibration data. + * \param calib_quantize **Deprecated**. The quantize op will always be calibrated when possible. */ MXNET_DLL int MXQuantizeSymbol(SymbolHandle sym_handle, SymbolHandle *ret_sym_handle, const mx_uint num_excluded_symbols, @@ -1837,6 +1847,14 @@ MXNET_DLL int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, MXNET_DLL int MXExecutorSetMonitorCallback(ExecutorHandle handle, ExecutorMonitorCallback callback, void* callback_handle); + +/*! + * \brief set a callback to notify the completion of an operation + * \param monitor_all If true, monitor both input and output, otherwise monitor output only. + */ +MXNET_DLL int MXExecutorSetMonitorCallbackEX(ExecutorHandle handle, + ExecutorMonitorCallback callback, + void *callback_handle, bool monitor_all); //-------------------------------------------- // Part 5: IO Interface //-------------------------------------------- diff --git a/include/mxnet/engine.h b/include/mxnet/engine.h index e02b995d6..408a70a5f 100644 --- a/include/mxnet/engine.h +++ b/include/mxnet/engine.h @@ -74,15 +74,15 @@ class CallbackOnComplete { public: // use implicit copy and assign /*! \brief invoke the callback */ - inline void operator()() const { - (*callback_)(engine_, param_); + inline void operator()(const dmlc::Error* error = nullptr) const { + (*callback_)(engine_, param_, error); } private: /*! \brief engine can see content of callback */ friend class ::mxnet::Engine; /*! \brief the real callback */ - void (*callback_)(Engine *, void *); + void (*callback_)(Engine *, void *, const dmlc::Error *); /*! \brief the engine class passed to callback */ Engine* engine_; /*! \brief the parameter set on callback */ @@ -275,7 +275,7 @@ class MXNET_API Engine { * \param param the parameter passed to callback.
*/ inline CallbackOnComplete CreateCallback( - void (*callback)(Engine *, void *), void *param) { + void (*callback)(Engine *, void *, const dmlc::Error *), void *param) { CallbackOnComplete ret; ret.callback_ = callback; ret.engine_ = this; diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index 0ab04b86a..aec10091a 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -174,7 +174,7 @@ class Executor { /*! * \brief Install a callback to notify the completion of operation. */ - virtual void SetMonitorCallback(const MonitorCallback& callback) {} + virtual void SetMonitorCallback(const MonitorCallback& callback, bool monitor_all = false) {} }; // class executor } // namespace mxnet #endif // MXNET_EXECUTOR_H_ diff --git a/include/mxnet/io.h b/include/mxnet/io.h index 3c806d85d..e18f03ed0 100644 --- a/include/mxnet/io.h +++ b/include/mxnet/io.h @@ -25,12 +25,12 @@ #ifndef MXNET_IO_H_ #define MXNET_IO_H_ -#include <dmlc/data.h> -#include <dmlc/registry.h> #include <vector> #include <string> #include <utility> #include <queue> +#include "dmlc/data.h" +#include "dmlc/registry.h" #include "./base.h" #include "./ndarray.h" diff --git a/include/mxnet/mxfeatures.h b/include/mxnet/mxfeatures.h new file mode 100644 index 000000000..10f9b3656 --- /dev/null +++ b/include/mxnet/mxfeatures.h @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file mxfeatures.h + * \brief check MXNet features including compile time support + */ + +#pragma once + +#include "dmlc/base.h" +#include "mshadow/base.h" + +/*! + *\brief whether to use opencv support + */ +#ifndef MXNET_USE_OPENCV +#define MXNET_USE_OPENCV 1 +#endif + +/*! + *\brief whether to use cuda support + */ +#ifndef MXNET_USE_CUDA +#define MXNET_USE_CUDA MSHADOW_USE_CUDA +#endif + +/*! + *\brief whether to use cudnn library for convolution + */ +#ifndef MXNET_USE_CUDNN +#define MXNET_USE_CUDNN MSHADOW_USE_CUDNN +#endif + +#ifndef MXNET_USE_NCCL +#define MXNET_USE_NCCL 0 +#endif + +/*! + *\brief whether to use cusolver library + */ +#ifndef MXNET_USE_CUSOLVER +#define MXNET_USE_CUSOLVER MSHADOW_USE_CUSOLVER +#endif + +#ifndef MXNET_ENABLE_CUDA_RTC +#define MXNET_ENABLE_CUDA_RTC 0 +#endif + +/*!
\brief Error message for using gpu when MXNET_USE_CUDA==0 */ +#define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled" + + +#ifndef MXNET_USE_TENSORRT +#define MXNET_USE_TENSORRT 0 +#endif + + +#ifndef MXNET_USE_BLAS_ATLAS +#define MXNET_USE_BLAS_ATLAS 0 +#endif + +#ifndef MXNET_USE_BLAS_OPEN +#define MXNET_USE_BLAS_OPEN 0 +#endif + +#ifndef MXNET_USE_BLAS_MKL +#define MXNET_USE_BLAS_MKL 0 +#endif + +#ifndef MXNET_USE_BLAS_APPLE +#define MXNET_USE_BLAS_APPLE 0 +#endif + +#ifndef MXNET_USE_LAPACK +#define MXNET_USE_LAPACK 0 +#endif + +#ifndef MXNET_USE_MKLDNN +#define MXNET_USE_MKLDNN 0 +#endif + +#ifndef MXNET_USE_OPENMP +#define MXNET_USE_OPENMP 0 +#endif + +#ifndef MXNET_USE_F16C +#define MXNET_USE_F16C MSHADOW_USE_F16C +#endif + +#ifndef MXNET_USE_CAFFE +#define MXNET_USE_CAFFE 0 +#endif + +#ifndef MXNET_USE_DIST_KVSTORE +#define MXNET_USE_DIST_KVSTORE 0 +#endif + +#ifndef MXNET_USE_SIGNAL_HANDLER +#define MXNET_USE_SIGNAL_HANDLER 0 +#endif + + + +namespace mxnet { +namespace features { +// Check compile-time flags (as set e.g. from CMakeLists.txt) + +/// Compile time features +enum : uint32_t { + // NVIDIA, CUDA + CUDA = 0, + CUDNN, + NCCL, + CUDA_RTC, + TENSORRT, + + // CPU Features / optimizations + CPU_SSE, + CPU_SSE2, + CPU_SSE3, + CPU_SSE4_1, + CPU_SSE4_2, + CPU_SSE4A, // AMD extensions to SSE4 + CPU_AVX, + CPU_AVX2, + + + // Multiprocessing / CPU / System + OPENMP, + SSE, + F16C, + JEMALLOC, + + // Math libraries & BLAS + // Flavour of BLAS + BLAS_OPEN, + BLAS_ATLAS, + // Intel(R) Math Kernel Library + BLAS_MKL, + BLAS_APPLE, + // Other math libraries: + // Linear Algebra PACKage + LAPACK, + // Intel(R) Math Kernel Library for Deep Neural Networks + MKLDNN, + + // Image processing + OPENCV, + + // Misc + CAFFE, + PROFILER, + DIST_KVSTORE, + CXX14, + // Signal handler to print stack traces on exceptions + SIGNAL_HANDLER, + DEBUG, + + // size indicator + MAX_FEATURES +}; + + +/*! + * \return true if the given feature is supported + */ +bool is_enabled(uint32_t feat); + +} // namespace features +} // namespace mxnet diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 4ba13ca64..5de42e19a 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -694,9 +694,13 @@ class NDArray { /* * Create NDArray from mkldnn memory. * mkldnn_mem The mkldnn memory to be managed. - * static_data If true, mkldnn memory won't be freed on destruction. */ - explicit NDArray(const mkldnn::memory *mkldnn_mem, bool static_data = true); + explicit NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem); + /* + * Create NDArray from mkldnn memory descriptor. + * mem_pd The mkldnn memory primitive_desc the NDArray is created from. + */ + explicit NDArray(mkldnn::memory::primitive_desc mem_pd); /* * Test if the data is stored in one of the special MKLDNN formats. */ @@ -776,7 +780,7 @@ class NDArray { /*! * \brief Fix mkldnn memory descriptor mismatch from NDArray. */ - void UpdateMKLDNNMemDesc(); + void UpdateMKLDNNMemDesc(mkldnn::memory::format format); #endif /*! diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index 431fa2ba4..ec4eea95a 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -238,6 +238,8 @@ using FResourceRequest = std::function< /*! * \brief The resource request from the operator. * An operator could register ResourceRequestEx, or ResourceRequest, or neither. + * If an operator registers both ResourceRequestEx and ResourceRequest, + * ResourceRequest is ignored.
* * \note Register under "FResourceRequestEx" */ @@ -254,7 +256,7 @@ using FNDArrayFunction = std::function<void (const nnvm::NodeAttrs& attrs, const std::vector<NDArray>& inputs, std::vector<NDArray>* outputs)>; /*! - * \brief Resiger a compute function for simple stateless forward only operator + * \brief Register a compute function for simple stateless forward only operator * * \note Register under "FCompute<cpu>" and "FCompute<gpu>" */ @@ -264,7 +266,7 @@ using FCompute = std::function<void (const nnvm::NodeAttrs& attrs, const std::vector<OpReqType>& req, const std::vector<TBlob>& outputs)>; /*! - * \brief Resiger an NDArray compute function for simple stateless forward only operator + * \brief Register an NDArray compute function for simple stateless forward only operator * \note Register under "FComputeEx<cpu>" and "FComputeEx<gpu>" * Dispatched only when inferred dispatch_mode is FDispatchComputeEx */ @@ -275,7 +277,7 @@ using FComputeEx = std::function<void (const nnvm::NodeAttrs& attrs, const std::vector<NDArray>& outputs)>; /*! - * \brief Resiger a storage and dispatch mode inference function based on + * \brief Register a storage and dispatch mode inference function based on * storage types of the inputs and outputs, and the dev_mask for the operator. * * \note Register under "FInferStorageType" diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h index 67c14b66a..34c8f88d1 100644 --- a/include/mxnet/resource.h +++ b/include/mxnet/resource.h @@ -44,6 +44,11 @@ struct ResourceRequest { kTempSpace, /*! \brief common::RandGenerator object, which can be used in GPU kernel functions */ kParallelRandom +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + , + /*! \brief cudnnDropoutDescriptor_t object for GPU dropout kernel functions */ + kCuDNNDropoutDesc +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 }; /*! \brief type of resources */ Type type; @@ -157,6 +162,21 @@ struct Resource { reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))), shape, shape[ndim - 1], stream); } +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + /*! + * \brief Get cudnn dropout descriptor from shared state space. + * + * \param dropout_desc reference to previously created cudnn dropout descriptor. + * \param stream the stream to use for the dropout state. + * \param dropout the probability of dropping out an element. + * \param seed seed for the dropout random state. + */ + void get_cudnn_dropout_desc( + cudnnDropoutDescriptor_t* dropout_desc, + mshadow::Stream<gpu> *stream, + const float dropout, + uint64_t seed) const; +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + /*! * \brief Get CPU space as mshadow Tensor in specified type. * The caller can request arbitrary size.
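The new `kCuDNNDropoutDesc` request lets GPU dropout kernels draw their cuDNN descriptor (and the sizeable random state buffer behind it) from per-device shared state instead of allocating one per operator. A minimal sketch of the intended call pattern, assuming the operator requested this resource at index 0 and the usual `OpContext` plumbing (both assumptions, not shown in this patch):

```cpp
#include <mxnet/base.h>
#include <mxnet/op_attr_types.h>  // OpContext, which carries `requested`

#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
// Sketch only: inside a GPU dropout operator's forward implementation.
void DropoutForwardSketch(const mxnet::OpContext& ctx,
                          float dropout_p, uint64_t seed) {
  mshadow::Stream<mxnet::gpu>* s = ctx.get_stream<mxnet::gpu>();
  cudnnDropoutDescriptor_t desc;
  cudnnCreateDropoutDescriptor(&desc);
  // Fill `desc` from the shared per-device dropout state space
  // instead of allocating a fresh cuDNN state buffer here.
  ctx.requested[0].get_cudnn_dropout_desc(&desc, s, dropout_p, seed);
  // ... run cudnnDropoutForward with `desc` ...
}
#endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
```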
diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 496e8c7cf..412877a58 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -287,7 +287,7 @@ class TBlob { CHECK(Device::kDevMask == this->dev_mask()) << "TBlob.get: device type do not match specified type"; CHECK_EQ(this->CheckContiguous(), true) << "TBlob.get_reshape: must be contiguous"; - CHECK_EQ(this->shape_.Size(), shape.Size()) + CHECK_EQ(this->shape_.Size(), static_cast<size_t>(shape.Size())) << "TBlob.get_with_shape: new and old shape do not match total elements"; return mshadow::Tensor<Device, dim, DType>(dptr<DType>(), shape, shape[dim - 1], stream); diff --git a/julia/.gitignore b/julia/.gitignore index 3687ed485..e7b35fa85 100644 --- a/julia/.gitignore +++ b/julia/.gitignore @@ -8,3 +8,4 @@ deps/src deps/usr deps/deps.jl .vscode +/Manifest.toml diff --git a/julia/NEWS.md b/julia/NEWS.md index 71ee86ff7..3da119496 100644 --- a/julia/NEWS.md +++ b/julia/NEWS.md @@ -1,11 +1,10 @@ -# v0.4.0 (#TBD) +# v1.5.0 (#TBD) * Following material from `mx` module got exported (#TBD): * `NDArray` * `clip()` * `clip!()` * `context()` - * `empty()` * `expand_dims()` * `@inplace` * `σ()` @@ -113,6 +112,16 @@ 3.0 ``` +* `mx.empty` is deprecated and replaced by `UndefInitializer` constructor. (#TBD) + + E.g. + ```julia + julia> NDArray(undef, 2, 5) + 2×5 NDArray{Float32,2} @ CPU0: + -21260.344f0 1.674986f19 0.00016893122f0 1.8363f-41 0.0f0 + 3.0763f-41 1.14321726f27 4.24219f-8 0.0f0 0.0f0 + ``` + * A port of Python's `autograd` for `NDArray` (#274) * `size(x, dims...)` is supported now. (#TBD) diff --git a/julia/Project.toml b/julia/Project.toml new file mode 100644 index 000000000..82a94c5a0 --- /dev/null +++ b/julia/Project.toml @@ -0,0 +1,26 @@ +name = "MXNet" +uuid = "a7949054-b901-59c6-b8e3-7238c29bf7f0" +authors = ["Chiyuan Zhang ", "Valentin Churavy ", "Iblis Lin "] +version = "1.5.0" + +[deps] +BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee" +Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[compat] +julia = "≥0.7" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/julia/REQUIRE b/julia/REQUIRE index b53f0c3cc..8008da3d2 100644 --- a/julia/REQUIRE +++ b/julia/REQUIRE @@ -1,4 +1,4 @@ -julia 0.6 +julia 0.7 Formatting BinDeps JSON diff --git a/julia/appveyor.yml b/julia/appveyor.yml deleted file mode 100644 index 50e275cfa..000000000 --- a/julia/appveyor.yml +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -environment: - matrix: - - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe" - -branches: - only: - - master - - stable - -notifications: - - provider: Email - on_build_success: false - on_build_failure: false - on_build_status_changed: false - -install: - - ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12" -# If there's a newer build queued for the same PR, cancel this one - - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod ` - https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | ` - Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { ` - throw "There are newer queued builds for this pull request, failing early." } - -# Download most recent Julia Windows binary - - ps: (new-object net.webclient).DownloadFile( - $env:JULIA_URL, - "C:\projects\julia-binary.exe") -# Run installer silently, output to C:\projects\julia - - C:\projects\julia-binary.exe /S /D=C:\projects\julia - -build_script: -# Need to convert from shallow to complete for Pkg.clone to work - - IF EXIST .git\shallow (git fetch --unshallow) - - C:\projects\julia\bin\julia -e "versioninfo(); - Pkg.clone(pwd(), \"MXNet\"); Pkg.build(\"MXNet\")" - -test_script: - - C:\projects\julia\bin\julia --check-bounds=yes -e "Pkg.test(\"MXNet\")" - diff --git a/julia/deps/build.jl b/julia/deps/build.jl index 7a37803f3..9a719be85 100644 --- a/julia/deps/build.jl +++ b/julia/deps/build.jl @@ -86,7 +86,7 @@ if HAS_CUDA if HAS_CUDNN @info("Found a CuDNN installation.") end - @info("CUDA_HOME -> $(get(ENV, "CUDA_HOME", nothing))") + @info("CUDA_HOME -> $(get(ENV, "CUDA_HOME", "nothing"))") else @info("Did not find a CUDA installation, using CPU-only version of MXNet.") end diff --git a/julia/docs/src/user-guide/overview.md b/julia/docs/src/user-guide/overview.md index a81d7ff30..5815bc6d7 100644 --- a/julia/docs/src/user-guide/overview.md +++ b/julia/docs/src/user-guide/overview.md @@ -73,9 +73,13 @@ operators in Julia directly. The followings are common ways to create `NDArray` objects: -- `mx.empty(shape[, context])`: create on uninitialized array of a - given shape on a specific device. For example, - `mx.empty(2, 3)`, `mx.((2, 3), mx.gpu(2))`. +- `NDArray(undef, shape...; ctx = context, writable = true)`: + create an uninitialized array of a given shape on a specific device. + For example, + `NDArray(undef, 2, 3)`, `NDArray(undef, 2, 3, ctx = mx.gpu(2))`. +- `NDArray(undef, shape; ctx = context, writable = true)` +- `NDArray{T}(undef, shape...; ctx = context, writable = true)`: + create an uninitialized array with the given type `T`. - `mx.zeros(shape[, context])` and `mx.ones(shape[, context])`: similar to the Julia's built-in `zeros` and `ones`. - `mx.copy(jl_arr, context)`: copy the contents of a Julia `Array` to @@ -101,11 +105,11 @@ shows a way to set the contents of an `NDArray`.
```@repl using MXNet mx.srand(42) -a = mx.empty(2, 3) +a = NDArray(undef, 2, 3) a[:] = 0.5 # set all elements to a scalar a[:] = rand(size(a)) # set contents with a Julia Array copy!(a, rand(size(a))) # set value by copying a Julia Array -b = mx.empty(size(a)) +b = NDArray(undef, size(a)) b[:] = a # copying and assignment between NDArrays ``` @@ -175,7 +179,7 @@ function inplace_op() grad = mx.ones(SHAPE, CTX) # pre-allocate temp objects - grad_lr = mx.empty(SHAPE, CTX) + grad_lr = NDArray(undef, SHAPE, ctx = CTX) for i = 1:N_REP copy!(grad_lr, grad) @@ -234,7 +238,7 @@ shape = (2, 3) key = 3 mx.init!(kv, key, mx.ones(shape) * 2) -a = mx.empty(shape) +a = NDArray(undef, shape) mx.pull!(kv, key, a) # pull value into a a ``` diff --git a/julia/src/MXNet.jl b/julia/src/MXNet.jl index febd80cc8..68663d1e5 100644 --- a/julia/src/MXNet.jl +++ b/julia/src/MXNet.jl @@ -53,7 +53,6 @@ export NDArray, clip, clip!, context, - empty, expand_dims, @inplace, # activation funcs diff --git a/julia/src/deprecated.jl b/julia/src/deprecated.jl index 32819810e..70079b8dc 100644 --- a/julia/src/deprecated.jl +++ b/julia/src/deprecated.jl @@ -169,3 +169,28 @@ import Base: sum, maximum, minimum, prod, cat import Statistics: mean @deprecate mean(x::NDArray, dims) mean(x, dims = dims) + +# replaced by UndefInitializer +function empty(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} + @warn("`mx.empty(T, dims, ctx)` is deprecated, " * + "use `NDArray{T,N}(undef, dims; ctx = ctx)` instead.") + NDArray{T,N}(undef, dims; ctx = ctx) +end + +function empty(::Type{T}, dims::Int...) where {T<:DType} + @warn("`mx.empty(T, dims...)` is deprecated, " * + "use `NDArray{T}(undef, dims...)` instead.") + NDArray{T}(undef, dims...) +end + +function empty(dims::NTuple{N,Int}, ctx::Context = cpu()) where N + @warn("`mx.empty(dims, ctx)` is deprecated, " * + "use `NDArray(undef, dims; ctx = ctx)` instead.") + NDArray(undef, dims; ctx = ctx) +end + +function empty(dims::Int...) + @warn("`mx.empty(dims...)` is deprecated, " * + "use `NDArray(undef, dims...)` instead.") + NDArray(undef, dims...) +end diff --git a/julia/src/io.jl b/julia/src/io.jl index 32f7fece7..6309f7ecd 100644 --- a/julia/src/io.jl +++ b/julia/src/io.jl @@ -360,7 +360,7 @@ function ArrayDataProvider(data, label; batch_size::Int = 0, shuffle::Bool = fal function gen_batch_nds(arrs :: Vector{Array{MX_float}}, bsize :: Int) map(arrs) do arr shape = size(arr) - empty(shape[1:end-1]..., bsize) + NDArray(undef, shape[1:end-1]..., bsize) end end diff --git a/julia/src/kvstore.jl b/julia/src/kvstore.jl index 000684d5f..1fb6df20d 100644 --- a/julia/src/kvstore.jl +++ b/julia/src/kvstore.jl @@ -128,7 +128,7 @@ One can use ``barrier()`` to sync all workers.
julia> kv = KVStore(:local) mx.KVStore @ local -julia> x = mx.empty(2, 3); +julia> x = NDArray(undef, 2, 3); julia> init!(kv, 3, x) @@ -161,11 +161,11 @@ julia> x ```jldoctest julia> keys = [4, 5]; -julia> init!(kv, keys, [empty(2, 3), empty(2, 3)]) +julia> init!(kv, keys, [NDArray(undef, 2, 3), NDArray(undef, 2, 3)]) julia> push!(kv, keys, [x, x]) -julia> y, z = empty(2, 3), empty(2, 3); +julia> y, z = NDArray(undef, 2, 3), NDArray(undef, 2, 3); julia> pull!(kv, keys, [y, z]) ``` @@ -279,7 +279,7 @@ julia> init!(kv, 42, mx.ones(2, 3)) julia> push!(kv, 42, mx.ones(2, 3)) -julia> x = empty(2, 3); +julia> x = NDArray(undef, 2, 3); julia> pull!(kv, 42, x) diff --git a/julia/src/model.jl b/julia/src/model.jl index cb5f95e3c..0324edd1c 100644 --- a/julia/src/model.jl +++ b/julia/src/model.jl @@ -122,7 +122,7 @@ function init_model(self::FeedForward, initializer::AbstractInitializer; overwri delete!(self.arg_params, name) end end - arg_params[name] = empty(shape) + arg_params[name] = NDArray(undef, shape) end for (name, shape) in zip(aux_names, aux_shapes) @@ -135,7 +135,7 @@ function init_model(self::FeedForward, initializer::AbstractInitializer; overwri delete!(self.aux_params, name) end end - aux_params[name] = empty(shape) + aux_params[name] = NDArray(undef, shape) end for (k,v) in arg_params @@ -463,8 +463,8 @@ function fit(self::FeedForward, optimizer::AbstractOptimizer, data::AbstractData # set up output and labels in CPU for evaluation metric output_shapes = [tuple(size(x)[1:end-1]...,batch_size) for x in train_execs[1].outputs] cpu_dev = Context(CPU) - cpu_output_arrays = [empty(shape, cpu_dev) for shape in output_shapes] - cpu_label_arrays = [empty(shape, cpu_dev) for (name,shape) in provide_label(data)] + cpu_output_arrays = [NDArray(undef, shape, ctx = cpu_dev) for shape in output_shapes] + cpu_label_arrays = [NDArray(undef, shape, ctx = cpu_dev) for (name,shape) in provide_label(data)] # invoke callbacks on epoch 0 _invoke_callbacks(self, opts.callbacks, op_state, AbstractEpochCallback) diff --git a/julia/src/ndarray.jl b/julia/src/ndarray.jl index dad9b59e8..256fdb4f5 100644 --- a/julia/src/ndarray.jl +++ b/julia/src/ndarray.jl @@ -15,1751 +15,17 @@ # specific language governing permissions and limitations # under the License. -# All the types supported by mshadow. 
See `mshadow/base.h` -const DType = Union{Float32, Float64, Float16, UInt8, Int32, Int8, Int64} -@enum TypeFlag kFloat32 kFloat64 kFloat16 kUint8 kInt32 kInt8 kInt64 -const DEFAULT_DTYPE = Float32 # MSHADOW_DEFAULT_DTYPE - -function toTypeFlag(T::Type{<:DType}) - if T == Float32 - return kFloat32 - elseif T == Float64 - return kFloat64 - elseif T == Float16 - return kFloat16 - elseif T == UInt8 - return kUint8 - elseif T == Int32 - return kInt32 - elseif T == Int8 - return kInt8 - elseif T == Int64 - return kInt64 - else - throw(ArgumentError("Can't convert $T to DType.")) - end -end - -function fromTypeFlag(T::TypeFlag) - if T == kFloat32 - return Float32 - elseif T == kFloat64 - return Float64 - elseif T == kFloat16 - return Float16 - elseif T == kUint8 - return UInt8 - elseif T == kInt32 - return Int32 - elseif T == kInt8 - return Int8 - elseif T == kInt64 - return Int64 - else - throw(ArgumentError("Can't convert DType $T.")) - end -end - -# create a NDArray handle of specific shape -function _ndarray_alloc(shape::NTuple{N,Int}, ctx::Context, delay_alloc::Bool) where N - h_ref = Ref{MX_handle}(0) - shape = collect(reverse(MX_uint.(shape))) - @mxcall(:MXNDArrayCreate, (Ptr{MX_uint}, MX_uint, Cint, Cint, Cint, Ref{MX_handle}), - shape, N, ctx.device_type, ctx.device_id, delay_alloc, h_ref) - handle = MX_NDArrayHandle(h_ref[]) - return handle -end - -# create a NDArray handle of specific shape type -function _ndarray_alloc(::Type{T}, shape::NTuple{N,Int}, ctx::Context, delay_alloc::Bool) where {T<:DType,N} - h_ref = Ref{MX_handle}(0) - shape = collect(reverse(MX_uint.(shape))) - dtype = toTypeFlag(T) - @mxcall(:MXNDArrayCreateEx, (Ptr{MX_uint}, MX_uint, Cint, Cint, Cint, Cint, Ref{MX_handle}), - shape, N, ctx.device_type, ctx.device_id, delay_alloc, dtype, h_ref) - handle = MX_NDArrayHandle(h_ref[]) - return handle -end - -# create a handle to an empty NDArray, this handle can be used to hold -# results returned by libmx API calls -function _ndarray_alloc() - h_ref = Ref{MX_handle}(0) - @mxcall(:MXNDArrayCreateNone, (Ref{MX_handle},), h_ref) - return MX_NDArrayHandle(h_ref[]) -end - -################################################################################ -# NDArray Type -################################################################################ -""" - NDArray{T,N} - -Wrapper of the `NDArray` type in `libmxnet`. This is the basic building block -of tensor-based computation. - -!!! note - since C/C++ use row-major ordering for arrays while Julia follows a - column-major ordering. To keep things consistent, we keep the underlying data - in their original layout, but use *language-native* convention when we talk - about shapes. For example, a mini-batch of 100 MNIST images is a tensor of - C/C++/Python shape (100,1,28,28), while in Julia, the same piece of memory - have shape (28,28,1,100). 
-""" -mutable struct NDArray{T,N} - handle :: MX_NDArrayHandle - writable :: Bool - - NDArray{T,N}(handle, writable = true) where {T,N} = new(handle, writable) -end - -NDArray(x::AbstractArray{<:DType}) = copy(collect(x), cpu()) -NDArray(x::Array{<:DType}) = copy(x, cpu()) -NDArray(::Type{T}, x::AbstractArray) where {T<:DType} = - copy(convert(AbstractArray{T}, x), cpu()) -NDArray(handle, writable = true) = - NDArray{eltype(handle), ndims(handle)}(handle, writable) - -# type aliases -const NDArrayOrReal = Union{NDArray,Real} -const VecOfNDArray = AbstractVector{<:NDArray} - -function Base.show(io::IO, x::NDArray) - print(io, "NDArray(") - Base.show(io, try_get_shared(x, sync = :read)) - print(io, ")") -end - -# for REPL -function Base.show(io::IO, ::MIME{Symbol("text/plain")}, x::NDArray{T,N}) where {T,N} - type_ = split(string(typeof(x)), '.', limit=2)[end] - n = length(x) - size_ = N == 1 ? "$n-element" : join(size(x), "×") - print(io, "$size_ $type_ @ $(context(x))", (n == 0) ? "" : ":\n") - Base.print_array(io, try_get_shared(x, sync = :read)) -end - -Base.unsafe_convert(::Type{MX_handle}, x::NDArray) = - Base.unsafe_convert(MX_handle, x.handle) -Base.convert(T::Type{MX_handle}, x::NDArray) = Base.unsafe_convert(T, x) -Base.cconvert(T::Type{MX_handle}, x::NDArray) = Base.unsafe_convert(T, x) - -MX_handle(x::NDArray) = Base.convert(MX_handle, x) - -################################################################################ -# NDArray functions exported to the users -################################################################################ -""" - context(x::NDArray) - -Get the context that this `NDArray` lives on. -""" -function context(x::NDArray) - ref_typeid = Ref{Cint}(0) - ref_devid = Ref{Cint}(0) - @mxcall(:MXNDArrayGetContext, (MX_handle, Ref{Cint}, Ref{Cint}), - x, ref_typeid, ref_devid) - Context(ref_typeid[], ref_devid[]) -end - -""" - empty(DType, dims[, ctx::Context = cpu()]) - empty(DType, dims) - empty(DType, dim1, dim2, ...) - -Allocate memory for an uninitialized `NDArray` with a specified type. -""" -empty(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} = - NDArray{T,N}(_ndarray_alloc(T, dims, ctx, false)) -empty(::Type{T}, dims::Int...) where {T<:DType} = empty(T, dims) - -""" - empty(dims::Tuple[, ctx::Context = cpu()]) - empty(dim1, dim2, ...) - -Allocate memory for an uninitialized `NDArray` with specific shape of type Float32. -""" -empty(dims::NTuple{N,Int}, ctx::Context = cpu()) where N = - NDArray(_ndarray_alloc(dims, ctx, false)) -empty(dims::Int...) = empty(dims) - -""" - similar(x::NDArray) - -Create an `NDArray` with similar shape, data type, -and context with the given one. -Note that the returned `NDArray` is uninitialized. -""" -Base.similar(x::NDArray{T}) where {T} = empty(T, size(x), context(x)) - -""" - zeros([DType], dims, [ctx::Context = cpu()]) - zeros([DType], dims...) - zeros(x::NDArray) - -Create zero-ed `NDArray` with specific shape and type. -""" -function zeros(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} - x = empty(T, dims, ctx) - x[:] = zero(T) - x -end - -zeros(::Type{T}, dims::Int...) where {T<:DType} = zeros(T, dims) - -zeros(dims::NTuple{N,Int}, ctx::Context = cpu()) where N = - zeros(MX_float, dims, ctx) -zeros(dims::Int...) = zeros(dims) - -zeros(x::NDArray)::typeof(x) = zeros_like(x) -Base.zeros(x::NDArray)::typeof(x) = zeros_like(x) - -""" - ones([DType], dims, [ctx::Context = cpu()]) - ones([DType], dims...) 
- ones(x::NDArray) - -Create an `NDArray` with specific shape & type, and initialize with 1. -""" -function ones(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType} - arr = empty(T, dims, ctx) - arr[:] = one(T) - arr -end - -ones(::Type{T}, dims::Int...) where T<:DType = ones(T, dims) - -ones(dims::NTuple{N,Int}, ctx::Context = cpu()) where N = - ones(MX_float, dims, ctx) -ones(dims::Int...) = ones(dims) - -ones(x::NDArray)::typeof(x) = ones_like(x) -Base.ones(x::NDArray)::typeof(x) = ones_like(x) - -import Base: length, ndims - -""" - size(x::NDArray) - size(x::NDArray, dims) - -Get the shape of an `NDArray`. The shape is in Julia's column-major convention. -See also the notes on NDArray shapes [`NDArray`](@ref). -""" -function Base.size(x::NDArray) - ref_ndim = Ref{MX_uint}(0) - ref_shape = Ref{Ptr{MX_uint}}(0) - @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}), - x, ref_ndim, ref_shape) - tuple(map(Int, reverse(unsafe_wrap(Array, ref_shape[], ref_ndim[])))...) -end - -Base.size(x::NDArray{T,N}, dims::Integer) where {T,N} = (dims > N) ? 1 : size(x)[dims] - -""" - length(x::NDArray) - -Get the number of elements in an `NDArray`. -""" -length(x::NDArray) = prod(size(x)) - -""" - ndims(x::NDArray) - -Get the number of dimensions of an `NDArray`. -Is equivalent to `length(size(arr))`. -""" -ndims(x::NDArray) = ndims(x.handle) - -function ndims(x::MX_NDArrayHandle)::Int - ref_ndim = Ref{MX_uint}(0) - ref_shape = Ref{Ptr{MX_uint}}(0) - @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}), - x, ref_ndim, ref_shape) - ref_ndim[] -end - -""" - eltype(x::NDArray) - -Get the element type of an `NDArray`. -""" -function Base.eltype(x::Union{NDArray,MX_NDArrayHandle}) - dtype_ref = Ref{Cint}(0) - @mxcall(:MXNDArrayGetDType, (MX_handle, Ptr{Cint}), x, dtype_ref) - - if dtype_ref[] == -1 # x->is_none() - # TODO: unit test for this branch - throw(MXError("Eltype of $x is not defined")) - end - - fromTypeFlag(TypeFlag(dtype_ref[])) -end - -@inline _first(x::NDArray) = try_get_shared(x, sync = :read) |> first - -Base.first(x::NDArray) = _first(x) - -Base.lastindex(x::NDArray) = length(x) - -""" - slice(arr :: NDArray, start:stop) - -Create a view into a sub-slice of an `NDArray`. Note only slicing at the slowest -changing dimension is supported. In Julia's column-major perspective, this is the last -dimension. For example, given an `NDArray` of shape (2,3,4), `slice(array, 2:3)` will create -a `NDArray` of shape (2,3,2), sharing the data with the original array. This operation is -used in data parallelization to split mini-batch into sub-batches for different devices. -""" -function slice(arr::NDArray, ::Colon) - arr -end -function slice(arr::NDArray, slice::UnitRange{Int}) - dim1 = size(arr)[end] - @assert(1 <= slice.start <= slice.stop <= dim1) - if slice.start == 1 && slice.stop == dim1 - return arr - end - - hdr_ref = Ref{MX_handle}(0) - # note Julia is 1-based, inclusive-inclusive indexing, while C++ is - # 0-based, inclusive-exclusive indexing. So 1:3 in Julia should - # translates into 0:3 in C++. - @mxcall(:MXNDArraySlice, (MX_handle, MX_uint, MX_uint, Ref{MX_handle}), - arr, slice.start-1, slice.stop, hdr_ref) - return NDArray(MX_NDArrayHandle(hdr_ref[]), arr.writable) -end - -function _at(handle::Union{MX_NDArrayHandle, MX_handle}, idx::Integer) - h_ref = Ref{MX_handle}(C_NULL) - @mxcall(:MXNDArrayAt, (MX_handle, MX_uint, Ref{MX_handle}), - handle, idx, h_ref) - h_ref[] -end - -import Base: setindex! 
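The slicing semantics documented above differ from Julia's copying `getindex`: a slice is a view over the slowest-changing dimension that shares memory with its parent. A small sketch of that documented behavior (shapes illustrative):

```julia
using MXNet

x = mx.zeros(2, 3)      # column-major Julia-side shape
y = mx.slice(x, 2:3)    # (2, 2) view over the last dimension; shares memory
y[:] = 1.0              # writes through to columns 2 and 3 of x
copy(x)                 # 2×3 Julia Array: first column 0.0, the rest 1.0
```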
- -""" - setindex!(arr::NDArray, val, idx) - -Assign values to an `NDArray`. -The following scenarios are supported - -* single value assignment via linear indexing: `arr[42] = 24` - -* `arr[:] = val`: whole array assignment, `val` could be a scalar or an array (Julia `Array` - or `NDArray`) of the same shape. -* `arr[start:stop] = val`: assignment to a *slice*, `val` could be a scalar or an array of - the same shape to the slice. See also [`slice`](@ref). -""" -function setindex!(arr::NDArray, val::Real, idx::Integer) - # linear indexing - @assert arr.writable - _set_value(out=arr[idx], src=val) -end - -function setindex!(arr::NDArray, val::Real, ::Colon) - @assert arr.writable - _set_value(out = arr, src = dump_mx_param(val)) -end - -function setindex!(arr::NDArray, val::Array{T}, ::Colon) where T<:Real - @assert arr.writable - copy!(arr, val) -end - -function setindex!(arr::NDArray, val::NDArray, ::Colon) - @assert arr.writable - copy!(arr, val) -end - -function setindex!(arr::NDArray, val::Union{T,Array{T},NDArray}, - idx::UnitRange{Int}) where T<:Real - @assert arr.writable - setindex!(slice(arr, idx), val, Colon()) -end - -import Base: getindex -""" - getindex(arr::NDArray, idx) - -Shortcut for [`slice`](@ref). A typical use is to write - -```julia - arr[:] += 5 -``` - -which translates into - -```julia - arr[:] = arr[:] + 5 -``` - -which furthur translates into - -```julia - setindex!(getindex(arr, Colon()), 5, Colon()) -``` - -!!! note - The behavior is quite different from indexing into Julia's `Array`. For example, `arr[2:5]` - create a **copy** of the sub-array for Julia `Array`, while for `NDArray`, this is - a *slice* that shares the memory. -""" -getindex(arr::NDArray, ::Colon) = arr - -""" -Shortcut for [`slice`](@ref). -**NOTE** the behavior for Julia's built-in index slicing is to create a -copy of the sub-array, while here we simply call `slice`, -which shares the underlying memory. -""" -getindex(arr::NDArray, idx::UnitRange{Int}) = slice(arr, idx) - -getindex(arr::NDArray) = _first(arr) - -function getindex(arr::NDArray, idx::Integer) - # linear indexing - len = length(arr) - size_ = size(arr) - - if idx <= 0 || idx > len - throw(BoundsError( - "attempt to access $(join(size_, 'x')) NDArray at index $(idx)")) - end - - idx -= 1 - offsets = size_[1:end-1] |> reverse ∘ cumprod ∘ collect - handle = arr.handle - for offset ∈ offsets - handle = _at(handle, idx ÷ offset) - idx %= offset - end - - _at(handle, idx) |> MX_NDArrayHandle |> x -> NDArray(x, arr.writable) -end - -import Base: copy!, copy, convert, deepcopy - -""" - copy!(dst::Union{NDArray, Array}, src::Union{NDArray, Array}) - -Copy contents of `src` into `dst`. 
-""" -function copy!(dst::NDArray, src::NDArray) - @assert(dst.writable) - if dst.handle == src.handle - @warn("Copying an NDArray to itself") - return - end - - _copyto(src, out=dst) - return dst -end - -function copy!(dst::Array{T}, src::NDArray{T}) where T<:DType - @assert size(dst) == size(src) - @mxcall(:MXNDArraySyncCopyToCPU, (MX_handle, Ptr{Cvoid}, Csize_t), - src, pointer(dst), length(dst)) - dst -end - -copy!(dst::Array{<:Real}, src::NDArray) = copy!(dst, copy(src)) -copy!(dst::NDArray, src::AbstractArray) = copy!(dst, collect(src)) - -function copy!(dst::NDArray{T}, src::Array{<:Real}) where {T} - @assert dst.writable - @assert size(dst) == size(src) - src = convert(Array{T}, src) # this might involve copying - @mxcall(:MXNDArraySyncCopyFromCPU, (MX_handle, Ptr{Cvoid}, Csize_t), - dst.handle, pointer(src), length(src)) - dst -end - -function copy_ignore_shape!(dst::NDArray{T}, src::Array{<:Real}) where {T} - @assert dst.writable - @assert length(dst) == length(src) - src = convert(Array{T}, src) # this might involve copying - @mxcall(:MXNDArraySyncCopyFromCPU, (MX_handle, Ptr{Cvoid}, Csize_t), - dst.handle, pointer(src), length(src)) - dst -end - - -""" - copy(arr :: NDArray) - copy(arr :: NDArray, ctx :: Context) - copy(arr :: Array, ctx :: Context) - -Create a copy of an array. When no `Context` is given, create a Julia `Array`. -Otherwise, create an `NDArray` on the specified context. -""" -copy - -# Create copy: NDArray -> Julia Array -copy(x::NDArray{T,D}) where{T,D} = copy!(Array{T,D}(undef, size(x)), x) - -# Create copy: NDArray -> NDArray in a given context -copy(x::NDArray{T,D}, ctx::Context) where {T,D} = - copy!(NDArray{T,D}(_ndarray_alloc(T, size(x), ctx, true)), x) - -# Create copy: Julia Array -> NDArray in a given context -copy(x::Array{T}, ctx::Context) where {T<:DType} = - copy!(empty(T, size(x), ctx), x) - -copy(x::AbstractArray, ctx::Context) = - copy!(empty(eltype(x), size(x), ctx), collect(x)) - -""" - convert(::Type{Array{<:Real}}, x::NDArray) - -Convert an `NDArray` into a Julia `Array` of specific type. -Data will be copied. -""" -convert(T::Type{Array{<:Real}}, x::NDArray) = convert(T, copy(x)) - -""" - deepcopy(arr::NDArray) - -Get a deep copy of the data blob in the form of an NDArray of default storage -type. This function blocks. Do not use it in performance critical code. -""" -function deepcopy(arr::NDArray) - out_ref = Ref{MX_handle}(C_NULL) - @mxcall(:MXNDArrayGetDataNDArray, (MX_handle, Ref{MX_handle}), arr, out_ref) - NDArray(MX_NDArrayHandle(out_ref[])) -end - -""" - hcat(x::NDArray...) -""" -Base.hcat(xs::NDArray{T}...) where T = cat(xs..., dims = 2) - -""" - vcat(x::NDArray...) -""" -Base.vcat(xs::NDArray{T}...) where T = cat(xs..., dims = 1) - -""" - cat(xs::NDArray...; dims) - -Concate the `NDArray`s which have the same element type along the `dims`. -Building a diagonal matrix is not supported yet. -""" -function Base.cat(xs::NDArray{T}...; dims) where T - ns = ndims.(xs) - d = Base.max(dims, maximum(ns)) - xs′ = map(zip(ns, xs)) do i - n, x = i - (d > n) ? reshape(x, -2, Base.ones(Int, d - n)...) : x - end - concat(xs′..., dim = d - dims) -end - -""" - @inplace - -Julia does not support re-definiton of `+=` operator (like `__iadd__` in python), -When one write `a += b`, it gets translated to `a = a+b`. `a+b` will allocate new -memory for the results, and the newly allocated `NDArray` object is then assigned -back to a, while the original contents in a is discarded. This is very inefficient -when we want to do inplace update. 
- -This macro is a simple utility to implement this behavior. Write - -```julia - @mx.inplace a += b -``` - -will translate into - -```julia - mx.add_to!(a, b) -``` - -which will do inplace adding of the contents of `b` into `a`. -""" -macro inplace(ex) - f = if ex.head == :+= || ex.head == :.+= - :add_to! - elseif ex.head == :-= || ex.head == :.-= - :sub_from! - elseif ex.head == :.*= - :mul_to! - elseif ex.head == :./= - :div_from! - elseif ex.head == :.%= - :mod_from! - else - error("unsupported inplace translation for $ex") - end - Expr(:call, f, esc(ex.args[1]), esc(ex.args[2])) -end - -""" - add_to!(dst::NDArray, args::NDArrayOrReal...) - -Add a bunch of arguments into `dst`. Inplace updating. -""" -function add_to!(dst::NDArray, args::NDArrayOrReal...) - @assert dst.writable - for arg in args - if isa(arg, Real) - _plus_scalar(dst, scalar = arg, out = dst) - else - _plus!(dst, arg) - end - end - dst -end - -import Base: + - -""" - +(args...) - .+(args...) - -Summation. Multiple arguments of either scalar or `NDArray` could be -added together. Note at least the first or second argument needs to be an -`NDArray` to avoid ambiguity of built-in summation. -""" -+(x::NDArray) = x -+(x::NDArray, y::NDArray) = _plus(x, y) -+(x::NDArray, y::Real) = _plus_scalar(x, scalar = y) -+(y::Real, x::NDArray) = _plus_scalar(x, scalar = y) - -broadcasted(::typeof(+), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_add(x, y) - -""" - sub_from!(dst::NDArray, args::NDArrayOrReal...) - -Subtract a bunch of arguments from `dst`. Inplace updating. -""" -function sub_from!(dst::NDArray, arg::NDArrayOrReal) - @assert dst.writable - if isa(arg, Real) - _minus_scalar(dst, scalar = arg, out = dst) - else - _minus!(dst, arg) - end - dst -end - -import Base: - - -""" - -(x::NDArray) - -(x, y) - .-(x, y) - -Subtraction `x - y`, of scalar types or `NDArray`. -Or create the negative of `x`. -""" --(x::NDArray) = _mul_scalar(x, scalar = -one(eltype(x))) --(x::NDArray, y::NDArray) = _minus(x, y) --(x::NDArray, y::Real) = _minus_scalar(x, scalar = y) --(y::Real, x::NDArray) = _rminus_scalar(x, scalar = y) - -broadcasted(::typeof(-), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_minus(x, y) - -""" - mul_to!(dst::NDArray, arg::NDArrayOrReal) - -Elementwise multiplication into `dst` of either a scalar or an `NDArray` of the same shape. -Inplace updating. -""" -function mul_to!(dst::NDArray, arg::NDArrayOrReal) - @assert dst.writable - if isa(arg, Real) - _mul_scalar(dst, scalar = arg, out = dst) - else - _mul(dst, arg, out = dst) - end - dst -end - -import Base: * - -""" - .*(x, y) - -Elementwise multiplication for `NDArray`. -""" -*(x::NDArray, y::Real) = _mul_scalar(x, scalar = y) -*(y::Real, x::NDArray) = _mul_scalar(x, scalar = y) - -broadcasted(::typeof(*), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} = - _mul(x, y) -broadcasted(::typeof(*), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_mul(x, y) - -""" - *(A::NDArray, B::NDArray) - -Matrix/tensor multiplication. -""" -*(x::NDArray{T}, y::NDArray{T}) where T = x ⋅ y - -LinearAlgebra.adjoint(x::NDArray{T,1}) where T = transpose(x) -LinearAlgebra.adjoint(x::NDArray{T,2}) where T = transpose(x) - -""" - div_from!(dst::NDArray, arg::NDArrayOrReal) - -Elementwise divide a scalar or an `NDArray` of the same shape from `dst`. Inplace updating. 
-""" -function div_from!(dst::NDArray, arg::NDArrayOrReal) - @assert dst.writable - if isa(arg, Real) - _div_scalar(dst, scalar = arg, out = dst) - else - _div(dst, arg, out = dst) - end - dst -end - -function div_from!(dst::NDArray{T}, arg::Real) where {T<:Integer} - @assert dst.writable - @assert(round(T, arg) != zero(T), "Integer divided by zero") - _div_scalar(dst, scalar = arg, out = dst) - dst -end - -""" - rdiv_from!(x:: Real, y::NDArray) - -Elementwise divide a scalar by an `NDArray`. Inplace updating. -""" -function rdiv_from!(x::Real, y::NDArray) - @assert y.writable - _rdiv_scalar(y, scalar = x, out = y) - y -end - -import Base: / - -""" - ./(x::NDArray, y::NDArray) - ./(x::NDArray, y::Real) - ./(x::Real, y::NDArray) - -* Elementwise dividing an `NDArray` by a scalar or another `NDArray` -of the same shape. - -* Elementwise divide a scalar by an `NDArray`. - -* Matrix division (solving linear systems) is not implemented yet. -""" -/(x::NDArray, y::Real) = _div_scalar(x, scalar = y) - -broadcasted(::typeof(/), y::Real, x::NDArray) = _rdiv_scalar(x, scalar = y) -broadcasted(::typeof(/), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} = - _div(x, y) -broadcasted(::typeof(/), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_div(x, y) - -function broadcasted(::typeof(/), x::NDArray{T}, y::Real) where {T<:Integer} - @assert(round(T, y) != zero(T), "Integer divided by zero") - _div_scalar(x, scalar = y) -end - -""" - mod_from!(x::NDArray, y::NDArray) - mod_from!(x::NDArray, y::Real) - -Elementwise modulo for `NDArray`. -Inplace updating. -""" -mod_from!(x::NDArray, y::NDArray) = _mod!(x, y) -mod_from!(x::NDArray, y::Real) = _mod_scalar!(x, y) - -""" - rmod_from!(y::Real, x::NDArray) - -Elementwise modulo for `NDArray`. -Inplace updating. -""" -rmod_from!(y::Real, x::NDArray) = _rmod_scalar!(x, y) - -import Base: % - -""" - .%(x::NDArray, y::NDArray) - .%(x::NDArray, y::Real) - .%(x::Real, y::NDArray) - -Elementwise modulo for `NDArray`. 
-""" -%(x::NDArray, y::Real) = _mod_scalar(x, y) - -broadcasted(::typeof(%), y::Real, x::NDArray) = _rmod_scalar(x, y) -broadcasted(::typeof(%), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} = - _mod(x, y) -broadcasted(::typeof(%), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_mod(x, y) - -# document of `.^` is merged into SymbolicNode's - -broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::NDArray, ::Val{s}) where {s} = - _power_scalar(x, scalar = s) -broadcasted(::typeof(^), x::NDArray, s::Real) = _power_scalar(x, scalar = s) -broadcasted(::typeof(^), s::Real, x::NDArray) = _rpower_scalar(x, scalar = s) - -broadcasted(::typeof(^), ::Irrational{:ℯ}, x::NDArray) = exp(x) -broadcasted(::typeof(^), x::NDArray, s::Irrational) = _power_scalar(x, scalar = s) -broadcasted(::typeof(^), s::Irrational, x::NDArray) = _rpower_scalar(x, scalar = s) - -broadcasted(::typeof(^), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} = - _power(x, y) -broadcasted(::typeof(^), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} = - _broadcast_power(x, y) - -############################################################################### -# comparison -############################################################################### - -broadcasted(::typeof(==), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_equal(x, y) - -broadcasted(::typeof(!=), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_not_equal(x, y) - -broadcasted(::typeof(>), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_greater(x, y) - -broadcasted(::typeof(>=), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_greater_equal(x, y) - -broadcasted(::typeof(<), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_lesser(x, y) - -broadcasted(::typeof(<=), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_lesser_equal(x, y) - - -############################################################################### -# min/max -############################################################################### - -import Base: min, max - -broadcasted(::typeof(max), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_maximum(x, y) - -broadcasted(::typeof(min), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_minimum(x, y) - -""" - fill!(arr::NDArray, x) - -Create an `NDArray` filled with the value `x`, like `Base.fill!`. -""" -function Base.fill!(arr::NDArray, x) - arr[:] = x - arr -end - -""" - fill(x, dims, ctx=cpu()) - fill(x, dims...) - -Create an `NDArray` filled with the value `x`, like `Base.fill`. -""" -function fill(x, dims::NTuple{N,Integer}, ctx::Context=cpu()) where N - arr = empty(typeof(x), dims, ctx) - arr[:] = x - arr -end - -fill(x, dims::Integer...) = fill(x, dims) - -import Base: hypot - -broadcasted(::typeof(hypot), x::NDArray{T}, y::NDArray{T}) where {T} = - _broadcast_hypot(x, y) - -""" -Manipulating as Julia Arrays ----------------------------- - - @nd_as_jl(captures..., statement) - -A convenient macro that allows to operate `NDArray` as Julia Arrays. For example, - -```julia - x = mx.zeros(3,4) - y = mx.ones(3,4) - z = mx.zeros((3,4), mx.gpu()) - - @mx.nd_as_jl ro=(x,y) rw=z begin - # now x, y, z are just ordinary Julia Arrays - z[:,1] = y[:,2] - z[:,2] = 5 - end -``` - -Under the hood, the macro convert all the declared captures from `NDArray` into Julia -Arrays, by using `try_get_shared`. And automatically commit the modifications back into -the `NDArray` that is declared as `rw`. 
This is useful for fast prototyping and when -implement non-critical computations, such as `AbstractEvalMetric`. - -!!! note -* Multiple `rw` and / or `ro` capture declaration could be made. -* The macro does **not** check to make sure that `ro` captures are not modified. If the - original `NDArray` lives in CPU memory, then it is very likely the corresponding - Julia Array shares data with the `NDArray`, so modifying the Julia Array will also - modify the underlying `NDArray`. -* More importantly, since the `NDArray` is - asynchronized, we will wait for *writing* for `rw` variables but wait only for *reading* - in `ro` variables. If we write into those `ro` variables, **and** if the memory is - shared, racing condition might happen, and the behavior is undefined. -* When an `NDArray` is declared to be captured as `rw`, its contents is always sync - back in the end. -* The execution results of the expanded macro is always `nothing`. -* The statements are wrapped in a `let`, thus locally introduced new variables will not be - available after the statements. So you will need to declare the variables before calling the - macro if needed. -""" -macro nd_as_jl(m_args...) - @assert(length(m_args) > 0) - stmts = m_args[end] - @assert(isa(stmts, Expr) && stmts.head == :block, - "The last argument should be a statement block (begin-end); but get $stmts") - stmts = esc(stmts) - - dclrs = m_args[1:end-1] - nd_ro = [] - nd_rw = [] - nd_all = [] - for declr in dclrs - @assert(isa(declr, Expr) && declr.head == :(=) && length(declr.args)==2 && declr.args[1] ∈ (:ro,:rw), - "Invalid declaration, should be rw=(x,y) or ro=z; but get $declr") - - declr_vars = declr.args[2] - if isa(declr_vars, Symbol) - declr_vars = (declr_vars,) - elseif isa(declr_vars, Expr) - @assert(declr_vars.head ∈ (:tuple, :vect), - "Capture declaration should be a variable or a tuple of variables; but got $declr_vars") - declr_vars = declr_vars.args - else - @assert(false, "Capture declaration should be a variable or a tuple of variables; but got $declr_vars") - end - for declr_var in declr_vars - @assert(isa(declr_var, Symbol), - "Captured ndarrays in ro/rw declaration should be variables, but get $(declr_var)") - end - append!(nd_all, [declr_vars...]) - if declr.args[1] == :ro - append!(nd_ro, [declr_vars...]) - else - append!(nd_rw, [declr_vars...]) - end - end - - nd_ro = map(esc, nd_ro) - nd_rw = map(esc, nd_rw) - nd_all = map(esc, nd_all) - rw_origs = [gensym() for _ in nd_rw] - - save_statements = Expr(:block, [:($v_orig = $v) for (v_orig, v) in zip(rw_origs, nd_rw)]...) - wait_statements = Expr(:block, [:(_wait_to_read($v)) for v in nd_ro]..., - [:(_wait_to_write($v)) for v in nd_rw]...) - clear_statements = Expr(:block, [:($v_orig = nothing) for v_orig in rw_origs]...) - let_assignments = Expr(:block, [:($v = try_get_shared($v)) for v in nd_all]...) - sync_statements = map(rw_origs, nd_rw) do v_orig, v - quote - if !is_shared($v, $v_orig) - # copy data back if not or no longer sharing data - copy!($v_orig, $v) - end - end - end - sync_statements = Expr(:block, sync_statements...) - - let_statement = Expr(:let, let_assignments, quote - $stmts - $sync_statements - end) - m_body = quote - $wait_statements - $save_statements - $let_statement - $clear_statements - nothing # the final results is always nothing - end - - m_body -end - -# NOTE: internal use only. Accessing pointers on a different device (e.g. accessing GPU -# pointers from CPU) leads to undefined behavior. 
-import Base.pointer -function pointer(arr :: NDArray) - pdata = Ref{Ptr{Cvoid}}(0) - @mxcall(:MXNDArrayGetData, (MX_handle, Ref{Ptr{Cvoid}}), arr, pdata) - return convert(Ptr{eltype(arr)}, pdata[]) -end - -@inline _wait_to_read(arr :: NDArray) = - @mxcall(:MXNDArrayWaitToRead, (MX_handle,), arr) -@inline _wait_to_write(arr :: NDArray) = - @mxcall(:MXNDArrayWaitToWrite, (MX_handle,), arr) - -""" - try_get_shared(arr; sync=:nop) - -Try to create a Julia array by sharing the data with the underlying `NDArray`. - -# Arguments: - -* `arr::NDArray`: the array to be shared. - -!!! note - The returned array does not guarantee to share data with the underlying `NDArray`. - In particular, data sharing is possible only when the `NDArray` lives on CPU. - -* `sync::Symbol`: `:nop`,`:write`, `:read` - On CPU, invoke `_wait_to_read` if `:read`; - invoke `_wait_to_write` if `:write`. -""" -function try_get_shared(x::NDArray; sync::Symbol=:nop) - if context(x).device_type == CPU - # try to do data sharing - if sync == :read - _wait_to_read(x) - elseif sync == :write - _wait_to_write(x) - end - - unsafe_wrap(Array, pointer(x), size(x)) - else - # impossible to share, just copying - copy(x) - end -end - -""" - is_shared(j_arr, arr) - -Test whether `j_arr` is sharing data with `arr`. - -# Arguments: - -* `j_arr::Array`: the Julia Array. -* `arr::NDArray`: the `NDArray`. -""" -is_shared(::Array, ::NDArray) = false - -function is_shared(j_arr::Array{T}, arr::NDArray{T}) where {T<:DType} - if length(j_arr) != length(arr) - return false - end - if context(arr).device_type != CPU - return false - end - pointer(j_arr) == pointer(arr) -end - -""" - load(filename, ::Type{NDArray}) - -Load NDArrays from binary file. - -# Arguments: -* `filename::String`: the path of the file to load. It could be S3 or HDFS address. - -Returns either `Dict{Symbol, NDArray}` or `Vector{NDArray}`. - -`filename` can point to `s3` or `hdfs` resources if the `libmxnet` is built with the -corresponding components enabled. Examples: -* `s3://my-bucket/path/my-s3-ndarray` -* `hdfs://my-bucket/path/my-hdfs-ndarray` -* `/path-to/my-local-ndarray` -""" -function load(filename::AbstractString, ::Type{<:NDArray}) - out_size = Ref{MX_uint}(0) - out_hdrs = Ref{Ptr{MX_handle}}(0) - out_name_size = Ref{MX_uint}(0) - out_names = Ref{char_pp}(0) - @mxcall(:MXNDArrayLoad, (char_p, Ref{MX_uint}, Ref{Ptr{MX_handle}}, Ref{MX_uint}, Ref{char_pp}), - filename, out_size, out_hdrs, out_name_size, out_names) - out_name_size = out_name_size[] - out_size = out_size[] - if out_name_size == 0 - return [NDArray(MX_NDArrayHandle(hdr)) for hdr in unsafe_wrap(Array, out_hdrs[], out_size)] - else - @assert out_size == out_name_size - return Dict([(Symbol(unsafe_string(k)), NDArray(MX_NDArrayHandle(hdr))) for (k,hdr) in - zip(unsafe_wrap(Array, out_names[], out_size), unsafe_wrap(Array, out_hdrs[], out_size))]) - end -end - -""" - save(filename::AbstractString, data) - -Save NDarrays to binary file. Filename could be S3 or HDFS address, if `libmxnet` is built -with corresponding support (see `load`). - -* `filename::String`: path to the binary file to write to. -* `data`: data to save to file. Data can be a`NDArray`, a `Vector` of `NDArray`, - or a `Dict{Symbol}` contains `NDArray`s. 
-""" -save(filename::String, data::NDArray) = save(filename, [data]) - -save(filename::String, data::VecOfNDArray) = - @mxcall(:MXNDArraySave, (char_p, MX_uint, Ptr{MX_handle}, char_pp), - filename, length(data), MX_handle[data...], char_pp(0)) - -function save(filename::String, data::Dict{Symbol}) - names = keys(data) - arrays = MX_handle.(collect(values(data))) - names = String.(collect(names)) - - @mxcall(:MXNDArraySave, (char_p, MX_uint, Ptr{MX_handle}, char_pp), - filename, length(names), arrays, names) -end - -################################################################################ -# Mapping NDArray functions to Base-like API -################################################################################ - -const _ndsig = Dict{Symbol,Expr}() -const _nddoc = Dict{Symbol,Any}() - -_isinplace(name::Symbol) = endswith(string(name), "!") - -_writable(name::Symbol, x) = - _isinplace(name) ? :(@assert $x.writable "this NDArray isn't writable") : :() - -function _outexpr(name::Symbol, x #= the first arg of `sig` =#) - if _isinplace(name) # `func!` - Ptr, 1, :([[MX_handle(x.handle)]]), :($x) - else - retexpr = :(NDArray(MX_NDArrayHandle(unsafe_load(hdls_ref[], 1)))) - Ref, 0, :(Ref{Ptr{MX_handle}}(C_NULL)), retexpr - end -end - -_broadcast_target(sig::Expr) = sig.args[2].args[].args[end] - -""" -Generate docstring from function signature -""" -function _docsig(fname::Symbol, sig::Expr, opname::String) - if fname !== :broadcasted - get(_nddoc, fname, " $sig") * "\n" * _getdocdefine(opname) - else - name = _broadcast_target(sig) - str = get(_nddoc, name, "") - _nddoc[name] = false # change to false, denote docstring has been set up - if isempty(str) - sig_ = Expr(:call, Symbol(name, "."), sig.args[3:end]...) - str = " $sig_" - end - if str ≠ false - # append "Defined in ..." - def = _getdocdefine(opname) - str = if str isa Markdown.MD - str = Markdown.MD(copy(str.content), copy(str.meta)) - push!(str, Markdown.Paragraph(def)) - str - else - str * def - end - - @eval @doc $str $name - end - "" - end -end - - -macro _remap(sig::Expr, imp::Expr) - d = splitdef(:($sig = $imp)) - @capture d[:name] (M_.fname_|fname_) - - opname = string(imp.args[1]) - - if isa(imp.args[2], Expr) && imp.args[2].head == :parameters - ndin = imp.args[3:end] - mxargs = imp.args[2].args - else # no keyword arguments - ndin = imp.args[2:end] - mxargs = [] - end - - mxkeys = map(x -> string(x.args[1]), mxargs) - mxvals = Expr(:vect, map(x -> :(dump_mx_param($(x.args[2]))), mxargs)...) - ndhlds = Expr(:vect, map(x -> :($(x).handle), ndin)...) - - # handler for `func!` which has side effect on first argument. 
- T, n_output, hdls_ref, retexpr = _outexpr(fname, _firstarg(sig)) - - assert_expr = _writable(fname, _firstarg(sig)) - - func_body = quote - $assert_expr - op_handle = _get_cached_libmx_op_handle($opname) - n_output = Ref(Cint($n_output)) - hdls_ref = $hdls_ref - @mxcall(:MXImperativeInvoke, - (MX_handle, - Cint, - Ptr{MX_handle}, - Ref{Cint}, - $T{Ptr{MX_handle}}, - Cint, - char_pp, - char_pp), - op_handle, - $(length(ndin)), - $(ndhlds), - n_output, - hdls_ref, - $(length(mxargs)), - $mxkeys, - $mxvals) - $retexpr - end - - docstr = _docsig(fname, sig, opname) - func_def = Expr(:function, sig, func_body) - - esc(quote - @doc $docstr - $func_def - end) -end - -macro _remap(sig::Expr, imp::Symbol) - imp = _ndsig[imp] - - esc(quote - @_remap($sig, $imp) - end) -end - -_ndsig[:reshape] = :(reshape(x; shape = dim, reverse = !reverse)) -@_remap Base.reshape(x::NDArray, dim...; reverse = false) reshape -@_remap Base.reshape(x::NDArray, dim ; reverse = false) reshape - -Statistics.mean(x::NDArray; dims = :) = _mean(x, dims) -@_remap _mean(x::NDArray, ::Colon) mean(x) -@_remap _mean(x::NDArray, dims) mean(x; axis = 0 .- dims, keepdims = true) - -Base.sum(x::NDArray; dims = :) = _sum(x, dims) -@_remap _sum(x::NDArray, ::Colon) sum(x) -@_remap _sum(x::NDArray, dims) sum(x; axis = 0 .- dims, keepdims = true) - -Base.maximum(x::NDArray; dims = :) = _nd_maximum(x, dims) -@_remap _nd_maximum(x::NDArray, ::Colon) max(x) -@_remap _nd_maximum(x::NDArray, dims) max(x; axis = 0 .- dims, keepdims = true) - -Base.minimum(x::NDArray; dims = :) = _nd_minimum(x, dims) -@_remap _nd_minimum(x::NDArray, ::Colon) min(x) -@_remap _nd_minimum(x::NDArray, dims) min(x; axis = 0 .- dims, keepdims = true) - -# See /~https://github.com/dmlc/MXNet.jl/issues/55 -@_remap LinearAlgebra.dot(x::NDArray, y::NDArray) dot(y, x) - -# See /~https://github.com/dmlc/MXNet.jl/pull/123 -@_remap Base.transpose(x::NDArray{T,1}) where T reshape(x; shape = (1, length(x)), reverse = true) -@_remap Base.transpose(x::NDArray{T,2}) where T transpose(x) -@_remap Base.permutedims(x::NDArray, axes) transpose(x; axes = length(axes) .- tuple(axes...)) - -Base.prod(x::NDArray; dims = :) = _prod(x, dims) -@_remap _prod(x::NDArray, ::Colon) prod(x) -@_remap _prod(x::NDArray, dims) prod(x; axis = 0 .- dims, keepdims = true) - -_nddoc[:clip] = _nddoc[:clip!] = -""" - clip(x::NDArray, min, max) - clip!(x::NDArray, min, max) - -Clips (limits) the values in `NDArray`. -Given an interval, values outside the interval are clipped to the interval edges. -Clipping `x` between `min` and `x` would be: - -```julia -clip(x, min_, max_) = max(min(x, max_), min_)) -``` - -```jldoctest -julia> x = NDArray(1:9); - -julia> mx.clip(x, 2, 8)' -1×9 mx.NDArray{Int64,2} @ CPU0: - 2 2 3 4 5 6 7 8 8 -``` - -The storage type of clip output depends on storage types of inputs and the -`min`, `max` parameter values: - -- clip(default) = default -- clip(row_sparse, min <= 0, max >= 0) = row_sparse -- clip(csr, min <= 0, max >= 0) = csr -- clip(row_sparse, min < 0, max < 0) = default -- clip(row_sparse, min > 0, max > 0) = default -- clip(csr, min < 0, max < 0) = csr -- clip(csr, min > 0, max > 0) = csr -""" -@_remap clip(x::NDArray, min::Real, max::Real) clip(x; a_min = min, a_max = max) -@_remap clip!(x::NDArray, min::Real, max::Real) clip(x; a_min = min, a_max = max) - -_nddoc[:expand_dims] = -""" - expand_dims(x::NDArray, dim) - -Insert a new axis into `dim`. 
- -```julia -julia> x -4 mx.NDArray{Float64,1} @ CPU0: - 1.0 - 2.0 - 3.0 - 4.0 - -julia> mx.expand_dims(x, 1) -1×4 mx.NDArray{Float64,2} @ CPU0: - 1.0 2.0 3.0 4.0 - -julia> mx.expand_dims(x, 2) -4×1 mx.NDArray{Float64,2} @ CPU0: - 1.0 - 2.0 - 3.0 - 4.0 -``` -""" -@_remap expand_dims(x::NDArray, dim) expand_dims(x; axis = -dim) - -# trigonometric functions, remap to keep consistent with Base -@_remap broadcasted(::typeof(sin), x::NDArray) sin(x) -@_remap broadcasted(::typeof(cos), x::NDArray) cos(x) -@_remap broadcasted(::typeof(tan), x::NDArray) tan(x) -@_remap broadcasted(::typeof(asin), x::NDArray) arcsin(x) -@_remap broadcasted(::typeof(acos), x::NDArray) arccos(x) -@_remap broadcasted(::typeof(atan), x::NDArray) arctan(x) - -# hyperbolic funcs, remap to keep consistent with Base -@_remap broadcasted(::typeof(sinh), x::NDArray) sinh(x) -@_remap broadcasted(::typeof(cosh), x::NDArray) cosh(x) -@_remap broadcasted(::typeof(tanh), x::NDArray) tanh(x) -@_remap broadcasted(::typeof(asinh), x::NDArray) arcsinh(x) -@_remap broadcasted(::typeof(acosh), x::NDArray) arccosh(x) -@_remap broadcasted(::typeof(atanh), x::NDArray) arctanh(x) - -# activation functions -@doc doc""" - σ.(x::NDArray) - sigmoid.(x::NDArray) - -Computes sigmoid of x element-wise. - -```math -σ(x) = \frac{1}{(1 + exp(-x))} -``` - -The storage type of `sigmoid` output is always dense. -""" -function σ end -const sigmoid = σ -_nddoc[:σ] = false -@_remap broadcasted(::typeof(σ), x::NDArray) sigmoid(x) - -@doc doc""" - relu.(x::NDArray) - -Computes rectified linear. - -```math -\max(x, 0) -``` -""" -function relu end -_nddoc[:relu] = false -@_remap broadcasted(::typeof(relu), x::NDArray) relu(x) - -@doc doc""" - softmax.(x::NDArray, [dim = ndims(x)]) - -Applies the softmax function. - -The resulting array contains elements in the range `(0, 1)` -and the elements along the given axis sum up to 1. - -```math -softmax(\mathbf{z})_j = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}} -``` -""" -function softmax end -_nddoc[:softmax] = false -@_remap broadcasted(::typeof(softmax), x::NDArray) softmax(x; axis = -ndims(x)) -@_remap broadcasted(::typeof(softmax), x::NDArray, dim::Int) softmax(x; axis = -dim) - -""" - log_softmax.(x::NDArray, [dim = ndims(x)]) - -Computes the log softmax of the input. -This is equivalent to computing softmax followed by log. 
- -julia> x -2×3 mx.NDArray{Float64,2} @ CPU0: - 1.0 2.0 0.1 - 0.1 2.0 1.0 - -julia> mx.log_softmax.(x) -2×3 mx.NDArray{Float64,2} @ CPU0: - -1.41703 -0.41703 -2.31703 - -2.31703 -0.41703 -1.41703 -""" -function log_softmax end -_nddoc[:log_softmax] = false -@_remap broadcasted(::typeof(log_softmax), x::NDArray) log_softmax(x; axis = -ndims(x)) -@_remap broadcasted(::typeof(log_softmax), x::NDArray, dim::Int) log_softmax(x; axis = -dim) - -################################################################################ -# remapping to solving type unstablility -################################################################################ - -@_remap _plus(x::NDArray, y::NDArray) _plus(x, y) -@_remap _plus!(x::NDArray, y::NDArray) _plus(x, y) - -@_remap _minus(x::NDArray, y::NDArray) _minus(x, y) -@_remap _minus!(x::NDArray, y::NDArray) _minus(x, y) - -@_remap _mod(x::NDArray, y::NDArray) _mod(x, y) -@_remap _mod!(x::NDArray, y::NDArray) _mod(x, y) - -@_remap _mod_scalar(x::NDArray, y::Real) _mod_scalar(x; scalar = y) -@_remap _mod_scalar!(x::NDArray, y::Real) _mod_scalar(x; scalar = y) - -@_remap _rmod_scalar(x::NDArray, y::Real) _rmod_scalar(x; scalar = y) -@_remap _rmod_scalar!(x::NDArray, y::Real) _rmod_scalar(x; scalar = y) - -@_remap _broadcast_add(x::NDArray, y::NDArray) broadcast_add(x, y) -@_remap _broadcast_add!(x::NDArray, y::NDArray) broadcast_add(x, y) - -@_remap _broadcast_minus(x::NDArray, y::NDArray) broadcast_minus(x, y) -@_remap _broadcast_minus!(x::NDArray, y::NDArray) broadcast_minus(x, y) - -@_remap _broadcast_mul(x::NDArray, y::NDArray) broadcast_mul(x, y) -@_remap _broadcast_mul!(x::NDArray, y::NDArray) broadcast_mul(x, y) - -@_remap _broadcast_div(x::NDArray, y::NDArray) broadcast_div(x, y) -@_remap _broadcast_div!(x::NDArray, y::NDArray) broadcast_div(x, y) - -@_remap _broadcast_mod(x::NDArray, y::NDArray) broadcast_mod(x, y) -@_remap _broadcast_mod!(x::NDArray, y::NDArray) broadcast_mod(x, y) - -@_remap _broadcast_power(x::NDArray, y::NDArray) broadcast_power(x, y) -@_remap _broadcast_power!(x::NDArray, y::NDArray) broadcast_power(x, y) - -@_remap _broadcast_equal(x::NDArray, y::NDArray) broadcast_equal(x, y) -@_remap _broadcast_equal!(x::NDArray, y::NDArray) broadcast_equal(x, y) - -@_remap _broadcast_not_equal(x::NDArray, y::NDArray) broadcast_not_equal(x, y) -@_remap _broadcast_not_equal!(x::NDArray, y::NDArray) broadcast_not_equal(x, y) - -@_remap _broadcast_greater(x::NDArray, y::NDArray) broadcast_greater(x, y) -@_remap _broadcast_greater!(x::NDArray, y::NDArray) broadcast_greater(x, y) - -@_remap _broadcast_greater_equal(x::NDArray, y::NDArray) broadcast_greater_equal(x, y) -@_remap _broadcast_greater_equal!(x::NDArray, y::NDArray) broadcast_greater_equal(x, y) - -@_remap _broadcast_lesser(x::NDArray, y::NDArray) broadcast_lesser(x, y) -@_remap _broadcast_lesser!(x::NDArray, y::NDArray) broadcast_lesser(x, y) - -@_remap _broadcast_lesser_equal(x::NDArray, y::NDArray) broadcast_lesser_equal(x, y) -@_remap _broadcast_lesser_equal!(x::NDArray, y::NDArray) broadcast_lesser_equal(x, y) - -@_remap _broadcast_maximum(x::NDArray, y::NDArray) broadcast_maximum(x, y) -@_remap _broadcast_maximum!(x::NDArray, y::NDArray) broadcast_maximum(x, y) - -@_remap _broadcast_minimum(x::NDArray, y::NDArray) broadcast_minimum(x, y) -@_remap _broadcast_minimum!(x::NDArray, y::NDArray) broadcast_minimum(x, y) - -@_remap _broadcast_hypot(x::NDArray, y::NDArray) broadcast_hypot(x, y) -@_remap _broadcast_hypot!(x::NDArray, y::NDArray) broadcast_hypot(x, y) - -_nddoc[:broadcast_to] = 
""" - broadcast_to(x::NDArray, dims) - broadcast_to(x::NDArray, dims...) - -Broadcasts the input array to a new shape. - -In the case of broacasting doesn't work out of box, -you can expand the NDArray first. - -```jldoctest -julia> x = mx.ones(2, 3, 4); - -julia> y = mx.ones(1, 1, 4); - -julia> x .+ mx.broadcast_to(y, 2, 3, 4) -2×3×4 mx.NDArray{Float32,3} @ CPU0: -[:, :, 1] = - 2.0 2.0 2.0 - 2.0 2.0 2.0 - -[:, :, 2] = - 2.0 2.0 2.0 - 2.0 2.0 2.0 - -[:, :, 3] = - 2.0 2.0 2.0 - 2.0 2.0 2.0 - -[:, :, 4] = - 2.0 2.0 2.0 - 2.0 2.0 2.0 -``` -""" -@_remap broadcast_to(x::NDArray, dims) broadcast_to(x; shape = dims) -@_remap broadcast_to(x::NDArray, dims...) broadcast_to(x; shape = dims) - -_nddoc[:broadcast_axis] = _nddoc[:broadcast_axes] = """ - broadcast_axis(x::NDArray, dim, size) - broadcast_axes(x::NDArray, dim, size) - -Broadcasts the input array over particular axis(axes). -Parameter `dim` and `size` could be a scalar, a Tuple or an Array. - -`broadcast_axes` is just an alias. - -```jldoctest -julia> x -1×2×1 mx.NDArray{Int64,3} @ CPU0: -[:, :, 1] = - 1 2 - -julia> mx.broadcast_axis(x, 1, 2) -2×2×1 mx.NDArray{Int64,3} @ CPU0: -[:, :, 1] = - 1 2 - 1 2 - -julia> mx.broadcast_axis(x, 3, 2) -1×2×2 mx.NDArray{Int64,3} @ CPU0: -[:, :, 1] = - 1 2 - -[:, :, 2] = - 1 2 -``` -""" -@_remap(broadcast_axis(x::NDArray, dim, size), - broadcast_axis(x; axis = ndims(x) .- dim, size = size)) -@_remap(Base.broadcast_axes(x::NDArray, dim, size), - broadcast_axes(x; axis = ndims(x) .- dim, size = size)) - -################################################################################ -# NDArray functions dynamically imported from libmxnet -################################################################################ -function _invoke_mxfunction(func_handle::MX_handle, use_vars, scalars, mut_vars; kwargs...) - names = String[string(entry[1]) for entry in kwargs] - args = String[string(entry[2]) for entry in kwargs] - @mxcall(:MXFuncInvokeEx, - (MX_handle, Ptr{MX_handle}, Ptr{MX_float}, Ptr{MX_handle}, Cint, char_pp, char_pp), - func_handle, use_vars, scalars, mut_vars, length(names), names, args) -end - -@enum(LIBMX_FUNC_TYPE_MASK, - NDARRAY_ARG_BEFORE_SCALAR = 1, - ACCEPT_EMPTY_MUTATE_TARGET = (1 << 2) -) - -# Import corresponding math functions from base so the automatically defined libmxnet -# functions can overload them -import Base: sqrt - -""" -The libxmnet APIs are automatically imported from `libmxnet.so`. The functions listed -here operate on `NDArray` objects. The arguments to the functions are typically ordered -as - -```julia - func_name(arg_in1, arg_in2, ..., scalar1, scalar2, ..., arg_out1, arg_out2, ...) -``` - -unless `NDARRAY_ARG_BEFORE_SCALAR` is not set. In this case, the scalars are put before the input arguments: - -```julia - func_name(scalar1, scalar2, ..., arg_in1, arg_in2, ..., arg_out1, arg_out2, ...) -``` - -If `ACCEPT_EMPTY_MUTATE_TARGET` is set. An overloaded function without the output arguments will also be defined: - -```julia - func_name(arg_in1, arg_in2, ..., scalar1, scalar2, ...) -``` - -Upon calling, the output arguments will be automatically initialized with empty NDArrays. - -Those functions always return the output arguments. If there is only one output (the typical situation), that -object (`NDArray`) is returned. Otherwise, a tuple containing all the outputs will be returned. -""" -function _get_ndarray_function_def(name::String) - func_name = Symbol(name) - - func_def = quote - function $func_name(::Type{<:NDArray}, args::NDArray...; out=nothing, kwargs...) 
- if out != nothing - output_vars = out - if isa(output_vars, NDArray) - output_vars = NDArray[output_vars] - end - num_outputs = length(output_vars) - else - output_vars = NDArray[] - num_outputs = 0 - end - - args = collect(args) # tuple to list - if length(args) == 0 - args = MX_handle[] - end - - output_handles_pp = if length(output_vars) > 0 - [map(x -> x.handle, output_vars)] - else - [Ptr{MX_handle}(C_NULL)] - end - num_outputs_p = [convert(Cint, num_outputs)] - - kw_keys_str = String[string(x[1]) for x in kwargs] - kw_vals_str = String[dump_mx_param(x[2]) for x in kwargs] - - op_handle = _get_cached_libmx_op_handle($(name)) - @mxcall(:MXImperativeInvoke, - (MX_handle, Cint, Ptr{MX_handle}, - Ptr{Cint}, Ptr{Ptr{MX_handle}}, - Cint, char_pp, char_pp), - op_handle, length(args), args, - num_outputs_p, output_handles_pp, - length(kwargs), kw_keys_str, kw_vals_str) - - if out == nothing - n = num_outputs_p[] - hdls = unsafe_wrap(Array{MX_handle}, output_handles_pp[], n) - xs = NDArray[NDArray(MX_NDArrayHandle(x)) for x in hdls] - if n == 1 - return xs[] - else - return xs - end - else - return out - end - end - end - - func_def2 = quote - function $func_name(args::NDArray...; out=nothing, kwargs...) - $func_name(NDArray, args...; out=out, kwargs...) - end - end - - return func_def, func_def2 -end - -const _op_import_bl = [ # import black list; do not import these funcs - "_full", # we already have `mx.fill` - "_ones", # we already have `mx.ones` - "_zeros", # we already have `mx.zeros` - "clip", - "expand_dims", - - # arithmetic - "_plus", - "_minus", - "_mod", - "_mod_scalar", - "_rmod_scalar", - - "dot", - "max", - "max_axis", - "mean", - "min", - "min_axis", - "prod", - "reshape", - "sum", - "transpose", - - # trigonometric - "sin", - "cos", - "tan", - "arcsin", - "arccos", - "arctan", - - # hyperbolic - "sinh", - "cosh", - "tanh", - "arcsinh", - "arccosh", - "arctanh", - - # activation - "sigmoid", - "relu", - "softmax", - "log_softmax", - - # broadcast - "broadcast_add", - "broadcast_plus", - "broadcast_minus", - "broadcast_sub", - "broadcast_mul", - "broadcast_div", - "broadcast_mod", - "broadcast_power", - "broadcast_equal", - "broadcast_not_equal", - "broadcast_greater", - "broadcast_greater_equal", - "broadcast_lesser", - "broadcast_lesser_equal", - "broadcast_maximum", - "broadcast_minimum", - "broadcast_to", - "broadcast_axis", - "broadcast_axes", - "broadcast_hypot", -] - -macro _import_ndarray_functions() - names = filter(n -> ∉(lowercase(n), _op_import_bl), _get_libmx_op_names()) - - func_exprs = map(names) do name - op_handle = _get_libmx_op_handle(name) - - desc, key_narg = _get_libmx_op_description(name, op_handle) - func_def, func_def2 = _get_ndarray_function_def(name) - - func_name = Symbol(name) - - import_expr = _import_expr(func_name) - - quote - $import_expr - $func_def - @doc $desc - $func_def2 - end - end - - esc(quote - $(func_exprs...) 
-  end)
-end
-
-@_import_ndarray_functions
+include("ndarray/type.jl")    # type def and constructors
+include("ndarray/context.jl")
+include("ndarray/show.jl")
+include("ndarray/remap.jl")   # provide @_remap util
+include("ndarray/array.jl")
+include("ndarray/arithmetic.jl")
+include("ndarray/comparison.jl")
+include("ndarray/io.jl")      # save/load and synchronization utils
+include("ndarray/reduction.jl")
+include("ndarray/statistic.jl")
+include("ndarray/linalg.jl")
+include("ndarray/trig.jl")
+include("ndarray/activation.jl")
+include("ndarray/autoimport.jl")  # auto import operators from libmxnet
diff --git a/julia/src/ndarray/activation.jl b/julia/src/ndarray/activation.jl
new file mode 100644
index 000000000..8dd31aac8
--- /dev/null
+++ b/julia/src/ndarray/activation.jl
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# activation functions
+
+@doc doc"""
+    σ.(x::NDArray)
+    sigmoid.(x::NDArray)
+
+Computes sigmoid of x element-wise.
+
+```math
+σ(x) = \frac{1}{(1 + exp(-x))}
+```
+
+The storage type of `sigmoid` output is always dense.
+"""
+function σ end
+const sigmoid = σ
+_nddoc[:σ] = false
+@_remap broadcasted(::typeof(σ), x::NDArray) sigmoid(x)
+
+@doc doc"""
+    relu.(x::NDArray)
+
+Computes rectified linear.
+
+```math
+\max(x, 0)
+```
+"""
+function relu end
+_nddoc[:relu] = false
+@_remap broadcasted(::typeof(relu), x::NDArray) relu(x)
+
+@doc doc"""
+    softmax.(x::NDArray, [dim = ndims(x)])
+
+Applies the softmax function.
+
+The resulting array contains elements in the range `(0, 1)`
+and the elements along the given axis sum up to 1.
+
+```math
+softmax(\mathbf{z})_j = \frac{e^{z_j}}{\sum_{k=1}^K e^{z_k}}
+```
+"""
+function softmax end
+_nddoc[:softmax] = false
+@_remap broadcasted(::typeof(softmax), x::NDArray) softmax(x; axis = -ndims(x))
+@_remap broadcasted(::typeof(softmax), x::NDArray, dim::Int) softmax(x; axis = -dim)
+
+"""
+    log_softmax.(x::NDArray, [dim = ndims(x)])
+
+Computes the log softmax of the input.
+This is equivalent to computing softmax followed by log.
+
+```jldoctest
+julia> x
+2×3 mx.NDArray{Float64,2} @ CPU0:
+ 1.0  2.0  0.1
+ 0.1  2.0  1.0
+
+julia> mx.log_softmax.(x)
+2×3 mx.NDArray{Float64,2} @ CPU0:
+ -1.41703  -0.41703  -2.31703
+ -2.31703  -0.41703  -1.41703
+```
+"""
+function log_softmax end
+_nddoc[:log_softmax] = false
+@_remap broadcasted(::typeof(log_softmax), x::NDArray) log_softmax(x; axis = -ndims(x))
+@_remap broadcasted(::typeof(log_softmax), x::NDArray, dim::Int) log_softmax(x; axis = -dim)
+
diff --git a/julia/src/ndarray/arithmetic.jl b/julia/src/ndarray/arithmetic.jl
new file mode 100644
index 000000000..60dde6bde
--- /dev/null
+++ b/julia/src/ndarray/arithmetic.jl
@@ -0,0 +1,291 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import Base: +
+
+"""
+    +(args...)
+    .+(args...)
+
+Summation. Multiple scalar or `NDArray` arguments can be added
+together. Note that at least the first or second argument needs to be an
+`NDArray`, to avoid ambiguity with the built-in summation.
+"""
++(x::NDArray)             = x
++(x::NDArray, y::NDArray) = _plus(x, y)
++(x::NDArray, y::Real)    = _plus_scalar(x, scalar = y)
++(y::Real, x::NDArray)    = _plus_scalar(x, scalar = y)
+
+broadcasted(::typeof(+), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_add(x, y)
+
+"""
+    sub_from!(dst::NDArray, arg::NDArrayOrReal)
+
+Subtract a scalar or an `NDArray` of the same shape from `dst`. Inplace updating.
+"""
+function sub_from!(dst::NDArray, arg::NDArrayOrReal)
+  @assert dst.writable
+  if isa(arg, Real)
+    _minus_scalar(dst, scalar = arg, out = dst)
+  else
+    _minus!(dst, arg)
+  end
+  dst
+end
+
+import Base: -
+
+"""
+    -(x::NDArray)
+    -(x, y)
+    .-(x, y)
+
+Subtraction `x - y` for scalars or `NDArray`s,
+or negation `-x` of an `NDArray`.
+"""
+-(x::NDArray)             = _mul_scalar(x, scalar = -one(eltype(x)))
+-(x::NDArray, y::NDArray) = _minus(x, y)
+-(x::NDArray, y::Real)    = _minus_scalar(x, scalar = y)
+-(y::Real, x::NDArray)    = _rminus_scalar(x, scalar = y)
+
+broadcasted(::typeof(-), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_minus(x, y)
+
+"""
+    mul_to!(dst::NDArray, arg::NDArrayOrReal)
+
+Elementwise multiplication into `dst` of either a scalar or an `NDArray` of the same shape.
+Inplace updating.
+"""
+function mul_to!(dst::NDArray, arg::NDArrayOrReal)
+  @assert dst.writable
+  if isa(arg, Real)
+    _mul_scalar(dst, scalar = arg, out = dst)
+  else
+    _mul(dst, arg, out = dst)
+  end
+  dst
+end
+
+import Base: *
+
+"""
+    .*(x, y)
+
+Elementwise multiplication for `NDArray`.
+"""
+*(x::NDArray, y::Real) = _mul_scalar(x, scalar = y)
+*(y::Real, x::NDArray) = _mul_scalar(x, scalar = y)
+
+broadcasted(::typeof(*), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} =
+  _mul(x, y)
+broadcasted(::typeof(*), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_mul(x, y)
+
+"""
+    *(A::NDArray, B::NDArray)
+
+Matrix/tensor multiplication.
+"""
+*(x::NDArray{T}, y::NDArray{T}) where T = x ⋅ y
+
+LinearAlgebra.adjoint(x::NDArray{T,1}) where T = transpose(x)
+LinearAlgebra.adjoint(x::NDArray{T,2}) where T = transpose(x)
+
+"""
+    div_from!(dst::NDArray, arg::NDArrayOrReal)
+
+Elementwise divide a scalar or an `NDArray` of the same shape from `dst`. Inplace updating.
+"""
+function div_from!(dst::NDArray, arg::NDArrayOrReal)
+  @assert dst.writable
+  if isa(arg, Real)
+    _div_scalar(dst, scalar = arg, out = dst)
+  else
+    _div(dst, arg, out = dst)
+  end
+  dst
+end
+
+function div_from!(dst::NDArray{T}, arg::Real) where {T<:Integer}
+  @assert dst.writable
+  @assert(round(T, arg) != zero(T), "Integer divided by zero")
+  _div_scalar(dst, scalar = arg, out = dst)
+  dst
+end
+
+"""
+    rdiv_from!(x::Real, y::NDArray)
+
+Elementwise divide a scalar by an `NDArray`. Inplace updating.
+"""
+function rdiv_from!(x::Real, y::NDArray)
+  @assert y.writable
+  _rdiv_scalar(y, scalar = x, out = y)
+  y
+end
+
+import Base: /
+
+"""
+    ./(x::NDArray, y::NDArray)
+    ./(x::NDArray, y::Real)
+    ./(x::Real, y::NDArray)
+
+* Elementwise dividing an `NDArray` by a scalar or another `NDArray`
+of the same shape.
+
+* Elementwise divide a scalar by an `NDArray`.
+
+* Matrix division (solving linear systems) is not implemented yet.
+"""
+/(x::NDArray, y::Real) = _div_scalar(x, scalar = y)
+
+broadcasted(::typeof(/), y::Real, x::NDArray) = _rdiv_scalar(x, scalar = y)
+broadcasted(::typeof(/), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} =
+  _div(x, y)
+broadcasted(::typeof(/), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_div(x, y)
+
+function broadcasted(::typeof(/), x::NDArray{T}, y::Real) where {T<:Integer}
+  @assert(round(T, y) != zero(T), "Integer divided by zero")
+  _div_scalar(x, scalar = y)
+end
+
+"""
+    mod_from!(x::NDArray, y::NDArray)
+    mod_from!(x::NDArray, y::Real)
+
+Elementwise modulo for `NDArray`.
+Inplace updating.
+"""
+mod_from!(x::NDArray, y::NDArray) = _mod!(x, y)
+mod_from!(x::NDArray, y::Real)    = _mod_scalar!(x, y)
+
+"""
+    rmod_from!(y::Real, x::NDArray)
+
+Elementwise modulo for `NDArray`.
+Inplace updating.
+"""
+rmod_from!(y::Real, x::NDArray) = _rmod_scalar!(x, y)
+
+import Base: %
+
+"""
+    .%(x::NDArray, y::NDArray)
+    .%(x::NDArray, y::Real)
+    .%(x::Real, y::NDArray)
+
+Elementwise modulo for `NDArray`.
+"""
+%(x::NDArray, y::Real) = _mod_scalar(x, y)
+
+broadcasted(::typeof(%), y::Real, x::NDArray) = _rmod_scalar(x, y)
+broadcasted(::typeof(%), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} =
+  _mod(x, y)
+broadcasted(::typeof(%), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_mod(x, y)
+
+# document of `.^` is merged into SymbolicNode's
+
+broadcasted(::typeof(Base.literal_pow), ::typeof(^), x::NDArray, ::Val{s}) where {s} =
+  _power_scalar(x, scalar = s)
+broadcasted(::typeof(^), x::NDArray, s::Real) = _power_scalar(x, scalar = s)
+broadcasted(::typeof(^), s::Real, x::NDArray) = _rpower_scalar(x, scalar = s)
+
+broadcasted(::typeof(^), ::Irrational{:ℯ}, x::NDArray) = exp(x)
+broadcasted(::typeof(^), x::NDArray, s::Irrational) = _power_scalar(x, scalar = s)
+broadcasted(::typeof(^), s::Irrational, x::NDArray) = _rpower_scalar(x, scalar = s)
+
+broadcasted(::typeof(^), x::NDArray{T,N}, y::NDArray{T,N}) where {T,N} =
+  _power(x, y)
+broadcasted(::typeof(^), x::NDArray{T,N}, y::NDArray{T,M}) where {T,N,M} =
+  _broadcast_power(x, y)
+
+_nddoc[:clip] = _nddoc[:clip!] =
+"""
+    clip(x::NDArray, min, max)
+    clip!(x::NDArray, min, max)
+
+Clips (limits) the values in `NDArray`.
+Given an interval, values outside the interval are clipped to the interval edges.
+Clipping `x` between `min` and `max` would be:
+
+```julia
+clip(x, min_, max_) = max(min(x, max_), min_)
+```
+
+```jldoctest
+julia> x = NDArray(1:9);
+
+julia> mx.clip(x, 2, 8)'
+1×9 mx.NDArray{Int64,2} @ CPU0:
+ 2  2  3  4  5  6  7  8  8
+```
+
+The storage type of clip output depends on storage types of inputs and the
+`min`, `max` parameter values:
+
+- clip(default) = default
+- clip(row_sparse, min <= 0, max >= 0) = row_sparse
+- clip(csr, min <= 0, max >= 0) = csr
+- clip(row_sparse, min < 0, max < 0) = default
+- clip(row_sparse, min > 0, max > 0) = default
+- clip(csr, min < 0, max < 0) = csr
+- clip(csr, min > 0, max > 0) = csr
+"""
+@_remap clip(x::NDArray, min::Real, max::Real)  clip(x; a_min = min, a_max = max)
+@_remap clip!(x::NDArray, min::Real, max::Real) clip(x; a_min = min, a_max = max)
+
+################################################################################
+# remapping to solve type instability
+################################################################################
+
+@_remap _plus(x::NDArray, y::NDArray)  _plus(x, y)
+@_remap _plus!(x::NDArray, y::NDArray) _plus(x, y)
+
+@_remap _minus(x::NDArray, y::NDArray)  _minus(x, y)
+@_remap _minus!(x::NDArray, y::NDArray) _minus(x, y)
+
+@_remap _mod(x::NDArray, y::NDArray)  _mod(x, y)
+@_remap _mod!(x::NDArray, y::NDArray) _mod(x, y)
+
+@_remap _mod_scalar(x::NDArray, y::Real)  _mod_scalar(x; scalar = y)
+@_remap _mod_scalar!(x::NDArray, y::Real) _mod_scalar(x; scalar = y)
+
+@_remap _rmod_scalar(x::NDArray, y::Real)  _rmod_scalar(x; scalar = y)
+@_remap _rmod_scalar!(x::NDArray, y::Real) _rmod_scalar(x; scalar = y)
+
+@_remap _broadcast_add(x::NDArray, y::NDArray)  broadcast_add(x, y)
+@_remap _broadcast_add!(x::NDArray, y::NDArray) broadcast_add(x, y)
+
+@_remap _broadcast_minus(x::NDArray, y::NDArray)  broadcast_minus(x, y)
+@_remap _broadcast_minus!(x::NDArray, y::NDArray) broadcast_minus(x, y)
+
+@_remap _broadcast_mul(x::NDArray, y::NDArray)  broadcast_mul(x, y)
+@_remap _broadcast_mul!(x::NDArray, y::NDArray) broadcast_mul(x, y)
+
+@_remap _broadcast_div(x::NDArray, y::NDArray)  broadcast_div(x, y)
+@_remap _broadcast_div!(x::NDArray, y::NDArray) broadcast_div(x, y)
+
+@_remap _broadcast_mod(x::NDArray, y::NDArray)  broadcast_mod(x, y)
+@_remap _broadcast_mod!(x::NDArray, y::NDArray) broadcast_mod(x, y)
+
+@_remap _broadcast_power(x::NDArray, y::NDArray)  broadcast_power(x, y)
+@_remap _broadcast_power!(x::NDArray, y::NDArray) broadcast_power(x, y)
diff --git a/julia/src/ndarray/array.jl b/julia/src/ndarray/array.jl
new file mode 100644
index 000000000..b71e5ddf9
--- /dev/null
+++ b/julia/src/ndarray/array.jl
@@ -0,0 +1,712 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
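+
+# A brief usage sketch of the Array-like API defined below (illustrative
+# only, not a doctest; assumes the usual `mx` module import):
+#
+#   x = mx.zeros(Float32, 2, 3)  # 2×3 NDArray of zeros on the CPU
+#   y = mx.ones(2, 3)            # element type defaults to MX_float
+#   a = copy(x)                  # NDArray -> Julia Array (data is copied)
+#   z = copy(a, mx.cpu())        # Julia Array -> NDArray on a given context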
+
+# Julia Array related interface
+
+"""
+    similar(x::NDArray; writable, ctx)
+
+Create an `NDArray` with the same shape, data type,
+and context as the given one.
+Note that the returned `NDArray` is uninitialized.
+"""
+Base.similar(x::NDArray{T,N}; writable = x.writable, ctx = context(x)) where {T,N} =
+  NDArray{T,N}(undef, size(x)...; writable = writable, ctx = ctx)
+
+"""
+    zeros([DType], dims, [ctx::Context = cpu()])
+    zeros([DType], dims...)
+    zeros(x::NDArray)
+
+Create a zero-initialized `NDArray` with the given shape and type.
+"""
+function zeros(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType}
+  x = NDArray{T}(undef, dims..., ctx = ctx)
+  x[:] = zero(T)
+  x
+end
+
+zeros(::Type{T}, dims::Int...) where {T<:DType} = zeros(T, dims)
+
+zeros(dims::NTuple{N,Int}, ctx::Context = cpu()) where N =
+  zeros(MX_float, dims, ctx)
+zeros(dims::Int...) = zeros(dims)
+
+zeros(x::NDArray)::typeof(x)      = zeros_like(x)
+Base.zeros(x::NDArray)::typeof(x) = zeros_like(x)
+
+"""
+    ones([DType], dims, [ctx::Context = cpu()])
+    ones([DType], dims...)
+    ones(x::NDArray)
+
+Create an `NDArray` with the given shape and type, initialized with 1.
+"""
+function ones(::Type{T}, dims::NTuple{N,Int}, ctx::Context = cpu()) where {N,T<:DType}
+  arr = NDArray{T}(undef, dims..., ctx = ctx)
+  arr[:] = one(T)
+  arr
+end
+
+ones(::Type{T}, dims::Int...) where T<:DType = ones(T, dims)
+
+ones(dims::NTuple{N,Int}, ctx::Context = cpu()) where N =
+  ones(MX_float, dims, ctx)
+ones(dims::Int...) = ones(dims)
+
+ones(x::NDArray)::typeof(x)      = ones_like(x)
+Base.ones(x::NDArray)::typeof(x) = ones_like(x)
+
+import Base: length, ndims
+
+"""
+    size(x::NDArray)
+    size(x::NDArray, dims)
+
+Get the shape of an `NDArray`. The shape is in Julia's column-major convention.
+See also the notes on NDArray shapes [`NDArray`](@ref).
+"""
+function Base.size(x::NDArray)
+  ref_ndim  = Ref{MX_uint}(0)
+  ref_shape = Ref{Ptr{MX_uint}}(0)
+  @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}),
+          x, ref_ndim, ref_shape)
+  tuple(map(Int, reverse(unsafe_wrap(Array, ref_shape[], ref_ndim[])))...)
+end
+
+Base.size(x::NDArray{T,N}, dims::Integer) where {T,N} = (dims > N) ? 1 : size(x)[dims]
+
+"""
+    length(x::NDArray)
+
+Get the number of elements in an `NDArray`.
+"""
+length(x::NDArray) = prod(size(x))
+
+"""
+    ndims(x::NDArray)
+
+Get the number of dimensions of an `NDArray`.
+Equivalent to `length(size(arr))`.
+"""
+ndims(x::NDArray) = ndims(x.handle)
+
+function ndims(x::MX_NDArrayHandle)::Int
+  ref_ndim  = Ref{MX_uint}(0)
+  ref_shape = Ref{Ptr{MX_uint}}(0)
+  @mxcall(:MXNDArrayGetShape, (MX_handle, Ref{MX_uint}, Ref{Ptr{MX_uint}}),
+          x, ref_ndim, ref_shape)
+  ref_ndim[]
+end
+
+"""
+    eltype(x::NDArray)
+
+Get the element type of an `NDArray`.
+"""
+function Base.eltype(x::Union{NDArray,MX_NDArrayHandle})
+  dtype_ref = Ref{Cint}(0)
+  @mxcall(:MXNDArrayGetDType, (MX_handle, Ptr{Cint}), x, dtype_ref)
+
+  if dtype_ref[] == -1  # x->is_none()
+    # TODO: unit test for this branch
+    throw(MXError("Eltype of $x is not defined"))
+  end
+
+  fromTypeFlag(TypeFlag(dtype_ref[]))
+end
+
+@inline _first(x::NDArray) = try_get_shared(x, sync = :read) |> first
+
+Base.first(x::NDArray) = _first(x)
+
+Base.lastindex(x::NDArray) = length(x)
+
+"""
+    slice(arr::NDArray, start:stop)
+
+Create a view into a sub-slice of an `NDArray`. Note only slicing at the slowest
+changing dimension is supported. In Julia's column-major perspective, this is the last
+dimension. For example, given an `NDArray` of shape (2,3,4), `slice(arr, 2:3)` will create
+an `NDArray` of shape (2,3,2), sharing the data with the original array. This operation is
+used in data parallelization to split a mini-batch into sub-batches for different devices.
+"""
+function slice(arr::NDArray, ::Colon)
+  arr
+end
+function slice(arr::NDArray, slice::UnitRange{Int})
+  dim1 = size(arr)[end]
+  @assert(1 <= slice.start <= slice.stop <= dim1)
+  if slice.start == 1 && slice.stop == dim1
+    return arr
+  end
+
+  hdr_ref = Ref{MX_handle}(0)
+  # note Julia is 1-based, inclusive-inclusive indexing, while C++ is
+  # 0-based, inclusive-exclusive indexing. So 1:3 in Julia should
+  # translate into 0:3 in C++.
+  @mxcall(:MXNDArraySlice, (MX_handle, MX_uint, MX_uint, Ref{MX_handle}),
+          arr, slice.start-1, slice.stop, hdr_ref)
+  return NDArray(MX_NDArrayHandle(hdr_ref[]), arr.writable)
+end
+
+function _at(handle::Union{MX_NDArrayHandle, MX_handle}, idx::Integer)
+  h_ref = Ref{MX_handle}(C_NULL)
+  @mxcall(:MXNDArrayAt, (MX_handle, MX_uint, Ref{MX_handle}),
+          handle, idx, h_ref)
+  h_ref[]
+end
+
+import Base: setindex!
+
+"""
+    setindex!(arr::NDArray, val, idx)
+
+Assign values to an `NDArray`.
+The following scenarios are supported:
+
+* single value assignment via linear indexing: `arr[42] = 24`
+
+* `arr[:] = val`: whole array assignment, `val` could be a scalar or an array (Julia `Array`
+  or `NDArray`) of the same shape.
+* `arr[start:stop] = val`: assignment to a *slice*, `val` could be a scalar or an array of
+  the same shape as the slice. See also [`slice`](@ref).
+"""
+function setindex!(arr::NDArray, val::Real, idx::Integer)
+  # linear indexing
+  @assert arr.writable
+  _set_value(out=arr[idx], src=val)
+end
+
+function setindex!(arr::NDArray, val::Real, ::Colon)
+  @assert arr.writable
+  _set_value(out = arr, src = dump_mx_param(val))
+end
+
+function setindex!(arr::NDArray, val::Array{T}, ::Colon) where T<:Real
+  @assert arr.writable
+  copy!(arr, val)
+end
+
+function setindex!(arr::NDArray, val::NDArray, ::Colon)
+  @assert arr.writable
+  copy!(arr, val)
+end
+
+function setindex!(arr::NDArray, val::Union{T,Array{T},NDArray},
+                   idx::UnitRange{Int}) where T<:Real
+  @assert arr.writable
+  setindex!(slice(arr, idx), val, Colon())
+end
+
+import Base: getindex
+"""
+    getindex(arr::NDArray, idx)
+
+Shortcut for [`slice`](@ref). A typical use is to write
+
+```julia
+  arr[:] += 5
+```
+
+which translates into
+
+```julia
+  arr[:] = arr[:] + 5
+```
+
+which further translates into
+
+```julia
+  setindex!(getindex(arr, Colon()), 5, Colon())
+```
+
+!!! note
+    The behavior is quite different from indexing into Julia's `Array`. For example, `arr[2:5]`
+    creates a **copy** of the sub-array for a Julia `Array`, while for `NDArray` it is
+    a *slice* that shares the memory.
+"""
+getindex(arr::NDArray, ::Colon) = arr
+
+"""
+Shortcut for [`slice`](@ref).
+**NOTE** the behavior for Julia's built-in index slicing is to create a
+copy of the sub-array, while here we simply call `slice`,
+which shares the underlying memory.
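+
+A short sketch (illustrative, not a doctest), assuming a hypothetical CPU
+`NDArray` `arr` of shape (2,3):
+
+```julia
+b = arr[2:3]  # slice along the last dimension; shares memory with `arr`
+b[:] = 42     # also overwrites the corresponding columns of `arr`
+```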
+""" +getindex(arr::NDArray, idx::UnitRange{Int}) = slice(arr, idx) + +getindex(arr::NDArray) = _first(arr) + +function getindex(arr::NDArray, idx::Integer) + # linear indexing + len = length(arr) + size_ = size(arr) + + if idx <= 0 || idx > len + throw(BoundsError( + "attempt to access $(join(size_, 'x')) NDArray at index $(idx)")) + end + + idx -= 1 + offsets = size_[1:end-1] |> reverse ∘ cumprod ∘ collect + handle = arr.handle + for offset ∈ offsets + handle = _at(handle, idx ÷ offset) + idx %= offset + end + + _at(handle, idx) |> MX_NDArrayHandle |> x -> NDArray(x, arr.writable) +end + +import Base: copy!, copy, convert, deepcopy + +""" + copy!(dst::Union{NDArray, Array}, src::Union{NDArray, Array}) + +Copy contents of `src` into `dst`. +""" +function copy!(dst::NDArray, src::NDArray) + @assert(dst.writable) + if dst.handle == src.handle + @warn("Copying an NDArray to itself") + return + end + + _copyto(src, out=dst) + return dst +end + +function copy!(dst::Array{T}, src::NDArray{T}) where T<:DType + @assert size(dst) == size(src) + @mxcall(:MXNDArraySyncCopyToCPU, (MX_handle, Ptr{Cvoid}, Csize_t), + src, pointer(dst), length(dst)) + dst +end + +copy!(dst::Array{<:Real}, src::NDArray) = copy!(dst, copy(src)) +copy!(dst::NDArray, src::AbstractArray) = copy!(dst, collect(src)) + +function copy!(dst::NDArray{T}, src::Array{<:Real}) where {T} + @assert dst.writable + @assert size(dst) == size(src) + src = convert(Array{T}, src) # this might involve copying + @mxcall(:MXNDArraySyncCopyFromCPU, (MX_handle, Ptr{Cvoid}, Csize_t), + dst.handle, pointer(src), length(src)) + dst +end + +function copy_ignore_shape!(dst::NDArray{T}, src::Array{<:Real}) where {T} + @assert dst.writable + @assert length(dst) == length(src) + src = convert(Array{T}, src) # this might involve copying + @mxcall(:MXNDArraySyncCopyFromCPU, (MX_handle, Ptr{Cvoid}, Csize_t), + dst.handle, pointer(src), length(src)) + dst +end + + +""" + copy(arr :: NDArray) + copy(arr :: NDArray, ctx :: Context) + copy(arr :: Array, ctx :: Context) + +Create a copy of an array. When no `Context` is given, create a Julia `Array`. +Otherwise, create an `NDArray` on the specified context. +""" +copy + +# Create copy: NDArray -> Julia Array +copy(x::NDArray{T,D}) where{T,D} = copy!(Array{T,D}(undef, size(x)), x) + +# Create copy: NDArray -> NDArray in a given context +copy(x::NDArray{T,D}, ctx::Context) where {T,D} = + copy!(NDArray{T,D}(_ndarray_alloc(T, size(x), ctx, true)), x) + +# Create copy: Julia Array -> NDArray in a given context +copy(x::Array{T}, ctx::Context) where {T<:DType} = + copy!(NDArray{T}(undef, size(x); ctx = ctx), x) + +copy(x::AbstractArray, ctx::Context) = + copy!(NDArray{eltype(x)}(undef, size(x); ctx = ctx), collect(x)) + +""" + convert(::Type{Array{<:Real}}, x::NDArray) + +Convert an `NDArray` into a Julia `Array` of specific type. +Data will be copied. +""" +convert(T::Type{Array{<:Real}}, x::NDArray) = convert(T, copy(x)) + +""" + deepcopy(arr::NDArray) + +Get a deep copy of the data blob in the form of an NDArray of default storage +type. This function blocks. Do not use it in performance critical code. +""" +function deepcopy(arr::NDArray) + out_ref = Ref{MX_handle}(C_NULL) + @mxcall(:MXNDArrayGetDataNDArray, (MX_handle, Ref{MX_handle}), arr, out_ref) + NDArray(MX_NDArrayHandle(out_ref[])) +end + +""" + hcat(x::NDArray...) +""" +Base.hcat(xs::NDArray{T}...) where T = cat(xs..., dims = 2) + +""" + vcat(x::NDArray...) +""" +Base.vcat(xs::NDArray{T}...) 
where T = cat(xs..., dims = 1)
+
+"""
+    cat(xs::NDArray...; dims)
+
+Concatenate `NDArray`s that have the same element type along the dimension `dims`.
+Building a diagonal matrix is not supported yet.
+"""
+function Base.cat(xs::NDArray{T}...; dims) where T
+  ns = ndims.(xs)
+  d = Base.max(dims, maximum(ns))
+  xs′ = map(zip(ns, xs)) do i
+    n, x = i
+    (d > n) ? reshape(x, -2, Base.ones(Int, d - n)...) : x
+  end
+  concat(xs′..., dim = d - dims)
+end
+
+"""
+    @inplace
+
+Julia does not support re-definition of the `+=` operator (like `__iadd__` in Python):
+when one writes `a += b`, it gets translated to `a = a + b`. `a + b` allocates new
+memory for the result, and the newly allocated `NDArray` object is then assigned
+back to `a`, while the original contents of `a` are discarded. This is very inefficient
+when we want to do an inplace update.
+
+This macro is a simple utility to implement this behavior. Writing
+
+```julia
+  @mx.inplace a += b
+```
+
+translates into
+
+```julia
+  mx.add_to!(a, b)
+```
+
+which does inplace adding of the contents of `b` into `a`.
+"""
+macro inplace(ex)
+  f = if ex.head == :+= || ex.head == :.+=
+    :add_to!
+  elseif ex.head == :-= || ex.head == :.-=
+    :sub_from!
+  elseif ex.head == :.*=
+    :mul_to!
+  elseif ex.head == :./=
+    :div_from!
+  elseif ex.head == :.%=
+    :mod_from!
+  else
+    error("unsupported inplace translation for $ex")
+  end
+  Expr(:call, f, esc(ex.args[1]), esc(ex.args[2]))
+end
+
+"""
+    add_to!(dst::NDArray, args::NDArrayOrReal...)
+
+Add a bunch of arguments into `dst`. Inplace updating.
+"""
+function add_to!(dst::NDArray, args::NDArrayOrReal...)
+  @assert dst.writable
+  for arg in args
+    if isa(arg, Real)
+      _plus_scalar(dst, scalar = arg, out = dst)
+    else
+      _plus!(dst, arg)
+    end
+  end
+  dst
+end
+
+"""
+    fill!(arr::NDArray, x)
+
+Fill `arr` with the value `x`, like `Base.fill!`.
+"""
+function Base.fill!(arr::NDArray, x)
+  arr[:] = x
+  arr
+end
+
+"""
+    fill(x, dims, ctx=cpu())
+    fill(x, dims...)
+
+Create an `NDArray` filled with the value `x`, like `Base.fill`.
+"""
+function fill(x::T, dims::NTuple{N,Integer}, ctx::Context = cpu()) where {T,N}
+  arr = NDArray{T}(undef, dims, ctx = ctx)
+  arr[:] = x
+  arr
+end
+
+fill(x, dims::Integer...) = fill(x, dims)
+
+import Base: hypot
+
+broadcasted(::typeof(hypot), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_hypot(x, y)
+
+"""
+Manipulating as Julia Arrays
+----------------------------
+
+    @nd_as_jl(captures..., statement)
+
+A convenient macro that allows operating on `NDArray`s as Julia `Array`s. For example,
+
+```julia
+  x = mx.zeros(3,4)
+  y = mx.ones(3,4)
+  z = mx.zeros((3,4), mx.gpu())
+
+  @mx.nd_as_jl ro=(x,y) rw=z begin
+    # now x, y, z are just ordinary Julia Arrays
+    z[:,1] = y[:,2]
+    z[:,2] = 5
+  end
+```
+
+Under the hood, the macro converts all the declared captures from `NDArray` into Julia
+`Array`s using `try_get_shared`, and automatically commits the modifications back into
+the `NDArray`s that are declared as `rw`. This is useful for fast prototyping and when
+implementing non-critical computations, such as `AbstractEvalMetric`.
+
+!!! note
+* Multiple `rw` and / or `ro` capture declarations can be made.
+* The macro does **not** check to make sure that `ro` captures are not modified. If the
+  original `NDArray` lives in CPU memory, then it is very likely the corresponding
+  Julia Array shares data with the `NDArray`, so modifying the Julia Array will also
+  modify the underlying `NDArray`.
+* More importantly, since `NDArray` operations are
+  asynchronous, we wait for *writing* on `rw` variables but wait only for *reading*
+  on `ro` variables. If we write into those `ro` variables, **and** the memory is
+  shared, a race condition might happen, and the behavior is undefined.
+* When an `NDArray` is declared to be captured as `rw`, its contents are always synced
+  back at the end.
+* The execution result of the expanded macro is always `nothing`.
+* The statements are wrapped in a `let` block, thus locally introduced new variables will not be
+  available after the statements. So you will need to declare the variables before calling the
+  macro if needed.
+"""
+macro nd_as_jl(m_args...)
+  @assert(length(m_args) > 0)
+  stmts = m_args[end]
+  @assert(isa(stmts, Expr) && stmts.head == :block,
+          "The last argument should be a statement block (begin-end); but got $stmts")
+  stmts = esc(stmts)
+
+  dclrs  = m_args[1:end-1]
+  nd_ro  = []
+  nd_rw  = []
+  nd_all = []
+  for declr in dclrs
+    @assert(isa(declr, Expr) && declr.head == :(=) && length(declr.args)==2 && declr.args[1] ∈ (:ro,:rw),
+            "Invalid declaration, should be rw=(x,y) or ro=z; but got $declr")
+
+    declr_vars = declr.args[2]
+    if isa(declr_vars, Symbol)
+      declr_vars = (declr_vars,)
+    elseif isa(declr_vars, Expr)
+      @assert(declr_vars.head ∈ (:tuple, :vect),
+              "Capture declaration should be a variable or a tuple of variables; but got $declr_vars")
+      declr_vars = declr_vars.args
+    else
+      @assert(false, "Capture declaration should be a variable or a tuple of variables; but got $declr_vars")
+    end
+    for declr_var in declr_vars
+      @assert(isa(declr_var, Symbol),
+              "Captured ndarrays in ro/rw declaration should be variables, but got $(declr_var)")
+    end
+    append!(nd_all, [declr_vars...])
+    if declr.args[1] == :ro
+      append!(nd_ro, [declr_vars...])
+    else
+      append!(nd_rw, [declr_vars...])
+    end
+  end
+
+  nd_ro    = map(esc, nd_ro)
+  nd_rw    = map(esc, nd_rw)
+  nd_all   = map(esc, nd_all)
+  rw_origs = [gensym() for _ in nd_rw]
+
+  save_statements  = Expr(:block, [:($v_orig = $v) for (v_orig, v) in zip(rw_origs, nd_rw)]...)
+  wait_statements  = Expr(:block, [:(_wait_to_read($v)) for v in nd_ro]...,
+                                  [:(_wait_to_write($v)) for v in nd_rw]...)
+  clear_statements = Expr(:block, [:($v_orig = nothing) for v_orig in rw_origs]...)
+  let_assignments  = Expr(:block, [:($v = try_get_shared($v)) for v in nd_all]...)
+  sync_statements  = map(rw_origs, nd_rw) do v_orig, v
+    quote
+      if !is_shared($v, $v_orig)
+        # copy data back if not or no longer sharing data
+        copy!($v_orig, $v)
+      end
+    end
+  end
+  sync_statements  = Expr(:block, sync_statements...)
+
+  let_statement = Expr(:let, let_assignments, quote
+    $stmts
+    $sync_statements
+  end)
+  m_body = quote
+    $wait_statements
+    $save_statements
+    $let_statement
+    $clear_statements
+    nothing # the final result is always nothing
+  end
+
+  m_body
+end
+
+# NOTE: internal use only. Accessing pointers on a different device (e.g. accessing GPU
+# pointers from CPU) leads to undefined behavior.
+import Base.pointer
+function pointer(arr :: NDArray)
+  pdata = Ref{Ptr{Cvoid}}(0)
+  @mxcall(:MXNDArrayGetData, (MX_handle, Ref{Ptr{Cvoid}}), arr, pdata)
+  return convert(Ptr{eltype(arr)}, pdata[])
+end
+
+_ndsig[:reshape] = :(reshape(x; shape = dim, reverse = !reverse))
+@_remap Base.reshape(x::NDArray, dim...; reverse = false) reshape
+@_remap Base.reshape(x::NDArray, dim ; reverse = false) reshape
+
+_nddoc[:expand_dims] =
+"""
+    expand_dims(x::NDArray, dim)
+
+Insert a new axis at dimension `dim`.
+
+```julia
+julia> x
+4 mx.NDArray{Float64,1} @ CPU0:
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+
+julia> mx.expand_dims(x, 1)
+1×4 mx.NDArray{Float64,2} @ CPU0:
+ 1.0  2.0  3.0  4.0
+
+julia> mx.expand_dims(x, 2)
+4×1 mx.NDArray{Float64,2} @ CPU0:
+ 1.0
+ 2.0
+ 3.0
+ 4.0
+```
+"""
+@_remap expand_dims(x::NDArray, dim) expand_dims(x; axis = -dim)
+
+@_remap Base.permutedims(x::NDArray, axes) transpose(x; axes = length(axes) .- tuple(axes...))
+
+_nddoc[:broadcast_to] = """
+    broadcast_to(x::NDArray, dims)
+    broadcast_to(x::NDArray, dims...)
+
+Broadcasts the input array to a new shape.
+
+In case broadcasting doesn't work out of the box,
+you can expand the `NDArray` first.
+
+```jldoctest
+julia> x = mx.ones(2, 3, 4);
+
+julia> y = mx.ones(1, 1, 4);
+
+julia> x .+ mx.broadcast_to(y, 2, 3, 4)
+2×3×4 mx.NDArray{Float32,3} @ CPU0:
+[:, :, 1] =
+ 2.0  2.0  2.0
+ 2.0  2.0  2.0
+
+[:, :, 2] =
+ 2.0  2.0  2.0
+ 2.0  2.0  2.0
+
+[:, :, 3] =
+ 2.0  2.0  2.0
+ 2.0  2.0  2.0
+
+[:, :, 4] =
+ 2.0  2.0  2.0
+ 2.0  2.0  2.0
+```
+"""
+@_remap broadcast_to(x::NDArray, dims)    broadcast_to(x; shape = dims)
+@_remap broadcast_to(x::NDArray, dims...) broadcast_to(x; shape = dims)
+
+_nddoc[:broadcast_axis] = _nddoc[:broadcast_axes] = """
+    broadcast_axis(x::NDArray, dim, size)
+    broadcast_axes(x::NDArray, dim, size)
+
+Broadcasts the input array over particular axes.
+The parameters `dim` and `size` can be a scalar, a `Tuple`, or an `Array`.
+
+`broadcast_axes` is just an alias.
+
+```jldoctest
+julia> x
+1×2×1 mx.NDArray{Int64,3} @ CPU0:
+[:, :, 1] =
+ 1  2
+
+julia> mx.broadcast_axis(x, 1, 2)
+2×2×1 mx.NDArray{Int64,3} @ CPU0:
+[:, :, 1] =
+ 1  2
+ 1  2
+
+julia> mx.broadcast_axis(x, 3, 2)
+1×2×2 mx.NDArray{Int64,3} @ CPU0:
+[:, :, 1] =
+ 1  2
+
+[:, :, 2] =
+ 1  2
+```
+"""
+@_remap(broadcast_axis(x::NDArray, dim, size),
+        broadcast_axis(x; axis = ndims(x) .- dim, size = size))
+@_remap(Base.broadcast_axes(x::NDArray, dim, size),
+        broadcast_axes(x; axis = ndims(x) .- dim, size = size))
+
+################################################################################
+# remapping to solve type instability
+################################################################################
+
+@_remap _broadcast_hypot(x::NDArray, y::NDArray)  broadcast_hypot(x, y)
+@_remap _broadcast_hypot!(x::NDArray, y::NDArray) broadcast_hypot(x, y)
diff --git a/julia/src/ndarray/autoimport.jl b/julia/src/ndarray/autoimport.jl
new file mode 100644
index 000000000..c86e8fffd
--- /dev/null
+++ b/julia/src/ndarray/autoimport.jl
@@ -0,0 +1,227 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# NDArray functions dynamically imported from libmxnet
+
+function _invoke_mxfunction(func_handle::MX_handle, use_vars, scalars, mut_vars; kwargs...)
+  names = String[string(entry[1]) for entry in kwargs]
+  args = String[string(entry[2]) for entry in kwargs]
+  @mxcall(:MXFuncInvokeEx,
+          (MX_handle, Ptr{MX_handle}, Ptr{MX_float}, Ptr{MX_handle}, Cint, char_pp, char_pp),
+          func_handle, use_vars, scalars, mut_vars, length(names), names, args)
+end
+
+@enum(LIBMX_FUNC_TYPE_MASK,
+  NDARRAY_ARG_BEFORE_SCALAR = 1,
+  ACCEPT_EMPTY_MUTATE_TARGET = (1 << 2)
+)
+
+# Import corresponding math functions from base so the automatically defined libmxnet
+# functions can overload them
+import Base: sqrt
+
+"""
+The libmxnet APIs are automatically imported from `libmxnet.so`. The functions listed
+here operate on `NDArray` objects. The arguments to the functions are typically ordered
+as
+
+```julia
+  func_name(arg_in1, arg_in2, ..., scalar1, scalar2, ..., arg_out1, arg_out2, ...)
+```
+
+unless `NDARRAY_ARG_BEFORE_SCALAR` is not set, in which case the scalars are put before the input arguments:
+
+```julia
+  func_name(scalar1, scalar2, ..., arg_in1, arg_in2, ..., arg_out1, arg_out2, ...)
+```
+
+If `ACCEPT_EMPTY_MUTATE_TARGET` is set, an overloaded function without the output arguments will also be defined:
+
+```julia
+  func_name(arg_in1, arg_in2, ..., scalar1, scalar2, ...)
+```
+
+Upon calling, the output arguments will be automatically initialized with empty NDArrays.
+
+These functions always return the output arguments. If there is only one output (the typical situation), that
+object (`NDArray`) is returned. Otherwise, a tuple containing all the outputs will be returned.
+"""
+function _get_ndarray_function_def(name::String)
+  func_name = Symbol(name)
+
+  func_def = quote
+    function $func_name(::Type{<:NDArray}, args::NDArray...; out=nothing, kwargs...)
+      if out != nothing
+        output_vars = out
+        if isa(output_vars, NDArray)
+          output_vars = NDArray[output_vars]
+        end
+        num_outputs = length(output_vars)
+      else
+        output_vars = NDArray[]
+        num_outputs = 0
+      end
+
+      args = collect(args)  # tuple to list
+      if length(args) == 0
+        args = MX_handle[]
+      end
+
+      output_handles_pp = if length(output_vars) > 0
+        [map(x -> x.handle, output_vars)]
+      else
+        [Ptr{MX_handle}(C_NULL)]
+      end
+      num_outputs_p = [convert(Cint, num_outputs)]
+
+      kw_keys_str = String[string(x[1]) for x in kwargs]
+      kw_vals_str = String[dump_mx_param(x[2]) for x in kwargs]
+
+      op_handle = _get_cached_libmx_op_handle($(name))
+      @mxcall(:MXImperativeInvoke,
+              (MX_handle, Cint, Ptr{MX_handle},
+               Ptr{Cint}, Ptr{Ptr{MX_handle}},
+               Cint, char_pp, char_pp),
+              op_handle, length(args), args,
+              num_outputs_p, output_handles_pp,
+              length(kwargs), kw_keys_str, kw_vals_str)
+
+      if out == nothing
+        n = num_outputs_p[]
+        hdls = unsafe_wrap(Array{MX_handle}, output_handles_pp[], n)
+        xs = NDArray[NDArray(MX_NDArrayHandle(x)) for x in hdls]
+        if n == 1
+          return xs[]
+        else
+          return xs
+        end
+      else
+        return out
+      end
+    end
+  end
+
+  func_def2 = quote
+    function $func_name(args::NDArray...; out=nothing, kwargs...)
+      $func_name(NDArray, args...; out=out, kwargs...)
+ end + end + + return func_def, func_def2 +end + +const _op_import_bl = [ # import black list; do not import these funcs + "_full", # we already have `mx.fill` + "_ones", # we already have `mx.ones` + "_zeros", # we already have `mx.zeros` + "clip", + "expand_dims", + + # arithmetic + "_plus", + "_minus", + "_mod", + "_mod_scalar", + "_rmod_scalar", + + "dot", + "max", + "max_axis", + "mean", + "min", + "min_axis", + "prod", + "reshape", + "sum", + "transpose", + + # trigonometric + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + + # hyperbolic + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + + # activation + "sigmoid", + "relu", + "softmax", + "log_softmax", + + # broadcast + "broadcast_add", + "broadcast_plus", + "broadcast_minus", + "broadcast_sub", + "broadcast_mul", + "broadcast_div", + "broadcast_mod", + "broadcast_power", + "broadcast_equal", + "broadcast_not_equal", + "broadcast_greater", + "broadcast_greater_equal", + "broadcast_lesser", + "broadcast_lesser_equal", + "broadcast_maximum", + "broadcast_minimum", + "broadcast_to", + "broadcast_axis", + "broadcast_axes", + "broadcast_hypot", + + # reduction + "argmax", + "argmin", +] + +macro _import_ndarray_functions() + names = filter(n -> ∉(lowercase(n), _op_import_bl), _get_libmx_op_names()) + + func_exprs = map(names) do name + op_handle = _get_libmx_op_handle(name) + + desc, key_narg = _get_libmx_op_description(name, op_handle) + func_def, func_def2 = _get_ndarray_function_def(name) + + func_name = Symbol(name) + + import_expr = _import_expr(func_name) + + quote + $import_expr + $func_def + @doc $desc + $func_def2 + end + end + + esc(quote + $(func_exprs...) + end) +end + +@_import_ndarray_functions diff --git a/julia/src/ndarray/comparison.jl b/julia/src/ndarray/comparison.jl new file mode 100644 index 000000000..19be6fa51 --- /dev/null +++ b/julia/src/ndarray/comparison.jl @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
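+
+# A minimal usage sketch of the dot-broadcast comparison operators defined
+# below. Unlike Base, they return a 0/1-valued `NDArray` of the same element
+# type rather than a `BitArray`; the printed values here are illustrative,
+# assuming the usual `NDArray` constructor and `copy` behavior:
+#
+#   julia> x = NDArray([1 2; 3 4]); y = NDArray([1 0; 3 0]);
+#
+#   julia> copy(x .== y)
+#   2×2 Array{Int64,2}:
+#    1  0
+#    1  0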
+
+broadcasted(::typeof(==), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_equal(x, y)
+
+broadcasted(::typeof(!=), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_not_equal(x, y)
+
+broadcasted(::typeof(>), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_greater(x, y)
+
+broadcasted(::typeof(>=), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_greater_equal(x, y)
+
+broadcasted(::typeof(<), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_lesser(x, y)
+
+broadcasted(::typeof(<=), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_lesser_equal(x, y)
+
+################################################################################
+# remapping to solve type instability
+################################################################################
+
+@_remap _broadcast_equal(x::NDArray, y::NDArray)  broadcast_equal(x, y)
+@_remap _broadcast_equal!(x::NDArray, y::NDArray) broadcast_equal(x, y)
+
+@_remap _broadcast_not_equal(x::NDArray, y::NDArray)  broadcast_not_equal(x, y)
+@_remap _broadcast_not_equal!(x::NDArray, y::NDArray) broadcast_not_equal(x, y)
+
+@_remap _broadcast_greater(x::NDArray, y::NDArray)  broadcast_greater(x, y)
+@_remap _broadcast_greater!(x::NDArray, y::NDArray) broadcast_greater(x, y)
+
+@_remap _broadcast_greater_equal(x::NDArray, y::NDArray)  broadcast_greater_equal(x, y)
+@_remap _broadcast_greater_equal!(x::NDArray, y::NDArray) broadcast_greater_equal(x, y)
+
+@_remap _broadcast_lesser(x::NDArray, y::NDArray)  broadcast_lesser(x, y)
+@_remap _broadcast_lesser!(x::NDArray, y::NDArray) broadcast_lesser(x, y)
+
+@_remap _broadcast_lesser_equal(x::NDArray, y::NDArray)  broadcast_lesser_equal(x, y)
+@_remap _broadcast_lesser_equal!(x::NDArray, y::NDArray) broadcast_lesser_equal(x, y)
diff --git a/julia/src/ndarray/context.jl b/julia/src/ndarray/context.jl
new file mode 100644
index 000000000..c89c17b2c
--- /dev/null
+++ b/julia/src/ndarray/context.jl
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+    context(x::NDArray)
+
+Get the context that this `NDArray` lives on.
+"""
+function context(x::NDArray)
+  ref_typeid = Ref{Cint}(0)
+  ref_devid  = Ref{Cint}(0)
+  @mxcall(:MXNDArrayGetContext, (MX_handle, Ref{Cint}, Ref{Cint}),
+          x, ref_typeid, ref_devid)
+  Context(ref_typeid[], ref_devid[])
+end
diff --git a/julia/src/ndarray/io.jl b/julia/src/ndarray/io.jl
new file mode 100644
index 000000000..99c11cd90
--- /dev/null
+++ b/julia/src/ndarray/io.jl
@@ -0,0 +1,135 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+@inline _wait_to_read(arr :: NDArray) =
+  @mxcall(:MXNDArrayWaitToRead, (MX_handle,), arr)
+@inline _wait_to_write(arr :: NDArray) =
+  @mxcall(:MXNDArrayWaitToWrite, (MX_handle,), arr)
+
+"""
+    try_get_shared(arr; sync=:nop)
+
+Try to create a Julia array by sharing the data with the underlying `NDArray`.
+
+# Arguments:
+
+* `arr::NDArray`: the array to be shared.
+
+!!! note
+    The returned array is not guaranteed to share data with the underlying `NDArray`.
+    In particular, data sharing is possible only when the `NDArray` lives on CPU.
+
+* `sync::Symbol`: `:nop`, `:write`, `:read`
+  On CPU, invoke `_wait_to_read` if `:read`;
+  invoke `_wait_to_write` if `:write`.
+"""
+function try_get_shared(x::NDArray; sync::Symbol=:nop)
+  if context(x).device_type == CPU
+    # try to do data sharing
+    if sync == :read
+      _wait_to_read(x)
+    elseif sync == :write
+      _wait_to_write(x)
+    end
+
+    unsafe_wrap(Array, pointer(x), size(x))
+  else
+    # impossible to share, just copying
+    copy(x)
+  end
+end
+
+"""
+    is_shared(j_arr, arr)
+
+Test whether `j_arr` is sharing data with `arr`.
+
+# Arguments:
+
+* `j_arr::Array`: the Julia Array.
+* `arr::NDArray`: the `NDArray`.
+"""
+is_shared(::Array, ::NDArray) = false
+
+function is_shared(j_arr::Array{T}, arr::NDArray{T}) where {T<:DType}
+  if length(j_arr) != length(arr)
+    return false
+  end
+  if context(arr).device_type != CPU
+    return false
+  end
+  pointer(j_arr) == pointer(arr)
+end
+
+"""
+    load(filename, ::Type{NDArray})
+
+Load NDArrays from a binary file.
+
+# Arguments:
+* `filename::String`: the path of the file to load. It can be an S3 or HDFS address.
+
+Returns either `Dict{Symbol, NDArray}` or `Vector{NDArray}`.
+
+`filename` can point to `s3` or `hdfs` resources if `libmxnet` is built with the
+corresponding components enabled. Examples:
+* `s3://my-bucket/path/my-s3-ndarray`
+* `hdfs://my-bucket/path/my-hdfs-ndarray`
+* `/path-to/my-local-ndarray`
+"""
+function load(filename::AbstractString, ::Type{<:NDArray})
+  out_size      = Ref{MX_uint}(0)
+  out_hdrs      = Ref{Ptr{MX_handle}}(0)
+  out_name_size = Ref{MX_uint}(0)
+  out_names     = Ref{char_pp}(0)
+  @mxcall(:MXNDArrayLoad, (char_p, Ref{MX_uint}, Ref{Ptr{MX_handle}}, Ref{MX_uint}, Ref{char_pp}),
+          filename, out_size, out_hdrs, out_name_size, out_names)
+  out_name_size = out_name_size[]
+  out_size      = out_size[]
+  if out_name_size == 0
+    return [NDArray(MX_NDArrayHandle(hdr)) for hdr in unsafe_wrap(Array, out_hdrs[], out_size)]
+  else
+    @assert out_size == out_name_size
+    return Dict([(Symbol(unsafe_string(k)), NDArray(MX_NDArrayHandle(hdr))) for (k,hdr) in
+                 zip(unsafe_wrap(Array, out_names[], out_size), unsafe_wrap(Array, out_hdrs[], out_size))])
+  end
+end
+
+"""
+    save(filename::AbstractString, data)
+
+Save NDArrays to a binary file. The filename can be an S3 or HDFS address if `libmxnet` is built
+with the corresponding support (see `load`).
+
+* `filename::String`: path to the binary file to write to.
+* `data`: data to save to file.
Data can be an `NDArray`, a `Vector` of `NDArray`s,
+  or a `Dict{Symbol}` containing `NDArray`s.
+"""
+save(filename::String, data::NDArray) = save(filename, [data])
+
+save(filename::String, data::VecOfNDArray) =
+  @mxcall(:MXNDArraySave, (char_p, MX_uint, Ptr{MX_handle}, char_pp),
+          filename, length(data), MX_handle[data...], char_pp(0))
+
+function save(filename::String, data::Dict{Symbol})
+  names  = keys(data)
+  arrays = MX_handle.(collect(values(data)))
+  names  = String.(collect(names))
+
+  @mxcall(:MXNDArraySave, (char_p, MX_uint, Ptr{MX_handle}, char_pp),
+          filename, length(names), arrays, names)
+end
diff --git a/julia/src/ndarray/linalg.jl b/julia/src/ndarray/linalg.jl
new file mode 100644
index 000000000..4e91cfac6
--- /dev/null
+++ b/julia/src/ndarray/linalg.jl
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# See /~https://github.com/dmlc/MXNet.jl/issues/55
+@_remap LinearAlgebra.dot(x::NDArray, y::NDArray) dot(y, x)
+
+# See /~https://github.com/dmlc/MXNet.jl/pull/123
+@_remap LinearAlgebra.transpose(x::NDArray{T,1}) where T reshape(x; shape = (1, length(x)), reverse = true)
+@_remap LinearAlgebra.transpose(x::NDArray{T,2}) where T transpose(x)
diff --git a/julia/src/ndarray/reduction.jl b/julia/src/ndarray/reduction.jl
new file mode 100644
index 000000000..833b483ca
--- /dev/null
+++ b/julia/src/ndarray/reduction.jl
@@ -0,0 +1,115 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
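+
+# The reduction wrappers below translate Julia's column-major `dims` into
+# libmxnet's row-major `axis`; negative axes count from the end, hence the
+# `axis = 0 .- dims` remapping. A usage sketch (the printed values are
+# illustrative, not captured output):
+#
+#   julia> x = NDArray([1. 2 3; 4 5 6]);
+#
+#   julia> copy(maximum(x, dims = 1))  # reduce over the first Julia dimension
+#   1×3 Array{Float64,2}:
+#    4.0  5.0  6.0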
+
+Base.prod(x::NDArray; dims = :) = _prod(x, dims)
+@_remap _prod(x::NDArray, ::Colon) prod(x)
+@_remap _prod(x::NDArray, dims)    prod(x; axis = 0 .- dims, keepdims = true)
+
+Base.maximum(x::NDArray; dims = :) = _nd_maximum(x, dims)
+@_remap _nd_maximum(x::NDArray, ::Colon) max(x)
+@_remap _nd_maximum(x::NDArray, dims)    max(x; axis = 0 .- dims, keepdims = true)
+
+Base.minimum(x::NDArray; dims = :) = _nd_minimum(x, dims)
+@_remap _nd_minimum(x::NDArray, ::Colon) min(x)
+@_remap _nd_minimum(x::NDArray, dims)    min(x; axis = 0 .- dims, keepdims = true)
+
+###############################################################################
+#  min/max
+###############################################################################
+
+import Base: min, max
+
+broadcasted(::typeof(max), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_maximum(x, y)
+
+broadcasted(::typeof(min), x::NDArray{T}, y::NDArray{T}) where {T} =
+  _broadcast_minimum(x, y)
+
+###############################################################################
+#  argmin/argmax
+###############################################################################
+
+# TODO: support CartesianIndex ?
+"""
+    argmax(x::NDArray; dims) -> indices
+
+Note that `NaN` is skipped during comparison.
+This is different from Julia `Base.argmax`.
+
+## Examples
+
+```julia-repl
+julia> x = NDArray([0. 1 2; 3 4 5])
+2×3 NDArray{Float64,2} @ CPU0:
+ 0.0  1.0  2.0
+ 3.0  4.0  5.0
+
+julia> argmax(x, dims = 1)
+1×3 NDArray{Float64,2} @ CPU0:
+ 2.0  2.0  2.0
+
+julia> argmax(x, dims = 2)
+2×1 NDArray{Float64,2} @ CPU0:
+ 3.0
+ 3.0
+```
+
+See also [`argmin`](@ref mx.argmin).
+"""
+Base.argmax(x::NDArray; dims = :) = _argmax(x, dims) .+ 1
+@_remap _argmax(x::NDArray, ::Colon) argmax(x)
+@_remap _argmax(x::NDArray, dims)    argmax(x; axis = 0 .- dims, keepdims = true)
+
+"""
+    argmin(x::NDArray; dims) -> indices
+
+Note that `NaN` is skipped during comparison.
+This is different from Julia `Base.argmin`.
+
+## Examples
+
+```julia-repl
+julia> x = NDArray([0. 1 2; 3 4 5])
+2×3 NDArray{Float64,2} @ CPU0:
+ 0.0  1.0  2.0
+ 3.0  4.0  5.0
+
+julia> argmin(x, dims = 1)
+1×3 NDArray{Float64,2} @ CPU0:
+ 1.0  1.0  1.0
+
+julia> argmin(x, dims = 2)
+2×1 NDArray{Float64,2} @ CPU0:
+ 1.0
+ 1.0
+```
+
+See also [`argmax`](@ref mx.argmax).
+"""
+Base.argmin(x::NDArray; dims = :) = _argmin(x, dims) .+ 1
+@_remap _argmin(x::NDArray, ::Colon) argmin(x)
+@_remap _argmin(x::NDArray, dims)    argmin(x; axis = 0 .- dims, keepdims = true)
+
+################################################################################
+# remapping to solve type instability
+################################################################################
+
+@_remap _broadcast_maximum(x::NDArray, y::NDArray)  broadcast_maximum(x, y)
+@_remap _broadcast_maximum!(x::NDArray, y::NDArray) broadcast_maximum(x, y)
+
+@_remap _broadcast_minimum(x::NDArray, y::NDArray)  broadcast_minimum(x, y)
+@_remap _broadcast_minimum!(x::NDArray, y::NDArray) broadcast_minimum(x, y)
diff --git a/julia/src/ndarray/remap.jl b/julia/src/ndarray/remap.jl
new file mode 100644
index 000000000..e6515e43d
--- /dev/null
+++ b/julia/src/ndarray/remap.jl
@@ -0,0 +1,133 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Mapping NDArray functions to Base-like API
+
+const _ndsig = Dict{Symbol,Expr}()
+const _nddoc = Dict{Symbol,Any}()
+
+_isinplace(name::Symbol) = endswith(string(name), "!")
+
+_writable(name::Symbol, x) =
+  _isinplace(name) ? :(@assert $x.writable "this NDArray isn't writable") : :()
+
+function _outexpr(name::Symbol, x #= the first arg of `sig` =#)
+  if _isinplace(name)  # `func!`
+    Ptr, 1, :([[MX_handle(x.handle)]]), :($x)
+  else
+    retexpr = :(NDArray(MX_NDArrayHandle(unsafe_load(hdls_ref[], 1))))
+    Ref, 0, :(Ref{Ptr{MX_handle}}(C_NULL)), retexpr
+  end
+end
+
+_broadcast_target(sig::Expr) = sig.args[2].args[].args[end]
+
+"""
+Generate a docstring from the function signature.
+"""
+function _docsig(fname::Symbol, sig::Expr, opname::String)
+  if fname !== :broadcasted
+    get(_nddoc, fname, "    $sig") * "\n" * _getdocdefine(opname)
+  else
+    name = _broadcast_target(sig)
+    str = get(_nddoc, name, "")
+    _nddoc[name] = false  # change to false to denote the docstring has been set up
+    if isempty(str)
+      sig_ = Expr(:call, Symbol(name, "."), sig.args[3:end]...)
+      str = "    $sig_"
+    end
+    if str ≠ false
+      # append "Defined in ..."
+      def = _getdocdefine(opname)
+      str = if str isa Markdown.MD
+        str = Markdown.MD(copy(str.content), copy(str.meta))
+        push!(str, Markdown.Paragraph(def))
+        str
+      else
+        str * def
+      end
+
+      @eval @doc $str $name
+    end
+    ""
+  end
+end
+
+macro _remap(sig::Expr, imp::Expr)
+  d = splitdef(:($sig = $imp))
+  @capture d[:name] (M_.fname_|fname_)
+
+  opname = string(imp.args[1])
+
+  if isa(imp.args[2], Expr) && imp.args[2].head == :parameters
+    ndin = imp.args[3:end]
+    mxargs = imp.args[2].args
+  else  # no keyword arguments
+    ndin = imp.args[2:end]
+    mxargs = []
+  end
+
+  mxkeys = map(x -> string(x.args[1]), mxargs)
+  mxvals = Expr(:vect, map(x -> :(dump_mx_param($(x.args[2]))), mxargs)...)
+  ndhlds = Expr(:vect, map(x -> :($(x).handle), ndin)...)
+
+  # handler for `func!`, which has a side effect on its first argument.
+ T, n_output, hdls_ref, retexpr = _outexpr(fname, _firstarg(sig)) + + assert_expr = _writable(fname, _firstarg(sig)) + + func_body = quote + $assert_expr + op_handle = _get_cached_libmx_op_handle($opname) + n_output = Ref(Cint($n_output)) + hdls_ref = $hdls_ref + @mxcall(:MXImperativeInvoke, + (MX_handle, + Cint, + Ptr{MX_handle}, + Ref{Cint}, + $T{Ptr{MX_handle}}, + Cint, + char_pp, + char_pp), + op_handle, + $(length(ndin)), + $(ndhlds), + n_output, + hdls_ref, + $(length(mxargs)), + $mxkeys, + $mxvals) + $retexpr + end + + docstr = _docsig(fname, sig, opname) + func_def = Expr(:function, sig, func_body) + + esc(quote + @doc $docstr + $func_def + end) +end + +macro _remap(sig::Expr, imp::Symbol) + imp = _ndsig[imp] + + esc(quote + @_remap($sig, $imp) + end) +end diff --git a/julia/src/ndarray/show.jl b/julia/src/ndarray/show.jl new file mode 100644 index 000000000..4a6bfa3f5 --- /dev/null +++ b/julia/src/ndarray/show.jl @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +function Base.show(io::IO, x::NDArray) + print(io, "NDArray(") + Base.show(io, try_get_shared(x, sync = :read)) + print(io, ")") +end + +# for REPL +function Base.show(io::IO, ::MIME{Symbol("text/plain")}, x::NDArray{T,N}) where {T,N} + type_ = split(string(typeof(x)), '.', limit=2)[end] + n = length(x) + size_ = N == 1 ? "$n-element" : join(size(x), "×") + print(io, "$size_ $type_ @ $(context(x))", (n == 0) ? "" : ":\n") + Base.print_array(io, try_get_shared(x, sync = :read)) +end diff --git a/julia/src/ndarray/statistic.jl b/julia/src/ndarray/statistic.jl new file mode 100644 index 000000000..b4f7b90b8 --- /dev/null +++ b/julia/src/ndarray/statistic.jl @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
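+
+# `mean` and `sum` below follow the same pattern as the reductions in
+# reduction.jl: a Base-like entry point dispatching on `dims`, plus
+# `@_remap`-generated wrappers over the libmxnet operators. A sketch of the
+# resulting behavior (the values shown are illustrative):
+#
+#   julia> x = NDArray([1. 2 3; 4 5 6]);
+#
+#   julia> copy(mean(x, dims = 2))
+#   2×1 Array{Float64,2}:
+#    2.0
+#    5.0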
+ +Statistics.mean(x::NDArray; dims = :) = _mean(x, dims) +@_remap _mean(x::NDArray, ::Colon) mean(x) +@_remap _mean(x::NDArray, dims) mean(x; axis = 0 .- dims, keepdims = true) + +Base.sum(x::NDArray; dims = :) = _sum(x, dims) +@_remap _sum(x::NDArray, ::Colon) sum(x) +@_remap _sum(x::NDArray, dims) sum(x; axis = 0 .- dims, keepdims = true) diff --git a/julia/src/ndarray/trig.jl b/julia/src/ndarray/trig.jl new file mode 100644 index 000000000..5251b3a34 --- /dev/null +++ b/julia/src/ndarray/trig.jl @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# trigonometric functions, remap to keep consistent API with Base +@_remap broadcasted(::typeof(sin), x::NDArray) sin(x) +@_remap broadcasted(::typeof(cos), x::NDArray) cos(x) +@_remap broadcasted(::typeof(tan), x::NDArray) tan(x) +@_remap broadcasted(::typeof(asin), x::NDArray) arcsin(x) +@_remap broadcasted(::typeof(acos), x::NDArray) arccos(x) +@_remap broadcasted(::typeof(atan), x::NDArray) arctan(x) + +# hyperbolic functions, remap to keep consistent API with Base +@_remap broadcasted(::typeof(sinh), x::NDArray) sinh(x) +@_remap broadcasted(::typeof(cosh), x::NDArray) cosh(x) +@_remap broadcasted(::typeof(tanh), x::NDArray) tanh(x) +@_remap broadcasted(::typeof(asinh), x::NDArray) arcsinh(x) +@_remap broadcasted(::typeof(acosh), x::NDArray) arccosh(x) +@_remap broadcasted(::typeof(atanh), x::NDArray) arctanh(x) diff --git a/julia/src/ndarray/type.jl b/julia/src/ndarray/type.jl new file mode 100644 index 000000000..8d90d63f0 --- /dev/null +++ b/julia/src/ndarray/type.jl @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# All the types supported by mshadow. 
See `mshadow/base.h`
+const DType = Union{Float32, Float64, Float16, UInt8, Int32, Int8, Int64}
+@enum TypeFlag kFloat32 kFloat64 kFloat16 kUint8 kInt32 kInt8 kInt64
+const DEFAULT_DTYPE = Float32  # MSHADOW_DEFAULT_DTYPE
+
+function toTypeFlag(T::Type{<:DType})
+  if T == Float32
+    return kFloat32
+  elseif T == Float64
+    return kFloat64
+  elseif T == Float16
+    return kFloat16
+  elseif T == UInt8
+    return kUint8
+  elseif T == Int32
+    return kInt32
+  elseif T == Int8
+    return kInt8
+  elseif T == Int64
+    return kInt64
+  else
+    throw(ArgumentError("Can't convert $T to DType."))
+  end
+end
+
+function fromTypeFlag(T::TypeFlag)
+  if T == kFloat32
+    return Float32
+  elseif T == kFloat64
+    return Float64
+  elseif T == kFloat16
+    return Float16
+  elseif T == kUint8
+    return UInt8
+  elseif T == kInt32
+    return Int32
+  elseif T == kInt8
+    return Int8
+  elseif T == kInt64
+    return Int64
+  else
+    throw(ArgumentError("Can't convert DType $T."))
+  end
+end
+
+# create an NDArray handle of a specific shape
+function _ndarray_alloc(shape::NTuple{N,Int}, ctx::Context, delay_alloc::Bool) where N
+  h_ref = Ref{MX_handle}(0)
+  shape = collect(reverse(MX_uint.(shape)))
+  @mxcall(:MXNDArrayCreate, (Ptr{MX_uint}, MX_uint, Cint, Cint, Cint, Ref{MX_handle}),
+          shape, N, ctx.device_type, ctx.device_id, delay_alloc, h_ref)
+  handle = MX_NDArrayHandle(h_ref[])
+  return handle
+end
+
+# create an NDArray handle of a specific shape and type
+function _ndarray_alloc(::Type{T}, shape::NTuple{N,Int}, ctx::Context, delay_alloc::Bool) where {T<:DType,N}
+  h_ref = Ref{MX_handle}(0)
+  shape = collect(reverse(MX_uint.(shape)))
+  dtype = toTypeFlag(T)
+  @mxcall(:MXNDArrayCreateEx, (Ptr{MX_uint}, MX_uint, Cint, Cint, Cint, Cint, Ref{MX_handle}),
+          shape, N, ctx.device_type, ctx.device_id, delay_alloc, dtype, h_ref)
+  handle = MX_NDArrayHandle(h_ref[])
+  return handle
+end
+
+# create a handle to an empty NDArray; this handle can be used to hold
+# results returned by libmx API calls
+function _ndarray_alloc()
+  h_ref = Ref{MX_handle}(0)
+  @mxcall(:MXNDArrayCreateNone, (Ref{MX_handle},), h_ref)
+  return MX_NDArrayHandle(h_ref[])
+end
+
+################################################################################
+# NDArray Type
+################################################################################
+"""
+    NDArray{T,N}
+
+Wrapper of the `NDArray` type in `libmxnet`. This is the basic building block
+of tensor-based computation.
+
+!!! note
+    Since C/C++ use row-major ordering for arrays while Julia follows
+    column-major ordering, we keep the underlying data in its original layout
+    but use the *language-native* convention when we talk about shapes. For
+    example, a mini-batch of 100 MNIST images is a tensor of C/C++/Python
+    shape (100,1,28,28), while in Julia the same piece of memory has shape
+    (28,28,1,100).
+"""
+mutable struct NDArray{T,N}
+  handle   :: MX_NDArrayHandle
+  writable :: Bool
+
+  NDArray{T,N}(handle::MX_NDArrayHandle, writable::Bool = true) where {T,N} =
+    new(handle, writable)
+end
+
+# UndefInitializer constructors
+NDArray{T,N}(::UndefInitializer, dims::NTuple{N,Integer};
+             writable = true, ctx::Context = cpu()) where {T,N} =
+  NDArray{T,N}(_ndarray_alloc(T, dims, ctx, false), writable)
+NDArray{T,N}(::UndefInitializer, dims::Vararg{Integer,N}; kw...) where {T,N} =
+  NDArray{T,N}(undef, dims; kw...)
+
+NDArray{T}(::UndefInitializer, dims::NTuple{N,Integer}; kw...) where {T,N} =
+  NDArray{T,N}(undef, dims; kw...)
+NDArray{T}(::UndefInitializer, dims::Vararg{Integer,N}; kw...)
where {T,N} = + NDArray{T,N}(undef, dims; kw...) + +NDArray(::UndefInitializer, dims::NTuple{N,Integer}; kw...) where {N} = + NDArray{DEFAULT_DTYPE,N}(undef, dims; kw...) +NDArray(::UndefInitializer, dims::Vararg{Integer,N}; kw...) where {N} = + NDArray{DEFAULT_DTYPE,N}(undef, dims; kw...) + +NDArray(x::AbstractArray{<:DType}) = copy(collect(x), cpu()) +NDArray(x::Array{<:DType}) = copy(x, cpu()) + +NDArray(::Type{T}, x::AbstractArray) where {T<:DType} = + copy(convert(AbstractArray{T}, x), cpu()) + +NDArray(handle, writable = true) = + NDArray{eltype(handle), ndims(handle)}(handle, writable) + +# type aliases +const NDArrayOrReal = Union{NDArray,Real} +const VecOfNDArray = AbstractVector{<:NDArray} + +Base.unsafe_convert(::Type{MX_handle}, x::NDArray) = + Base.unsafe_convert(MX_handle, x.handle) +Base.convert(T::Type{MX_handle}, x::NDArray) = Base.unsafe_convert(T, x) +Base.cconvert(T::Type{MX_handle}, x::NDArray) = Base.unsafe_convert(T, x) + +MX_handle(x::NDArray) = Base.convert(MX_handle, x) diff --git a/julia/src/random.jl b/julia/src/random.jl index e18e906a5..3f3b80bba 100644 --- a/julia/src/random.jl +++ b/julia/src/random.jl @@ -23,12 +23,12 @@ Samples are uniformly distributed over the half-open interval [low, high) (includes low, but excludes high). ```julia -julia> mx.rand!(empty(2, 3)) +julia> mx.rand!(NDArray(undef, 2, 3)) 2×3 mx.NDArray{Float32,2} @ CPU0: 0.385748 0.839275 0.444536 0.0879585 0.215928 0.104636 -julia> mx.rand!(empty(2, 3), low = 1, high = 10) +julia> mx.rand!(NDArray(undef, 2, 3), low = 1, high = 10) 2×3 mx.NDArray{Float32,2} @ CPU0: 6.6385 4.18888 2.07505 8.97283 2.5636 1.95586 @@ -56,8 +56,8 @@ julia> mx.rand(2, 2; low = 1, high = 10) 9.81258 3.58068 ``` """ -rand(dims::Int...; low = 0, high = 1, context = cpu()) = - rand!(empty(dims, context), low = low, high = high) +rand(dims::Integer...; low = 0, high = 1, context = cpu()) = + rand!(NDArray(undef, dims, ctx = context), low = low, high = high) """ randn!(x::NDArray; μ = 0, σ = 1) @@ -73,7 +73,7 @@ randn!(x::NDArray; μ = 0, σ = 1) = Draw random samples from a normal (Gaussian) distribution. 
""" randn(dims::Int...; μ = 0, σ = 1, context = cpu()) = - randn!(empty(dims, context), μ = μ, σ = σ) + randn!(NDArray(undef, dims, ctx = context), μ = μ, σ = σ) """ seed!(seed::Int) diff --git a/julia/src/util.jl b/julia/src/util.jl index a836d3e39..ac7f4fc71 100644 --- a/julia/src/util.jl +++ b/julia/src/util.jl @@ -35,12 +35,12 @@ function get_mnist_ubyte() filenames = Dict((x[1] => joinpath(mnist_dir, x[2]) for x ∈ pairs(filenames))) if !all(isfile, values(filenames)) cd(mnist_dir) do - mnist_dir = download("http://data.mxnet.io/mxnet/data/mnist.zip", "mnist.zip") + data = download("http://data.mxnet.io/mxnet/data/mnist.zip", "mnist.zip") try - run(`unzip -u $mnist_dir`) + run(`unzip -u $data`) catch try - run(pipe(`7z x $mnist_dir`,stdout = devnull)) + run(pipeline(`7z x $data`,stdout = devnull)) catch error("Extraction Failed:No extraction program found in path") end @@ -183,15 +183,14 @@ end Extract the line of `Defined in ...` julia> mx._getdocdefine("sgd_update") -"Defined in src/operator/optimizer_op.cc:L53" -``` +"Defined in `src/operator/optimizer_op.cc:L53`" """ function _getdocdefine(name::String) op = _get_libmx_op_handle(name) str = _get_libmx_op_description(name, op)[1] lines = split(str, '\n') - for m ∈ match.(Ref(r"^Defined in .*$"), lines) - m != nothing && return m.match + for m ∈ match.(Ref(r"^Defined in ([\S]+)$"), lines) + m != nothing && return "Defined in `$(m.captures[1])`" end "" end diff --git a/julia/test/unittest/bind.jl b/julia/test/unittest/bind.jl index abaca884b..0ae0ab427 100644 --- a/julia/test/unittest/bind.jl +++ b/julia/test/unittest/bind.jl @@ -33,10 +33,10 @@ function test_arithmetic(::Type{T}, uf, gf) where T <: mx.DType ret = uf(lhs, rhs) @test mx.list_arguments(ret) == [:lhs, :rhs] - lhs_arr = mx.NDArray(rand(T, shape)) - rhs_arr = mx.NDArray(rand(T, shape)) - lhs_grad = mx.empty(T, shape) - rhs_grad = mx.empty(T, shape) + lhs_arr = NDArray(rand(T, shape)) + rhs_arr = NDArray(rand(T, shape)) + lhs_grad = NDArray{T}(undef, shape) + rhs_grad = NDArray{T}(undef, shape) exec2 = mx.bind(ret, mx.Context(mx.CPU), [lhs_arr, rhs_arr], args_grad=[lhs_grad, rhs_grad]) exec3 = mx.bind(ret, mx.Context(mx.CPU), [lhs_arr, rhs_arr]) diff --git a/julia/test/unittest/io.jl b/julia/test/unittest/io.jl index cf8d8368d..7d98d28fc 100644 --- a/julia/test/unittest/io.jl +++ b/julia/test/unittest/io.jl @@ -38,8 +38,8 @@ function test_mnist() n_batch = 0 for batch in mnist_provider if n_batch == 0 - data_array = mx.empty(28,28,1,batch_size) - label_array = mx.empty(batch_size) + data_array = NDArray(undef, 28, 28, 1, batch_size) + label_array = NDArray(undef, batch_size) # have to use "for i=1:1" to get over the legacy "feature" of using # [ ] to do concatenation in Julia data_targets = [[(1:batch_size, data_array)] for i = 1:1] diff --git a/julia/test/unittest/kvstore.jl b/julia/test/unittest/kvstore.jl index 503a1fdbd..db6885717 100644 --- a/julia/test/unittest/kvstore.jl +++ b/julia/test/unittest/kvstore.jl @@ -47,7 +47,7 @@ function test_single_kv_pair() kv = init_kv() mx.push!(kv, 3, mx.ones(SHAPE)) - val = mx.empty(SHAPE) + val = NDArray(undef, SHAPE) mx.pull!(kv, 3, val) @test maximum(abs.(copy(val) .- 1)) == 0 end diff --git a/julia/test/unittest/ndarray.jl b/julia/test/unittest/ndarray.jl index 9ca4ba206..eb69a736a 100644 --- a/julia/test/unittest/ndarray.jl +++ b/julia/test/unittest/ndarray.jl @@ -57,6 +57,78 @@ function test_constructor() @test eltype(x) == Float32 @test copy(x) ≈ [1.1, 2, 3] end + + @info "NDArray::NDArray{T,N}(undef, dims...)" + let + x = 
NDArray{Int,2}(undef, 5, 5) + @test eltype(x) == Int + @test size(x) == (5, 5) + @test x.writable + + y = NDArray{Int,2}(undef, 5, 5, writable = false) + @test !y.writable + + # dimension mismatch + @test_throws MethodError NDArray{Int,1}(undef, 5, 5) + end + + @info "NDArray::NDArray{T,N}(undef, dims)" + let + x = NDArray{Int,2}(undef, (5, 5)) + @test eltype(x) == Int + @test size(x) == (5, 5) + @test x.writable + + y = NDArray{Int,2}(undef, (5, 5), writable = false) + @test !y.writable + + # dimension mismatch + @test_throws MethodError NDArray{Int,1}(undef, (5, 5)) + end + + @info "NDArray::NDArray{T}(undef, dims...)" + let + x = NDArray{Int}(undef, 5, 5) + @test eltype(x) == Int + @test size(x) == (5, 5) + @test x.writable + + y = NDArray{Int}(undef, 5, 5, writable = false) + @test !y.writable + end + + @info "NDArray::NDArray{T}(undef, dims)" + let + x = NDArray{Int}(undef, (5, 5)) + @test eltype(x) == Int + @test size(x) == (5, 5) + @test x.writable + + y = NDArray{Int}(undef, (5, 5), writable = false) + @test !y.writable + end + + @info "NDArray::NDArray(undef, dims...)" + let + x = NDArray(undef, 5, 5) + @test eltype(x) == mx.MX_float + @test size(x) == (5, 5) + @test x.writable + + y = NDArray(undef, 5, 5, writable = false) + @test !y.writable + end + + @info "NDArray::NDArray(undef, dims)" + let + x = NDArray(undef, (5, 5)) + @test eltype(x) == mx.MX_float + @test size(x) == (5, 5) + @test x.writable + + y = NDArray(undef, (5, 5), writable = false) + @test !y.writable + end end # function test_constructor @@ -134,8 +206,8 @@ function test_assign() @info("NDArray::assign::dims = $dims") # Julia Array -> NDArray assignment - array = mx.empty(size(tensor)) - array[:]= tensor + array = NDArray(undef, size(tensor)...) + array[:] = tensor @test tensor ≈ copy(array) array2 = mx.zeros(size(tensor)) @@ -1006,14 +1078,14 @@ end function test_eltype() @info("NDArray::eltype") - dims1 = (3,3) + dims = (3,3) - x = mx.empty(dims1) + x = NDArray(undef, dims) @test eltype(x) == mx.DEFAULT_DTYPE for TF in instances(mx.TypeFlag) T = mx.fromTypeFlag(TF) - x = mx.empty(T, dims1) + x = NDArray{T}(undef, dims) @test eltype(x) == T end end @@ -1434,6 +1506,50 @@ function test_hypot() @test copy(z) == C end # function test_hypot +function test_argmax() + @info "NDArray::argmax" + let + A = [1. 5 3; + 4 2 6] + x = NDArray(A) + + @test copy(argmax(x, dims = 1)) == [2 1 2] + @test copy(argmax(x, dims = 2)) == reshape([2, 3], :, 1) + end + + @info "NDArray::argmax::NaN" + let + A = [1. 5 3; + NaN 2 6] + x = NDArray(A) + + @test copy(argmax(x, dims = 1)) == [1 1 2] + @test copy(argmax(x, dims = 2)) == reshape([2, 3], :, 1) + end +end + +function test_argmin() + @info "NDArray::argmin" + let + A = [1. 5 3; + 4 2 6] + x = NDArray(A) + + @test copy(argmin(x, dims = 1)) == [1 2 1] + @test copy(argmin(x, dims = 2)) == reshape([1, 2], :, 1) + end + + @info "NDArray::argmin::NaN" + let + A = [1. 
5 3; + NaN 2 6] + x = NDArray(A) + + @test copy(argmin(x, dims = 1)) == [1 2 1] + @test copy(argmin(x, dims = 2)) == reshape([1, 2], :, 1) + end +end + ################################################################################ # Run tests ################################################################################ @@ -1479,6 +1595,8 @@ end # function test_hypot test_broadcast_to() test_broadcast_axis() test_hypot() + test_argmax() + test_argmin() end end diff --git a/julia/test/unittest/random.jl b/julia/test/unittest/random.jl index 013e4f609..38da9601a 100644 --- a/julia/test/unittest/random.jl +++ b/julia/test/unittest/random.jl @@ -30,7 +30,7 @@ function test_uniform() ret1 = mx.rand(dims..., low = low, high = high) mx.seed!(seed) - ret2 = mx.empty(dims) + ret2 = NDArray(undef, dims) mx.rand!(ret2, low = low, high = high) @test copy(ret1) == copy(ret2) @@ -47,7 +47,7 @@ function test_gaussian() ret1 = mx.randn(dims..., μ = μ, σ = σ) mx.seed!(seed) - ret2 = mx.empty(dims) + ret2 = NDArray(undef, dims) mx.randn!(ret2, μ = μ, σ = σ) @test copy(ret1) == copy(ret2) diff --git a/make/config.mk b/make/config.mk index 81d86e3ca..7beae8d84 100644 --- a/make/config.mk +++ b/make/config.mk @@ -104,40 +104,6 @@ USE_OPENMP = 1 # you can disable it explicity with USE_MKLDNN = 0 USE_MKLDNN = -# The following variables influence if/how MXnet is built with nGraph support: -# -# USE_NGRAPH - If 1, then MXnet will built with nGraph support. -# -# USE_NGRAPH_IE - If 1, then if/when nGraph is built it will contain support for inference-engine -# processing. -# -# USE_NGRAPH_GPU - If 1, then if/when nGraph is built it will contain support for NVIDIA GPU -# processing. Requires USE_CUDA and USE_CUDA_PATH to be set above. -# -# USE_NGRAPH_DISTRIBUTED - If 1, then if/when nGraph is built it will contain support for -# distributed processing. -# -# NGRAPH_EXTRA_CMAKE_FLAGS - Additional command-line arguments passed to the 'cmake' invocation -# used to configure the nGraph build. -# -# NGRAPH_EXTRA_MAKE_FLAGS - Additional command-line arugments passed to the 'make' invocation -# used to build nGraph. -# -# ADD_NGRAPH_LIBDIR_TO_MXNET_RPATH - If USE_NGRAPH=1 and ADD_NGRAPH_LIBDIR_TO_MXNET_RPATH=1, then -# 'libmxnet.so' will be linked with linker option `-rpath=D`, where "D" is whatever directory -# contains the copy of `libngraph.so` used by MXnet's build system. - -USE_NGRAPH = 0 -USE_NGRAPH_IE = 0 -USE_NGRAPH_GPU = 0 -USE_NGRAPH_DISTRIBUTED = 0 - -# use prebuilt mkldnn if set -ifneq ($(MKLDNNROOT),) - override NGRAPH_EXTRA_CMAKE_FLAGS += -DMKLDNN_INCLUDE_DIR=$(MKLDNNROOT)/include - override NGRAPH_EXTRA_CMAKE_FLAGS += -DMKLDNN_LIB_DIR=$(MKLDNNROOT)/lib -endif - # whether use NNPACK library USE_NNPACK = 0 diff --git a/make/maven/maven_darwin_mkl.mk b/make/maven/maven_darwin_mkl.mk new file mode 100644 index 000000000..f5b77ae78 --- /dev/null +++ b/make/maven/maven_darwin_mkl.mk @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc

+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -lz -framework CoreFoundation -framework Security -Wl,-exported_symbols_list,$(CURDIR)/make/config/libmxnet.sym,-rpath,'$${ORIGIN}',-dead_strip
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# by default use atlas for linux while apple for osx
+USE_BLAS=apple
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 0
+
+# add the path to CUDA library to link and compile flag
+# if you have already added them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = NONE
+
+# whether use CuDNN R3 library
+USE_CUDNN = 0
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 0
+
+# use openmp for parallelization
+USE_OPENMP = 0
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+# MKL ML Library folder, needs to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If using MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+  USE_SSE=0
+else
+  USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not to allow reading and writing HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not to allow reading and writing AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required; it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing project-specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires building sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/maven/maven_linux_cu90mkl.mk b/make/maven/maven_linux_cu90mkl.mk
new file mode 100644
index 000000000..661f444ce
--- /dev/null
+++ b/make/maven/maven_linux_cu90mkl.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# by default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already added them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, needs to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If using MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+  USE_SSE=0
+else
+  USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not to allow reading and writing HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not to allow reading and writing AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required; it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing project-specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires building sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/maven/maven_linux_cu92mkl.mk b/make/maven/maven_linux_cu92mkl.mk
new file mode 100644
index 000000000..ecd252769
--- /dev/null
+++ b/make/maven/maven_linux_cu92mkl.mk
@@ -0,0 +1,185 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------------------------
+#  Template configuration for compiling mxnet for making maven package
+#-------------------------------------------------------------------------------
+
+#---------------------
+# choice of compiler
+#--------------------
+
+export CC = gcc
+export CXX = g++
+export NVCC = nvcc
+
+# whether compile with options for MXNet developer
+DEV = 0
+
+# whether compile with debug
+DEBUG = 0
+
+# whether compile with profiler
+USE_PROFILER = 1
+
+# whether to turn on signal handler (e.g. segfault logger)
+USE_SIGNAL_HANDLER = 1
+
+# the additional link flags you want to add
+ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections
+
+# the additional compile flags you want to add
+ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections
+
+#---------------------------------------------
+# matrix computation libraries for CPU/GPU
+#---------------------------------------------
+
+# choose the version of blas you want to use
+# can be: mkl, blas, atlas, openblas
+# by default use atlas for linux while apple for osx
+USE_BLAS=openblas
+
+# whether use opencv during compilation
+# you can disable it, however, you will not be able to use
+# imbin iterator
+USE_OPENCV = 1
+
+# whether use CUDA during compile
+USE_CUDA = 1
+
+# add the path to CUDA library to link and compile flag
+# if you have already added them to environment variable, leave it as NONE
+# USE_CUDA_PATH = /usr/local/cuda
+USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.2
+
+# whether to use CuDNN library
+USE_CUDNN = 1
+
+# whether to use NCCL library
+USE_NCCL = 1
+
+# CUDA architecture setting: going with all of them.
+# For CUDA < 6.0, comment the *_50 lines for compatibility.
+# CUDA_ARCH :=
+
+# whether use cuda runtime compiling for writing kernels in native language (i.e. Python)
+USE_NVRTC = 1
+
+# use openmp for parallelization
+USE_OPENMP = 1
+USE_OPERATOR_TUNING = 1
+USE_LIBJPEG_TURBO = 1
+
+# whether use MKL-DNN library
+USE_MKLDNN = 1
+
+
+# MKL ML Library for Intel CPU/Xeon Phi
+# Please refer to MKL_README.md for details
+
+# MKL ML Library folder, needs to be root for /usr/local
+# Change to User Home directory for standard user
+# For USE_BLAS!=mkl only
+MKLML_ROOT=/usr/local
+
+# whether use MKL2017 library
+USE_MKL2017 = 0
+
+# whether use MKL2017 experimental feature for high performance
+# Prerequisite USE_MKL2017=1
+USE_MKL2017_EXPERIMENTAL = 0
+
+# whether use NNPACK library
+USE_NNPACK = 0
+
+# whether use lapack during compilation
+# only effective when compiled with blas versions openblas/apple/atlas/mkl
+USE_LAPACK = 1
+
+# path to lapack library in case of a non-standard installation
+USE_LAPACK_PATH = $(DEPS_PATH)/lib
+
+# add path to intel library, you may need it for MKL, if you did not add the path
+# to environment variable
+USE_INTEL_PATH = NONE
+
+# If using MKL, choose static link automatically to allow python wrapper
+ifeq ($(USE_BLAS), mkl)
+USE_STATIC_MKL = 1
+else
+USE_STATIC_MKL = NONE
+endif
+
+#----------------------------
+# Settings for power and arm arch
+#----------------------------
+ARCH := $(shell uname -a)
+ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64))
+  USE_SSE=0
+else
+  USE_SSE=1
+endif
+
+#----------------------------
+# distributed computing
+#----------------------------
+
+# whether or not to enable multi-machine support
+USE_DIST_KVSTORE = 1
+
+# whether or not to allow reading and writing HDFS directly. If yes, then hadoop is
+# required
+USE_HDFS = 0
+
+# path to libjvm.so. required if USE_HDFS=1
+LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server
+
+# whether or not to allow reading and writing AWS S3 directly. If yes, then
+# libcurl4-openssl-dev is required; it can be installed on Ubuntu by
+# sudo apt-get install -y libcurl4-openssl-dev
+USE_S3 = 1
+
+#----------------------------
+# additional operators
+#----------------------------
+
+# path to folders containing project-specific operators that you don't want to put in src/operators
+EXTRA_OPERATORS =
+
+
+#----------------------------
+# plugins
+#----------------------------
+
+# whether to use caffe integration. This requires installing caffe.
+# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH
+# CAFFE_PATH = $(HOME)/caffe
+# MXNET_PLUGINS += plugin/caffe/caffe.mk
+
+# whether to use torch integration. This requires installing torch.
+# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH
+# TORCH_PATH = $(HOME)/torch
+# MXNET_PLUGINS += plugin/torch/torch.mk
+
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+# whether to use sframe integration. This requires building sframe
+# git@github.com:dato-code/SFrame.git
+# SFRAME_PATH = $(HOME)/SFrame
+# MXNET_PLUGINS += plugin/sframe/plugin.mk
diff --git a/make/maven/maven_linux_mkl.mk b/make/maven/maven_linux_mkl.mk
new file mode 100644
index 000000000..6cb9f326b
--- /dev/null
+++ b/make/maven/maven_linux_mkl.mk
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------------------------- +# Template configuration for compiling mxnet for making maven package +#------------------------------------------------------------------------------- + +#--------------------- +# choice of compiler +#-------------------- + +export CC = gcc +export CXX = g++ +export NVCC = nvcc + +# whether compile with options for MXNet developer +DEV = 0 + +# whether compile with debug +DEBUG = 0 + +# whether compiler with profiler +USE_PROFILER = 1 + +# whether to turn on signal handler (e.g. segfault logger) +USE_SIGNAL_HANDLER = 1 + +# the additional link flags you want to add +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -lgfortran -ldl -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections + +# the additional compile flags you want to add +ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections + +#--------------------------------------------- +# matrix computation libraries for CPU/GPU +#--------------------------------------------- + +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas +# in default use atlas for linux while apple for osx +USE_BLAS=openblas + +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 1 + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA library to link and compile flag +# if you have already add them to environment variable, leave it as NONE +# USE_CUDA_PATH = /usr/local/cuda +USE_CUDA_PATH = NONE + +# whether use CuDNN R3 library +USE_CUDNN = 0 + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +# CUDA_ARCH := + +# whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) +USE_NVRTC = 0 + +# use openmp for parallelization +USE_OPENMP = 1 +USE_OPERATOR_TUNING = 1 +USE_LIBJPEG_TURBO = 1 + +# whether use MKL-DNN library +USE_MKLDNN = 1 + + +# MKL ML Library for Intel CPU/Xeon Phi +# Please refer to MKL_README.md for details + +# MKL ML Library folder, need to be root for /usr/local +# Change to User Home directory for standard user +# For USE_BLAS!=mkl only +MKLML_ROOT=/usr/local + +# whether use MKL2017 library +USE_MKL2017 = 0 + +# whether use MKL2017 experimental feature for high performance +# Prerequisite USE_MKL2017=1 +USE_MKL2017_EXPERIMENTAL = 0 + +# whether use NNPACK library +USE_NNPACK = 0 + +# whether use lapack during compilation +# only effective when compiled with blas versions openblas/apple/atlas/mkl +USE_LAPACK = 1 + +# path to lapack library in case of a non-standard installation +USE_LAPACK_PATH = $(DEPS_PATH)/lib + +# add path to intel library, you may need it for MKL, if you did not add the path +# to environment variable +USE_INTEL_PATH = NONE + +# If use MKL, choose static link automatically to allow python wrapper +ifeq ($(USE_BLAS), mkl) +USE_STATIC_MKL = 1 +else +USE_STATIC_MKL = NONE +endif + +#---------------------------- +# Settings for power and arm arch +#---------------------------- +ARCH := $(shell uname -a) +ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) + USE_SSE=0 +else + USE_SSE=1 +endif + +#---------------------------- +# distributed computing +#---------------------------- + +# whether or not to enable multi-machine supporting +USE_DIST_KVSTORE = 1 + +# whether or not allow to read and write HDFS directly. If yes, then hadoop is +# required +USE_HDFS = 0 + +# path to libjvm.so. required if USE_HDFS=1 +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# whether or not allow to read and write AWS S3 directly. If yes, then +# libcurl4-openssl-dev is required, it can be installed on Ubuntu by +# sudo apt-get install -y libcurl4-openssl-dev +USE_S3 = 1 + +#---------------------------- +# additional operators +#---------------------------- + +# path to folders containing projects specific operators that you don't want to put in src/operators +EXTRA_OPERATORS = + + +#---------------------------- +# plugins +#---------------------------- + +# whether to use caffe integration. This requires installing caffe. +# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH +# CAFFE_PATH = $(HOME)/caffe +# MXNET_PLUGINS += plugin/caffe/caffe.mk + +# whether to use torch integration. This requires installing torch. +# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH +# TORCH_PATH = $(HOME)/torch +# MXNET_PLUGINS += plugin/torch/torch.mk + +# WARPCTC_PATH = $(HOME)/warp-ctc +# MXNET_PLUGINS += plugin/warpctc/warpctc.mk + +# whether to use sframe integration. 
This requires build sframe +# git@github.com:dato-code/SFrame.git +# SFRAME_PATH = $(HOME)/SFrame +# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/perl-package/AI-MXNetCAPI/mxnet.i b/perl-package/AI-MXNetCAPI/mxnet.i index b1907f5cd..0e6a05ea9 100644 --- a/perl-package/AI-MXNetCAPI/mxnet.i +++ b/perl-package/AI-MXNetCAPI/mxnet.i @@ -1618,6 +1618,7 @@ int MXExecutorReshape(int partial_shaping, int MXExecutorSetMonitorCallback(ExecutorHandle handle, ExecutorMonitorCallback callback, void* callback_handle); + //-------------------------------------------- // Part 5: IO Interface //-------------------------------------------- @@ -2167,4 +2168,3 @@ int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** cuda_kernel_ mx_uint grid_dim_z, mx_uint block_dim_x, mx_uint block_dim_y, mx_uint block_dim_z, mx_uint shared_mem); - diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py index 51deb4fca..e077824e0 100644 --- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py +++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py @@ -219,6 +219,68 @@ def convert_convolution(node, **kwargs): return [conv_node] +@mx_op.register("Deconvolution") +def convert_deconvolution(node, **kwargs): + """Map MXNet's deconvolution operator attributes to onnx's ConvTranspose operator + and return the created node. + """ + name, inputs, attrs = get_inputs(node, kwargs) + + kernel_dims = list(parse_helper(attrs, "kernel")) + stride_dims = list(parse_helper(attrs, "stride", [1, 1])) + pad_dims = list(parse_helper(attrs, "pad", [0, 0])) + num_group = int(attrs.get("num_group", 1)) + dilations = list(parse_helper(attrs, "dilate", [1, 1])) + adj_dims = list(parse_helper(attrs, "adj", [0, 0])) + + pad_dims = pad_dims + pad_dims + + deconv_node = onnx.helper.make_node( + "ConvTranspose", + inputs=inputs, + outputs=[name], + kernel_shape=kernel_dims, + strides=stride_dims, + dilations=dilations, + output_padding=adj_dims, + pads=pad_dims, + group=num_group, + name=name + ) + + return [deconv_node] + + +@mx_op.register("Crop") +def convert_crop(node, **kwargs): + """Map MXNet's crop operator attributes to onnx's Crop operator + and return the created node. + """ + name, inputs, attrs = get_inputs(node, kwargs) + num_inputs = len(inputs) + + y, x = list(parse_helper(attrs, "offset", [0, 0])) + h, w = list(parse_helper(attrs, "h_w", [0, 0])) + if num_inputs > 1: + h, w = kwargs["out_shape"][-2:] + border = [x, y, x + w, y + h] + + crop_node = onnx.helper.make_node( + "Crop", + inputs=[inputs[0]], + outputs=[name], + border=border, + scale=[1, 1], + name=name + ) + + logging.warning( + "Using an experimental ONNX operator: Crop. 
" \ + "Its definition can change.") + + return [crop_node] + + @mx_op.register("FullyConnected") def convert_fully_connected(node, **kwargs): """Map MXNet's FullyConnected operator attributes to onnx's Gemm operator @@ -583,8 +645,8 @@ def convert_pooling(node, **kwargs): name, input_nodes, attrs = get_inputs(node, kwargs) kernel = eval(attrs["kernel"]) - pool_type = attrs["pool_type"] - stride = eval(attrs["stride"]) if attrs.get("stride") else None + pool_type = attrs["pool_type"] if attrs.get("pool_type") else "max" + stride = eval(attrs["stride"]) if attrs.get("stride") else (1, 1) global_pool = get_boolean_attribute_value(attrs, "global_pool") p_value = attrs.get('p_value', 'None') @@ -1907,3 +1969,79 @@ def convert_roipooling(node, **kwargs): name=name ) return [node] + + +@mx_op.register("tile") +def convert_tile(node, **kwargs): + """Map MXNet's Tile operator attributes to onnx's Tile + operator and return the created node. + """ + name, input_nodes, attrs = get_inputs(node, kwargs) + + reps_list = convert_string_to_list(attrs["reps"]) + + initializer = kwargs["initializer"] + reps_shape_np = np.array(reps_list, dtype='int64') + data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[reps_shape_np.dtype] + dims = np.shape(reps_shape_np) + + output_shape_name = "reps_attr_tensor" + str(kwargs["idx"]) + tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims) + + initializer.append( + onnx.helper.make_tensor( + name=output_shape_name, + data_type=data_type, + dims=dims, + vals=reps_list, + raw=False, + ) + ) + + input_nodes.append(output_shape_name) + tile_node = onnx.helper.make_node( + "Tile", + input_nodes, + [name], + name=name + ) + + return [tensor_node, tile_node] + + +@mx_op.register("broadcast_to") +def convert_broadcast_to(node, **kwargs): + """Map MXNet's broadcast_to operator attributes to onnx's Expand + operator and return the created node. + """ + name, input_nodes, attrs = get_inputs(node, kwargs) + + shape_list = convert_string_to_list(attrs["shape"]) + + initializer = kwargs["initializer"] + output_shape_np = np.array(shape_list, dtype='int64') + data_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[output_shape_np.dtype] + dims = np.shape(output_shape_np) + + output_shape_name = "expand_attr_tensor" + str(kwargs["idx"]) + tensor_node = onnx.helper.make_tensor_value_info(output_shape_name, data_type, dims) + + initializer.append( + onnx.helper.make_tensor( + name=output_shape_name, + data_type=data_type, + dims=dims, + vals=shape_list, + raw=False, + ) + ) + + input_nodes.append(output_shape_name) + expand_node = onnx.helper.make_node( + "Expand", + input_nodes, + [name], + name=name + ) + + return [tensor_node, expand_node] diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index 61ad8a3ec..96183bb7a 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -80,8 +80,7 @@ def _quantize_params(qsym, params, th_dict): quantized_params[name] = ndarray.array([th_dict[output][1]]) return quantized_params -def _quantize_symbol(sym, excluded_symbols=None, offline_params=None, - quantized_dtype='int8', calib_quantize_op=False): +def _quantize_symbol(sym, excluded_symbols=None, offline_params=None, quantized_dtype='int8'): """Given a symbol object representing a neural network of data type FP32, quantize it into a INT8 network. @@ -98,8 +97,6 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None, avoided. 
quantized_dtype: str The quantized destination type for input data. - calib_quantize_op : bool - Whether perform offline calibration for quantize op. """ num_excluded_symbols = 0 if excluded_symbols is not None: @@ -123,7 +120,7 @@ def _quantize_symbol(sym, excluded_symbols=None, offline_params=None, mx_uint(num_offline), c_array(ctypes.c_char_p, offline), c_str(quantized_dtype), - ctypes.c_bool(calib_quantize_op))) + ctypes.c_bool(True))) return Symbol(out) @@ -151,7 +148,6 @@ def collect(self, name, arr): else: self.nd_dict[name] = [arr] - class _LayerOutputMinMaxCollector(object): """Saves layer output min and max values in a dict with layer names as keys. The collected min and max values will be directly used as thresholds for quantization. @@ -177,10 +173,9 @@ def collect(self, name, arr): else: self.min_max_dict[name] = (min_range, max_range) if self.logger is not None: - self.logger.info("Collecting layer %s output min_range=%f, max_range=%f" + self.logger.info("Collecting layer %s min_range=%f, max_range=%f" % (name, min_range, max_range)) - def _calibrate_quantized_sym(qsym, th_dict): """Given a dictionary containing the thresholds for quantizing the layers, set the thresholds into the quantized symbol as the params of requantize operators. @@ -210,7 +205,7 @@ def _collect_layer_statistics(mod, data, collector, max_num_examples=None, logge if not isinstance(data, DataIter): raise ValueError('Only supports data as a type of DataIter, while received type %s' % str(type(data))) - mod._exec_group.execs[0].set_monitor_callback(collector.collect) + mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) num_batches = 0 num_examples = 0 for batch in data: @@ -265,6 +260,9 @@ def _smooth_distribution(p, eps=0.0001): # pylint: disable=line-too-long def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): """Given a dataset, find the optimal threshold for quantizing it. + The reference distribution is `q`, and the candidate distribution is `p`. + `q` is a truncated version of the original distribution. 
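# [Editor's note] A minimal sketch of the KL-divergence comparison the
# docstring above describes, assuming SciPy is available; `p_hist` and
# `q_hist` are hypothetical histograms, not MXNet internals. A smaller
# divergence indicates a better candidate threshold.
import numpy as np
from scipy import stats

def _kl_divergence(p_hist, q_hist, eps=0.0001):
    # Lightly smooth both histograms (the library uses the fancier
    # _smooth_distribution above) so stats.entropy sees valid distributions.
    p = p_hist.astype(np.float64) + eps
    q = q_hist.astype(np.float64) + eps
    return stats.entropy(p / p.sum(), q / q.sum())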
+ Ref: http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf """ if isinstance(arr, NDArray): @@ -290,8 +288,6 @@ def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): hist, hist_edges = np.histogram(arr, bins=num_bins, range=(-th, th)) zero_bin_idx = num_bins // 2 num_half_quantized_bins = num_quantized_bins // 2 - assert np.allclose(hist_edges[zero_bin_idx] + hist_edges[zero_bin_idx + 1], - 0, rtol=1e-5, atol=1e-7) thresholds = np.zeros(num_bins // 2 + 1 - num_quantized_bins // 2) divergence = np.zeros_like(thresholds) @@ -315,10 +311,10 @@ def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): right_outlier_count = np.sum(hist[p_bin_idx_stop:]) p[-1] += right_outlier_count # is_nonzeros[k] indicates whether hist[k] is nonzero - is_nonzeros = (sliced_nd_hist != 0).astype(np.int32) + is_nonzeros = (p != 0).astype(np.int32) # calculate how many bins should be merged to generate quantized distribution q - num_merged_bins = p.size // num_quantized_bins + num_merged_bins = sliced_nd_hist.size // num_quantized_bins # merge hist into num_quantized_bins bins for j in range(num_quantized_bins): start = j * num_merged_bins @@ -326,17 +322,17 @@ def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): quantized_bins[j] = sliced_nd_hist[start:stop].sum() quantized_bins[-1] += sliced_nd_hist[num_quantized_bins * num_merged_bins:].sum() # expand quantized_bins into p.size bins - q = np.zeros(p.size, dtype=np.float32) + q = np.zeros(sliced_nd_hist.size, dtype=np.float32) for j in range(num_quantized_bins): start = j * num_merged_bins if j == num_quantized_bins - 1: - stop = -1 + stop = len(is_nonzeros) else: stop = start + num_merged_bins norm = is_nonzeros[start:stop].sum() if norm != 0: q[start:stop] = float(quantized_bins[j]) / float(norm) - q[sliced_nd_hist == 0] = 0 + q[p == 0] = 0 p = _smooth_distribution(p) # There is a chance that q is an invalid probability distribution. try: @@ -344,7 +340,6 @@ def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255): except ValueError: divergence[i - num_half_quantized_bins] = float("inf") divergence[i - num_half_quantized_bins] = stats.entropy(p, q) - quantized_bins[:] = 0 min_divergence_idx = np.argmin(divergence) min_divergence = divergence[min_divergence_idx] @@ -424,7 +419,7 @@ def quantize_model(sym, arg_params, aux_params, data_names=('data',), label_names=('softmax_label',), ctx=cpu(), excluded_sym_names=None, calib_mode='entropy', calib_data=None, num_calib_examples=None, calib_layer=None, - quantized_dtype='int8', calib_quantize_op=False, logger=logging): + quantized_dtype='int8', logger=logging): """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration. The backend quantized operators are only enabled for Linux systems. Please do not run inference using the quantized models on Windows for now. @@ -476,9 +471,8 @@ def quantize_model(sym, arg_params, aux_params, all the layers' outputs that need requantization will be collected. quantized_dtype : str The quantized destination type for input data. Currently support 'int8' - and 'uint8', default value is 'int8'. - calib_quantize_op: bool - Whether calibrate quantize op with its input calibration data. The quantize op's input should be in calib_layer + , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. + Default value is 'int8'. 
logger : Object A logging object for printing information during the process of quantization. @@ -496,13 +490,12 @@ def quantize_model(sym, arg_params, aux_params, ' while received type %s' % str(type(excluded_sym_names))) logger.info('Quantizing symbol') - if quantized_dtype not in ('int8', 'uint8'): + if quantized_dtype not in ('int8', 'uint8', 'auto'): raise ValueError('unknown quantized_dtype %s received,' - ' expected `int8` or `uint8`' % quantized_dtype) + ' expected `int8`, `uint8` or `auto`' % quantized_dtype) qsym = _quantize_symbol(sym, excluded_symbols=excluded_sym_names, offline_params=list(arg_params.keys()), - quantized_dtype=quantized_dtype, - calib_quantize_op=calib_quantize_op) + quantized_dtype=quantized_dtype) th_dict = {} if calib_mode is not None and calib_mode != 'none': diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index fcd540623..7bf867579 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -234,13 +234,15 @@ def backward(self, out_grads=None, is_train=True): ndarray, ctypes.c_int(is_train))) - def set_monitor_callback(self, callback): + def set_monitor_callback(self, callback, monitor_all=False): """Install callback for monitor. Parameters ---------- callback : function Takes a string and an NDArrayHandle. + monitor_all : bool, default False + If true, monitor both input and output, otherwise monitor output only. Examples -------- @@ -251,10 +253,11 @@ def set_monitor_callback(self, callback): """ cb_type = ctypes.CFUNCTYPE(None, ctypes.c_char_p, NDArrayHandle, ctypes.c_void_p) self._monitor_callback = cb_type(_monitor_callback_wrapper(callback)) - check_call(_LIB.MXExecutorSetMonitorCallback( + check_call(_LIB.MXExecutorSetMonitorCallbackEX( self.handle, self._monitor_callback, - None)) + None, + ctypes.c_int(monitor_all))) @property def arg_dict(self): diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 28fea1592..56f0809b3 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -165,7 +165,10 @@ class SyncBatchNorm(BatchNorm): Standard BN [1]_ implementation only normalize the data within each device. SyncBN normalizes the input within the whole mini-batch. - We follow the sync-onece implmentation described in the paper [2]_. + We follow the implementation described in the paper [2]_. + + Note: Current implementation of SyncBN does not support FP16 training. + For FP16 inference, use standard nn.BatchNorm instead of SyncBN. Parameters ---------- diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms.py index 175076925..9310e15f5 100644 --- a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms.py @@ -96,17 +96,20 @@ def hybrid_forward(self, F, x): class ToTensor(HybridBlock): - """Converts an image NDArray to a tensor NDArray. + """Converts an image NDArray or batch of image NDArray to a tensor NDArray. Converts an image NDArray of shape (H x W x C) in the range [0, 255] to a float32 tensor NDArray of shape (C x H x W) in the range [0, 1). + If batch input, converts a batch image NDArray of shape (N x H x W x C) in the + range [0, 255] to a float32 tensor NDArray of shape (N x C x H x W). + Inputs: - - **data**: input tensor with (H x W x C) shape and uint8 type. + - **data**: input tensor with (H x W x C) or (N x H x W x C) shape and uint8 type. 
Outputs: - - **out**: output tensor with (C x H x W) shape and float32 type. + - **out**: output tensor with (C x H x W) or (N x C x H x W) shape and float32 type. Examples -------- @@ -135,7 +138,7 @@ def hybrid_forward(self, F, x): class Normalize(HybridBlock): - """Normalize an tensor of shape (C x H x W) with mean and + """Normalize a tensor of shape (C x H x W) or (N x C x H x W) with mean and standard deviation. Given mean `(m1, ..., mn)` and std `(s1, ..., sn)` for `n` channels, @@ -154,12 +157,31 @@ class Normalize(HybridBlock): Inputs: - - **data**: input tensor with (C x H x W) shape. + - **data**: input tensor with (C x H x W) or (N x C x H x W) shape. Outputs: - **out**: output tensor with the shape as `data`. + + Examples + -------- + >>> transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + >>> image = mx.nd.random.uniform(0, 1, (3, 4, 2)) + >>> transformer(image) + [[[ 0.18293785 0.19761486] + [ 0.23839645 0.28142193] + [ 0.20092112 0.28598186] + [ 0.18162774 0.28241724]] + [[-0.2881726 -0.18821815] + [-0.17705294 -0.30780914] + [-0.2812064 -0.3512327 ] + [-0.05411351 -0.4716435 ]] + [[-1.0363373 -1.7273437 ] + [-1.6165586 -1.5223348 ] + [-1.208275 -1.1878313 ] + [-1.4711051 -1.5200229 ]]] + """ - def __init__(self, mean, std): + def __init__(self, mean=0.0, std=1.0): super(Normalize, self).__init__() self._mean = mean self._std = std @@ -243,8 +265,8 @@ def forward(self, x): return image.center_crop(x, *self._args)[0] -class Resize(Block): - """Resize an image to the given size. +class Resize(HybridBlock): + """Resize an image or a batch of images to the given size. Should be applied before `mxnet.gluon.data.vision.transforms.ToTensor`. Parameters @@ -257,13 +279,17 @@ class Resize(Block): interpolation : int Interpolation method for resizing. By default uses bilinear interpolation. See OpenCV's resize function for available choices. + Note that on GPU, Resize uses the contrib.bilinearResize2D operator, which + only supports bilinear interpolation (1), so results may differ slightly + between GPU and CPU: OpenCV tends to align centers, while bilinearResize2D + aligns corners. Inputs: - - **data**: input tensor with (Hi x Wi x C) shape. + - **data**: input tensor with (H x W x C) or (N x H x W x C) shape. Outputs: - - **out**: output tensor with (H x W x C) shape. + - **out**: output tensor with (H x W x C) or (N x H x W x C) shape. 
Examples -------- @@ -271,6 +297,9 @@ class Resize(Block): >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8) >>> transformer(image) + >>> image = mx.nd.random.uniform(0, 255, (3, 224, 224, 3)).astype(dtype=np.uint8) + >>> transformer(image) + """ def __init__(self, size, keep_ratio=False, interpolation=1): super(Resize, self).__init__() @@ -278,23 +307,8 @@ def __init__(self, size, keep_ratio=False, interpolation=1): self._size = size self._interpolation = interpolation - def forward(self, x): - if isinstance(self._size, numeric_types): - if not self._keep: - wsize = self._size - hsize = self._size - else: - h, w, _ = x.shape - if h > w: - wsize = self._size - hsize = int(h * wsize / w) - else: - hsize = self._size - wsize = int(w * hsize / h) - else: - wsize, hsize = self._size - return image.imresize(x, wsize, hsize, self._interpolation) - + def hybrid_forward(self, F, x): + return F.image.resize(x, self._size, self._keep, self._interpolation) class RandomFlipLeftRight(HybridBlock): """Randomly flip the input image left to right with a probability diff --git a/python/mxnet/gluon/loss.py b/python/mxnet/gluon/loss.py index 7b5832e1a..29d0105ae 100644 --- a/python/mxnet/gluon/loss.py +++ b/python/mxnet/gluon/loss.py @@ -99,11 +99,11 @@ def hybrid_forward(self, F, x, *args, **kwargs): class L2Loss(Loss): - r"""Calculates the mean squared error between `pred` and `label`. + r"""Calculates the mean squared error between `label` and `pred`. - .. math:: L = \frac{1}{2} \sum_i \vert {pred}_i - {label}_i \vert^2. + .. math:: L = \frac{1}{2} \sum_i \vert {label}_i - {pred}_i \vert^2. - `pred` and `label` can have arbitrary shape as long as they have the same + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters @@ -131,17 +131,17 @@ def __init__(self, weight=1., batch_axis=0, **kwargs): def hybrid_forward(self, F, pred, label, sample_weight=None): label = _reshape_like(F, label, pred) - loss = F.square(pred - label) + loss = F.square(label - pred) loss = _apply_weighting(F, loss, self._weight/2, sample_weight) return F.mean(loss, axis=self._batch_axis, exclude=True) class L1Loss(Loss): - r"""Calculates the mean absolute error between `pred` and `label`. + r"""Calculates the mean absolute error between `label` and `pred`. - .. math:: L = \sum_i \vert {pred}_i - {label}_i \vert. + .. math:: L = \sum_i \vert {label}_i - {pred}_i \vert. - `pred` and `label` can have arbitrary shape as long as they have the same + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters @@ -169,7 +169,7 @@ def __init__(self, weight=None, batch_axis=0, **kwargs): def hybrid_forward(self, F, pred, label, sample_weight=None): label = _reshape_like(F, label, pred) - loss = F.abs(pred - label) + loss = F.abs(label - pred) loss = _apply_weighting(F, loss, self._weight, sample_weight) return F.mean(loss, axis=self._batch_axis, exclude=True) @@ -195,7 +195,7 @@ class SigmoidBinaryCrossEntropyLoss(Loss): (1 - {label}_i) * \log(1 - {pred}_i) - `pred` and `label` can have arbitrary shape as long as they have the same + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters @@ -344,7 +344,7 @@ class KLDivLoss(Loss): L = \sum_i {label}_i * \big[\log({label}_i) - log({pred}_i)\big] - `pred` and `label` can have arbitrary shape as long as they have the same + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. 
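# [Editor's note] A quick numeric check of the reordered loss formulas in this
# file, using L2Loss; the reordering is cosmetic since
# (label - pred)^2 == (pred - label)^2.
import mxnet as mx
from mxnet.gluon.loss import L2Loss

pred = mx.nd.array([1.0, 2.0, 3.0])
label = mx.nd.array([1.5, 2.0, 2.0])
print(L2Loss()(pred, label))  # per-sample 0.5*(label - pred)^2 -> [0.125, 0., 0.5]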
Parameters @@ -481,13 +481,13 @@ class HuberLoss(Loss): exceeds rho but is equal to L2 loss otherwise. Also called SmoothedL1 loss. .. math:: - L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({pred}_i - {label}_i)^2 & - \text{ if } |{pred}_i - {label}_i| < {rho} \\ - |{pred}_i - {label}_i| - \frac{{rho}}{2} & + L = \sum_i \begin{cases} \frac{1}{2 {rho}} ({label}_i - {pred}_i)^2 & + \text{ if } |{label}_i - {pred}_i| < {rho} \\ + |{label}_i - {pred}_i| - \frac{{rho}}{2} & \text{ otherwise } \end{cases} - `pred` and `label` can have arbitrary shape as long as they have the same + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters @@ -518,7 +518,7 @@ def __init__(self, rho=1, weight=None, batch_axis=0, **kwargs): def hybrid_forward(self, F, pred, label, sample_weight=None): label = _reshape_like(F, label, pred) - loss = F.abs(pred - label) + loss = F.abs(label - pred) loss = F.where(loss > self._rho, loss - 0.5 * self._rho, (0.5/self._rho) * F.square(loss)) loss = _apply_weighting(F, loss, self._weight, sample_weight) @@ -532,7 +532,7 @@ class HingeLoss(Loss): L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i) where `pred` is the classifier prediction and `label` is the target tensor - containing values -1 or 1. `pred` and `label` must have the same number of + containing values -1 or 1. `label` and `pred` must have the same number of elements. Parameters @@ -576,7 +576,7 @@ class SquaredHingeLoss(Loss): L = \sum_i max(0, {margin} - {pred}_i \cdot {label}_i)^2 where `pred` is the classifier prediction and `label` is the target tensor - containing values -1 or 1. `pred` and `label` can have arbitrary shape as + containing values -1 or 1. `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters @@ -621,7 +621,7 @@ class LogisticLoss(Loss): where `pred` is the classifier prediction and `label` is the target tensor containing values -1 or 1 (0 or 1 if `label_format` is binary). - `pred` and `label` can have arbitrary shape as long as they have the same number of elements. + `label` and `pred` can have arbitrary shape as long as they have the same number of elements. Parameters ---------- @@ -666,14 +666,14 @@ def hybrid_forward(self, F, pred, label, sample_weight=None): class TripletLoss(Loss): r"""Calculates triplet loss given three input tensors and a positive margin. - Triplet loss measures the relative similarity between prediction, a positive - example and a negative example: + Triplet loss measures the relative similarity between a positive + example, a negative example, and a prediction: .. math:: - L = \sum_i \max(\Vert {pred}_i - {pos_i} \Vert_2^2 - - \Vert {pred}_i - {neg_i} \Vert_2^2 + {margin}, 0) + L = \sum_i \max(\Vert {pos}_i - {pred}_i \Vert_2^2 - + \Vert {neg}_i - {pred}_i \Vert_2^2 + {margin}, 0) - `pred`, `positive` and `negative` can have arbitrary shape as long as they + `positive`, `negative`, and `pred` can have arbitrary shape as long as they have the same number of elements. 
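# [Editor's note] A tiny numeric check of the triplet formula above; the
# values are illustrative.
import mxnet as mx
from mxnet.gluon.loss import TripletLoss

pred = mx.nd.array([[0., 0.]])
pos = mx.nd.array([[1., 0.]])
neg = mx.nd.array([[3., 0.]])
print(TripletLoss(margin=1)(pred, pos, neg))  # relu(1 - 9 + 1) = 0 -> [0.]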
Parameters @@ -703,7 +703,7 @@ def __init__(self, margin=1, weight=None, batch_axis=0, **kwargs): def hybrid_forward(self, F, pred, positive, negative): positive = _reshape_like(F, positive, pred) negative = _reshape_like(F, negative, pred) - loss = F.sum(F.square(pred-positive) - F.square(pred-negative), + loss = F.sum(F.square(positive-pred) - F.square(negative-pred), axis=self._batch_axis, exclude=True) loss = F.relu(loss + self._margin) return _apply_weighting(F, loss, self._weight, None) @@ -717,7 +717,7 @@ class PoissonNLLLoss(Loss): .. math:: L = \text{pred} - \text{target} * \log(\text{pred}) +\log(\text{target!}) - `pred`, `target` can have arbitrary shape as long as they have the same number of elements. + `target`, 'pred' can have arbitrary shape as long as they have the same number of elements. Parameters ---------- diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 4d514c283..f8566dd05 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -262,7 +262,10 @@ def __init__(self, rate, axes=(), **kwargs): self._axes = axes def hybrid_forward(self, F, x): - return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd') + if self._rate > 0: + return F.Dropout(x, p=self._rate, axes=self._axes, name='fwd', cudnn_off=False) + else: + return F.identity(x) def __repr__(self): s = '{name}(p = {_rate}, axes={_axes})' diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index f6c0a31b5..8060f38ac 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,7 +60,8 @@ class Trainer(object): See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. + suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is + provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. Properties ---------- @@ -393,6 +394,8 @@ def update(self, batch_size, ignore_stale_grad=False): self._update(ignore_stale_grad) def _update(self, ignore_stale_grad=False): + updates = [[] for _ in self._updaters] + for i, param in enumerate(self._params): if param.grad_req == 'null': continue @@ -416,11 +419,17 @@ def _update(self, ignore_stale_grad=False): self._kvstore.pull(i, param.list_data(), priority=-i) continue - for upd, arr, grad in zip(self._updaters, param.list_data(), param.list_grad()): + for upd, arr, grad in zip(updates, param.list_data(), param.list_grad()): if not ignore_stale_grad or arr._fresh_grad: - upd(i, grad, arr) + upd.append((i, grad, arr)) arr._fresh_grad = False + if not (self._kvstore and self._update_on_kvstore): + for updater, upd in zip(self._updaters, updates): + if upd: + i, w, g = zip(*upd) + updater(i, w, g) + def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. 
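# [Editor's note] A schematic of the batched update path added to
# Trainer._update above: (index, grad, weight) triples are grouped per updater
# and handed over in a single call rather than one call per parameter. This
# assumes the patched Updater accepts lists, as the calls above imply.
import mxnet as mx

opt = mx.optimizer.SGD(learning_rate=0.1)
updater = mx.optimizer.get_updater(opt)
weights = [mx.nd.ones((2,)), mx.nd.ones((2,))]
grads = [mx.nd.ones((2,)) * 0.5, mx.nd.ones((2,)) * 0.5]
updater([0, 1], grads, weights)  # one aggregated call for both parameters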
diff --git a/python/mxnet/gluon/utils.py b/python/mxnet/gluon/utils.py index 783249867..55edd950d 100644 --- a/python/mxnet/gluon/utils.py +++ b/python/mxnet/gluon/utils.py @@ -222,7 +222,7 @@ def _replace_atomic(src, dst): _MOVEFILE_WRITE_THROUGH = 0x8 _windows_default_flags = _MOVEFILE_WRITE_THROUGH - text_type = unicode if sys.version_info[0] == 2 else str # noqa + text_type = unicode if sys.version_info[0] == 2 else str # pylint: disable=undefined-variable def _str_to_unicode(x): """Handle text decoding. Internal use only""" diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index b67ab624a..611592aa4 100755 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -217,7 +217,7 @@ def _init_bilinear(self, _, arr): c = (2 * f - 1 - f % 2) / (2. * f) for i in range(np.prod(shape)): x = i % shape[3] - y = (i / shape[3]) % shape[2] + y = (i // shape[3]) % shape[2] weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) arr[:] = weight.reshape(shape) @@ -657,7 +657,7 @@ def _init_weight(self, _, arr): c = (2 * f - 1 - f % 2) / (2. * f) for i in range(np.prod(shape)): x = i % shape[3] - y = (i / shape[3]) % shape[2] + y = (i // shape[3]) % shape[2] weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) arr[:] = weight.reshape(shape) diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 38fe73915..c08077cc6 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -92,14 +92,14 @@ def _create_kvstore(kvstore, num_device, arg_params): arg_params : dict of str to `NDArray`. Model parameter, dict of name to `NDArray` of net's weights. """ - update_on_kvstore = True + update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) if kvstore is None: kv = None elif isinstance(kvstore, kvs.KVStore): kv = kvstore elif isinstance(kvstore, str): # create kvstore using the string type - if num_device is 1 and 'dist' not in kvstore: + if num_device == 1 and 'dist' not in kvstore: # no need to use kv for single device and single machine kv = None else: @@ -162,6 +162,7 @@ def _update_params_on_kvstore(param_arrays, grad_arrays, kvstore, param_names): def _update_params(param_arrays, grad_arrays, updater, num_device, kvstore=None, param_names=None): """Perform update of param_arrays from grad_arrays not on kvstore.""" + updates = [[] for _ in range(num_device)] for i, pair in enumerate(zip(param_arrays, grad_arrays)): arg_list, grad_list = pair if grad_list[0] is None: @@ -178,7 +179,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device, # state for the same index but on diff devs, TODO(mli) # use a better solution later w, g = p - updater(index*num_device+k, g, w) + updates[k].append((index*num_device+k, g, w)) + for dev_updates in updates: + i, w, g = zip(*dev_updates) + updater(i, w, g) def _multiple_callbacks(callbacks, *args, **kwargs): diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index 9b5686185..66c666659 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -95,6 +95,7 @@ def __init__(self, sym_gen, default_bucket_key=None, logger=logging, self._curr_bucket_key = None self._params_dirty = False self._monitor = None + self._grad_req = None def _reset_bind(self): """Internal utility function to reset binding.""" @@ -331,6 +332,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, self.for_training = for_training self.inputs_need_grad = inputs_need_grad self.binded = True + self._grad_req = grad_req symbol, 
data_names, label_names = self._call_sym_gen(self._default_bucket_key) module = Module(symbol, data_names, label_names, logger=self.logger, @@ -340,7 +342,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, group2ctxs=self._group2ctxs, compression_params=self._compression_params) module.bind(data_shapes, label_shapes, for_training, inputs_need_grad, - force_rebind=False, shared_module=None, grad_req=grad_req) + force_rebind=False, shared_module=None, grad_req=self._grad_req) self._curr_module = module self._curr_bucket_key = self._default_bucket_key self._buckets[self._default_bucket_key] = module @@ -373,7 +375,8 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None): compression_params=self._compression_params) module.bind(data_shapes, label_shapes, self._curr_module.for_training, self._curr_module.inputs_need_grad, - force_rebind=False, shared_module=self._buckets[self._default_bucket_key]) + force_rebind=False, shared_module=self._buckets[self._default_bucket_key], + grad_req=self._grad_req) if self._monitor is not None: module.install_monitor(self._monitor) self._buckets[bucket_key] = module diff --git a/python/mxnet/monitor.py b/python/mxnet/monitor.py index e3185a128..2e10708e7 100644 --- a/python/mxnet/monitor.py +++ b/python/mxnet/monitor.py @@ -31,7 +31,7 @@ class Monitor(object): - """Monitor outputs, weights, and gradients for debugging. + """Monitor inputs, outputs, weights, and gradients for debugging. Parameters ---------- @@ -46,8 +46,10 @@ class Monitor(object): Only tensors with names that match `name_pattern` will be included. For example, '.*weight|.*output' will print all weights and outputs and '.*backward.*' will print all gradients. + monitor_all : bool, default False + If true, monitor both input and output, otherwise monitor output only. """ - def __init__(self, interval, stat_func=None, pattern='.*', sort=False): + def __init__(self, interval, stat_func=None, pattern='.*', sort=False, monitor_all=False): if stat_func is None: def asum_stat(x): """returns |x|/size(x), async execution.""" @@ -61,6 +63,7 @@ def asum_stat(x): self.exes = [] self.re_prog = re.compile(pattern) self.sort = sort + self.monitor_all = monitor_all def stat_helper(name, array): """wrapper for executor callback""" array = ctypes.cast(array, NDArrayHandle) @@ -79,7 +82,7 @@ def install(self, exe): exe : mx.executor.Executor The Executor (returned by symbol.bind) to install to. """ - exe.set_monitor_callback(self.stat_helper) + exe.set_monitor_callback(self.stat_helper, self.monitor_all) self.exes.append(exe) def tic(self): diff --git a/python/mxnet/mxfeatures.py b/python/mxnet/mxfeatures.py new file mode 100644 index 000000000..c546151ab --- /dev/null +++ b/python/mxnet/mxfeatures.py @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=not-an-iterable + +"""runtime detection of compile time features in the native library""" + +import ctypes +import enum +from .base import _LIB, check_call, mx_uint + +feature_names = [ + "CUDA", + "CUDNN", + "NCCL", + "CUDA_RTC", + "TENSORRT", + "CPU_SSE", + "CPU_SSE2", + "CPU_SSE3", + "CPU_SSE4_1", + "CPU_SSE4_2", + "CPU_SSE4A", + "CPU_AVX", + "CPU_AVX2", + "OPENMP", + "SSE", + "F16C", + "JEMALLOC", + "BLAS_OPEN", + "BLAS_ATLAS", + "BLAS_MKL", + "BLAS_APPLE", + "LAPACK", + "MKLDNN", + "OPENCV", + "CAFFE", + "PROFILER", + "DIST_KVSTORE", + "CXX14", + "SIGNAL_HANDLER", + "DEBUG" +] + + +Feature = enum.Enum('Feature', {name: index for index, name in enumerate(feature_names)}) + + +def has_feature(feature): + """ + Check the library for compile-time feature at runtime + + Parameters + ---------- + feature : int + An integer representing the feature to check + + Returns + ------- + boolean + True if the feature is enabled, false otherwise + """ + res = ctypes.c_bool() + check_call(_LIB.MXHasFeature(mx_uint(feature), ctypes.byref(res))) + return res.value + + +def features_enabled(): + """ + Returns + ------- + features: list of Feature + list of enabled features in the back-end + """ + res = [] + for f in Feature: + if has_feature(f.value): + res.append(f) + return res + +def features_enabled_str(sep=', '): + """ + Returns + ------- + string with a comma separated list of enabled features in the back-end. For example: + "CPU_SSE, OPENMP, F16C, LAPACK, MKLDNN, OPENCV, SIGNAL_HANDLER, DEBUG" + """ + return sep.join(map(lambda x: x.name, features_enabled())) diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py index 9a62620da..fb329f186 100644 --- a/python/mxnet/ndarray/ndarray.py +++ b/python/mxnet/ndarray/ndarray.py @@ -47,7 +47,7 @@ "imdecode", "lesser", "lesser_equal", "logical_and", "logical_or", "logical_xor", "maximum", "minimum", "moveaxis", "modulo", "multiply", "not_equal", "onehot_encode", "power", "subtract", "true_divide", "waitall", "_new_empty_handle", "histogram", - "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack"] + "split_v2", "to_dlpack_for_read", "to_dlpack_for_write", "from_dlpack"] _STORAGE_TYPE_UNDEFINED = -1 _STORAGE_TYPE_DEFAULT = 0 @@ -1133,6 +1133,14 @@ def split(self, *args, **kwargs): """ return op.split(self, *args, **kwargs) + def split_v2(self, *args, **kwargs): + """Convenience fluent method for :py:func:`split_v2`. + + The arguments are the same as for :py:func:`split_v2`, with + this array as data. + """ + return split_v2(self, *args, **kwargs) + def slice(self, *args, **kwargs): """Convenience fluent method for :py:func:`slice`. @@ -3901,6 +3909,12 @@ def histogram(a, bins=10, range=None): Values outside the range are ignored. The first element of the range must be less than or equal to the second. range affects the automatic bin computation as well, the range will be equally divided by the number of bins. + + Returns + ------- + NDArray + A created array. + """ # pylint: disable= no-member, protected-access @@ -3916,6 +3930,51 @@ def histogram(a, bins=10, range=None): raise ValueError("bins argument should be either an integer or an NDArray") # pylint: enable= no-member, protected-access, redefined-builtin +def split_v2(ary, indices_or_sections, axis=0, squeeze_axis=False): + """Split an array into multiple sub-arrays. 
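# [Editor's note] A usage sketch of split_v2 as defined here; values are
# illustrative.
import mxnet as mx

x = mx.nd.arange(6).reshape((3, 2))
parts = mx.nd.split_v2(x, 3)           # three equal sub-arrays along axis 0
head, tail = mx.nd.split_v2(x, (1,))   # split at index 1 -> x[:1], x[1:]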
+ + Parameters + ---------- + ary : NDArray + Array to be divided into sub-arrays. + indices_or_sections : int or tuple of ints + If `indices_or_sections` is an integer, N, the array will be divided + into N equal arrays along `axis`. If such a split is not possible, + an error is raised. + If `indices_or_sections` is a 1-D array of sorted integers, the entries + indicate where along `axis` the array is split. For example, + ``[2, 3]`` would, for ``axis=0``, result in + - ary[:2] + - ary[2:3] + - ary[3:] + If an index exceeds the dimension of the array along `axis`, + an empty sub-array is returned correspondingly. + axis : int, optional + The axis along which to split, default is 0. + squeeze_axis: boolean, optional + Whether to squeeze the axis of sub-arrays or not, only useful when the size + of the sub-arrays is 1 on the `axis`. Default is False. + + Returns + ------- + NDArray or list of NDArrays + The resulting sub-array(s). + + """ + indices = [] + axis_size = ary.shape[axis] + if isinstance(indices_or_sections, int): + sections = indices_or_sections + if axis_size % sections: + raise ValueError('array split does not result in an equal division') + section_size = int(axis_size / sections) + indices = [i * section_size for i in range(sections)] + elif isinstance(indices_or_sections, tuple): + indices = [0] + list(indices_or_sections) + else: + raise ValueError('indices_or_sections must be either an int or a tuple of ints') + return _internal._split_v2(ary, indices, axis, squeeze_axis) + PyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) _c_str_dltensor = c_str('dltensor') _c_str_used_dltensor = c_str('used_dltensor') diff --git a/python/mxnet/ndarray/random.py b/python/mxnet/ndarray/random.py index 78339a020..f19c1e032 100644 --- a/python/mxnet/ndarray/random.py +++ b/python/mxnet/ndarray/random.py @@ -78,6 +78,14 @@ def uniform(low=0, high=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwarg out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + An NDArray of type `dtype`. If input `shape` has shape, e.g., + `(m, n)` and `low` and `high` are scalars, output shape will be `(m, n)`. + If `low` and `high` are NDArrays with shape, e.g., `(x, y)`, then the + returned NDArray will have shape `(x, y, m, n)`, where `m*n` uniformly distributed + samples are drawn for each `[low, high)` pair. Examples -------- @@ -128,6 +136,13 @@ def normal(loc=0, scale=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwarg out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + An NDArray of type `dtype`. If input `shape` has shape, e.g., `(m, n)` and + `loc` and `scale` are scalars, output shape will be `(m, n)`. If `loc` and + `scale` are NDArrays with shape, e.g., `(x, y)`, then output will have shape + `(x, y, m, n)`, where `m*n` samples are drawn for each `[loc, scale)` pair. Examples -------- @@ -178,6 +193,13 @@ def randn(*shape, **kwargs): out : NDArray Store output to an existing NDArray. + Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `loc` and `scale` are scalars, output + shape will be `(m, n)`. If `loc` and `scale` are NDArrays with shape, e.g., `(x, y)`, + then output will have shape `(x, y, m, n)`, where `m*n` samples are drawn for + each `[loc, scale)` pair. Examples -------- @@ -227,6 +249,13 @@ def poisson(lam=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwargs): out : NDArray, optional Store output to an existing NDArray. 
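# [Editor's note] A numeric illustration of the shape rule these new Returns
# sections document; the sizes are hypothetical.
import mxnet as mx

low = mx.nd.zeros((2,))
high = mx.nd.ones((2,))
s = mx.nd.random.uniform(low, high, shape=(3, 4))
print(s.shape)  # (2, 3, 4): 3*4 samples drawn for each [low, high) pair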
+ Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `lam` is + a scalar, output shape will be `(m, n)`. If `lam` + is an NDArray with shape, e.g., `(x, y)`, then output will have shape + `(x, y, m, n)`, where `m*n` samples are drawn for each entry in `lam`. Examples -------- @@ -274,6 +303,12 @@ def exponential(scale=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwargs) out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `scale` is a scalar, output shape will + be `(m, n)`. If `scale` is an NDArray with shape, e.g., `(x, y)`, then `output` + will have shape `(x, y, m, n)`, where `m*n` samples are drawn for each entry in scale. Examples -------- @@ -320,6 +355,13 @@ def gamma(alpha=1, beta=1, shape=_Null, dtype=_Null, ctx=None, out=None, **kwarg out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `alpha` and `beta` are scalars, output + shape will be `(m, n)`. If `alpha` and `beta` are NDArrays with shape, e.g., + `(x, y)`, then output will have shape `(x, y, m, n)`, where `m*n` samples are + drawn for each `[alpha, beta)` pair. Examples -------- @@ -369,6 +411,12 @@ def negative_binomial(k=1, p=1, shape=_Null, dtype=_Null, ctx=None, out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `k` and `p` are scalars, output shape + will be `(m, n)`. If `k` and `p` are NDArrays with shape, e.g., `(x, y)`, then + output will have shape `(x, y, m, n)`, where `m*n` samples are drawn for each `[k, p)` pair. Examples -------- @@ -420,6 +468,13 @@ def generalized_negative_binomial(mu=1, alpha=1, shape=_Null, dtype=_Null, ctx=N out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + If input `shape` has shape, e.g., `(m, n)` and `mu` and `alpha` are scalars, output + shape will be `(m, n)`. If `mu` and `alpha` are NDArrays with shape, e.g., `(x, y)`, + then output will have shape `(x, y, m, n)`, where `m*n` samples are drawn for + each `[mu, alpha)` pair. Examples -------- @@ -469,8 +524,27 @@ def multinomial(data, shape=_Null, get_prob=False, out=None, dtype='int32', **kw Data type of the sample output array. The default is int32. Note that the data type of the log likelihood array is the same with that of `data`. + Returns + ------- + List, or NDArray + For input `data` with `n` dimensions and shape `(d1, d2, ..., dn-1, k)`, and input + `shape` with shape `(s1, s2, ..., sx)`, returns an NDArray with shape + `(d1, d2, ... dn-1, s1, s2, ..., sx)`. The `s1, s2, ... sx` dimensions of the + returned NDArray consist of 0-indexed values sampled from each respective multinomial + distribution provided in the `k` dimension of `data`. + + For the case `n`=1, and `x`=1 (one shape dimension), returned NDArray has shape `(s1,)`. + + If `get_prob` is set to True, this function returns a list of format: + `[ndarray_output, log_likelihood_output]`, where `log_likelihood_output` is an NDArray of the + same shape as the sampled outputs. + Examples -------- + >>> probs = mx.nd.array([0, 0.1, 0.2, 0.3, 0.4]) + >>> mx.nd.random.multinomial(probs) + [3] + >>> probs = mx.nd.array([[0, 0.1, 0.2, 0.3, 0.4], [0.4, 0.3, 0.2, 0.1, 0]]) >>> mx.nd.random.multinomial(probs) [3 1] @@ -503,6 +577,13 @@ def shuffle(data, **kwargs): out : NDArray, optional Array to store the result. 
+ Returns + ------- + NDArray + A new NDArray with the same shape and type as input `data`, but + with items in the first axis of the returned NDArray shuffled randomly. + The original input `data` is not modified. + Examples -------- >>> data = mx.nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) @@ -545,6 +626,12 @@ def randint(low, high, shape=_Null, dtype=_Null, ctx=None, out=None, **kwargs): out : NDArray, optional Store output to an existing NDArray. + Returns + ------- + NDArray + An NDArray of type `dtype`. If input `shape` has shape, e.g., + `(m, n)`, the returned NDArray will shape will be `(m, n)`. Contents + of the returned NDArray will be samples from the interval `[low, high)`. Examples -------- diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index d290a3f2f..cb52ac54f 100644 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -22,12 +22,15 @@ import math import pickle import warnings +import os import numpy from ..base import py_str from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply) from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update) + signsgd_update, signum_update, + multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, + multi_mp_sgd_mom_update) from ..ndarray import sparse from ..random import normal @@ -37,6 +40,8 @@ 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' ] +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -105,6 +110,7 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = {} self.clip_gradient = clip_gradient self.multi_precision = multi_precision + self.aggregate_num = 0 if param_idx2name is None: param_idx2name = {} @@ -380,13 +386,44 @@ def _update_count(self, index): Parameters ---------- - index : int + index : int or list of int The index to be updated. """ - if index not in self._index_update_count: - self._index_update_count[index] = self.begin_num_update - self._index_update_count[index] += 1 - self.num_update = max(self._index_update_count[index], self.num_update) + if not isinstance(index, (list, tuple)): + index = [index] + for idx in index: + if idx not in self._index_update_count: + self._index_update_count[idx] = self.begin_num_update + self._index_update_count[idx] += 1 + self.num_update = max(self._index_update_count[idx], self.num_update) + + def _get_lrs(self, indices): + """Gets the learning rates given the indices of the weights. + + Parameters + ---------- + indices : list of int + Indices corresponding to weights. + + Returns + ------- + lrs : list of float + Learning rates for those indices. + """ + if self.lr_scheduler is not None: + lr = self.lr_scheduler(self.num_update) + else: + lr = self.lr + + lrs = [lr for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + lrs[i] *= self.param_dict[index].lr_mult + elif index in self.lr_mult: + lrs[i] *= self.lr_mult[index] + elif index in self.idx2name: + lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) + return lrs def _get_lr(self, index): """Gets the learning rate given the index of the weight. @@ -401,18 +438,31 @@ def _get_lr(self, index): lr : float Learning rate for this index. 
""" - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr + return self._get_lrs([index])[0] - if index in self.param_dict: - lr *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lr *= self.lr_mult[index] - elif index in self.idx2name: - lr *= self.lr_mult.get(self.idx2name[index], 1.0) - return lr + def _get_wds(self, indices): + """Gets weight decays for indices. + Returns 0 for non-weights if the name of weights are provided for `__init__`. + + Parameters + ---------- + indices : list of int + Indices of weights. + + Returns + ------- + wds : list of float + Weight decays for those indices. + """ + wds = [self.wd for _ in indices] + for i, index in enumerate(indices): + if index in self.param_dict: + wds[i] *= self.param_dict[index].wd_mult + elif index in self.wd_mult: + wds[i] *= self.wd_mult[index] + elif index in self.idx2name: + wds[i] *= self.wd_mult.get(self.idx2name[index], 1.0) + return wds def _get_wd(self, index): """Gets weight decay for index. @@ -421,21 +471,14 @@ def _get_wd(self, index): Parameters ---------- index : int - The index for weight. + The index of weight. Returns ------- wd : float Weight decay for this index. """ - wd = self.wd - if index in self.param_dict: - wd *= self.param_dict[index].wd_mult - elif index in self.wd_mult: - wd *= self.wd_mult[index] - elif index in self.idx2name: - wd *= self.wd_mult.get(self.idx2name[index], 1.0) - return wd + return self._get_wds([index])[0] def __getstate__(self): ret = self.__dict__.copy() @@ -471,6 +514,13 @@ class SGD(Optimizer): provides slightly different semantics than the original update, and may lead to different empirical results. + In the case when ``update_on_kvstore`` is set to False (either globally via + MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in + :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update + of parameters, which may lead to improved performance. The aggregation size + is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and + defaults to 4. 
+ Otherwise, **standard updates** are applied by:: rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) @@ -502,6 +552,7 @@ def __init__(self, momentum=0.0, lazy_update=True, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum self.lazy_update = lazy_update + self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) def create_state_multi_precision(self, index, weight): weight_master_copy = None @@ -522,12 +573,22 @@ def create_state(self, index, weight): momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) return momentum - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + def _update_impl(self, indices, weights, grads, states, multi_precision=False): + aggregate = True + if not isinstance(indices, (tuple, list)): + indices = [indices] + weights = [weights] + grads = [grads] + states = [states] + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -535,26 +596,49 @@ def _update_impl(self, index, weight, grad, state, multi_precision=False): if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + if aggregate: + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + list(zip(*states))[1])), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + if not multi_precision: + if state is not None: + sgd_mom_update(weight, grad, state, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + if state[0] is not None: + mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, state[1], out=weight, + lr=lr, wd=wd, **kwargs) def update(self, index, weight, grad, state): self._update_impl(index, weight, grad, state, multi_precision=False) def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 
+ if not isinstance(index, (tuple, list)): + use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 + else: + use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 self._update_impl(index, weight, grad, state, multi_precision=use_multi_precision) @@ -978,10 +1062,10 @@ def update(self, index, weight, grad, state): if state is not None: mom = state mom[:] *= self.momentum - grad += wd * weight mom[:] += grad + mom[:] += wd * weight grad[:] += self.momentum * mom - weight[:] += -lr * grad + weight[:] -= lr * grad else: assert self.momentum == 0.0 weight[:] += -lr * (grad + wd * weight) @@ -1525,20 +1609,55 @@ def __init__(self, optimizer): self.optimizer = optimizer self.states = {} self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 0 def __call__(self, index, grad, weight): """Updates weight given gradient and index.""" - # convert ctypes.char_p.value back to python str if needed - if isinstance(index, bytes): - index = py_str(index) - if index not in self.states: - self.states[index] = self.optimizer.create_state_multi_precision(index, weight) - self.states_synced[index] = True - elif not self.states_synced[index]: - self.states[index] = \ - self.sync_state_context(self.states[index], weight.context) - self.states_synced[index] = True - self.optimizer.update_multi_precision(index, weight, grad, self.states[index]) + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [grad] + weights = [weight] + else: + indices = index + grads = grad + weights = weight + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + for i, w, g in zip(indices, weights, grads): + self.optimizer.update_multi_precision(i, w, g, self.states[i]) def sync_state_context(self, state, context): """sync state context.""" diff --git a/python/mxnet/symbol/contrib.py b/python/mxnet/symbol/contrib.py index d54267fc0..a83227a02 100644 --- a/python/mxnet/symbol/contrib.py +++ b/python/mxnet/symbol/contrib.py @@ -73,17 +73,14 @@ def rand_zipfian(true_classes, num_sampled, range_max): Examples -------- - >>> true_cls = mx.nd.array([3]) - >>> samples, exp_count_true, exp_count_sample = mx.nd.contrib.rand_zipfian(true_cls, 4, 5) - >>> samples - [1 3 3 3] - - >>> exp_count_true - [ 0.12453879] - - >>> 
exp_count_sample
-    [ 0.22629439  0.12453879  0.12453879  0.12453879]
-
+    >>> true_cls = mx.sym.Variable('true_cls')
+    >>> samples, exp_count_true, exp_count_sample = mx.sym.contrib.rand_zipfian(true_cls, 4, 5)
+    >>> samples.eval(true_cls=mx.nd.array([3]))[0].asnumpy()
+    array([1, 3, 3, 3])
+    >>> exp_count_true.eval(true_cls=mx.nd.array([3]))[0].asnumpy()
+    array([0.12453879])
+    >>> exp_count_sample.eval(true_cls=mx.nd.array([3]))[0].asnumpy()
+    array([0.22629439, 0.12453879, 0.12453879, 0.12453879])
     """
     assert(isinstance(true_classes, Symbol)), "unexpected type %s" % type(true_classes)
     log_range = math.log(range_max + 1)
diff --git a/python/mxnet/symbol/random.py b/python/mxnet/symbol/random.py
index 34663cddf..4bdfe7045 100644
--- a/python/mxnet/symbol/random.py
+++ b/python/mxnet/symbol/random.py
@@ -66,6 +66,14 @@ def uniform(low=0, high=1, shape=_Null, dtype=_Null, **kwargs):
         `(x, y, m, n)`, where `m*n` samples are drawn for each `[low, high)` pair.
     dtype : {'float16', 'float32', 'float64'}, optional
         Data type of output samples. Default is 'float32'
+
+    Returns
+    -------
+    Symbol
+        If input `shape` has dimensions, e.g., `(m, n)`, and `low` and `high` are
+        scalars, returned Symbol will resolve to shape `(m, n)`. If `low` and `high`
+        are Symbols with shape, e.g., `(x, y)`, returned Symbol will have shape
+        `(x, y, m, n)`, where `m*n` samples are drawn for each `[low, high)` pair.
     """
     return _random_helper(_internal._random_uniform, _internal._sample_uniform,
                           [low, high], shape, dtype, kwargs)
@@ -91,6 +99,15 @@ def normal(loc=0, scale=1, shape=_Null, dtype=_Null, **kwargs):
         `(x, y, m, n)`, where `m*n` samples are drawn for each `[loc, scale)` pair.
     dtype : {'float16', 'float32', 'float64'}, optional
         Data type of output samples. Default is 'float32'
+
+    Returns
+    -------
+    Symbol
+        If input `shape` has dimensions, e.g., `(m, n)`, and `loc` and
+        `scale` are scalars, returned Symbol will resolve to shape `(m, n)`.
+        If `loc` and `scale` are Symbols with shape, e.g., `(x, y)`, returned
+        Symbol will resolve to shape `(x, y, m, n)`, where `m*n` samples are drawn
+        for each `[loc, scale)` pair.
     """
     return _random_helper(_internal._random_normal, _internal._sample_normal,
                           [loc, scale], shape, dtype, kwargs)
@@ -113,6 +130,14 @@ def poisson(lam=1, shape=_Null, dtype=_Null, **kwargs):
         `(x, y, m, n)`, where `m*n` samples are drawn for each entry in `lam`.
     dtype : {'float16', 'float32', 'float64'}, optional
         Data type of output samples. Default is 'float32'
+
+    Returns
+    -------
+    Symbol
+        If input `shape` has dimensions, e.g., `(m, n)`, and `lam` is
+        a scalar, returned Symbol will resolve to shape `(m, n)`. If `lam`
+        is a Symbol with shape, e.g., `(x, y)`, returned Symbol will resolve to
+        shape `(x, y, m, n)`, where `m*n` samples are drawn for each entry in `lam`.
     """
     return _random_helper(_internal._random_poisson, _internal._sample_poisson,
                           [lam], shape, dtype, kwargs)
@@ -139,6 +164,14 @@ def exponential(scale=1, shape=_Null, dtype=_Null, **kwargs):
         `(x, y, m, n)`, where `m*n` samples are drawn for each entry in `scale`.
     dtype : {'float16', 'float32', 'float64'}, optional
         Data type of output samples. Default is 'float32'
+
+    Returns
+    -------
+    Symbol
+        If input `shape` has dimensions, e.g., `(m, n)`, and `scale` is
+        a scalar, returned Symbol will have shape `(m, n)`. If `scale`
+        is a Symbol with shape, e.g., `(x, y)`, returned Symbol will resolve to
+        shape `(x, y, m, n)`, where `m*n` samples are drawn for each entry in `scale`.
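Unlike their `mx.nd.random` counterparts, these samplers return a `Symbol` that has to be bound or evaluated; the `eval` pattern from the `rand_zipfian` example above carries over. A minimal hedged sketch (array values arbitrary):

```python
import mxnet as mx

low = mx.sym.Variable('low')
high = mx.sym.Variable('high')
# One (3,) draw per [low, high) pair of shape (2,) -> result shape (2, 3).
s = mx.sym.random.uniform(low=low, high=high, shape=(3,))
out = s.eval(ctx=mx.cpu(),
             low=mx.nd.array([0.0, 10.0]),
             high=mx.nd.array([1.0, 11.0]))[0]
print(out.shape)  # (2, 3)
```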
""" return _random_helper(_internal._random_exponential, _internal._sample_exponential, [1.0/scale], shape, dtype, kwargs) @@ -164,6 +197,14 @@ def gamma(alpha=1, beta=1, shape=_Null, dtype=_Null, **kwargs): `(x, y, m, n)`, where `m*n` samples are drawn for each `[alpha, beta)` pair. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' + + Returns + ------- + Symbol + If input `shape` has dimensions, e.g., `(m, n)` and `alpha` and + `beta` are scalars, returned Symbol will resolve to shape `(m, n)`. If `alpha` + and `beta` are Symbols with shape, e.g., `(x, y)`, returned Symbol will resolve + to shape `(x, y, m, n)`, where `m*n` samples are drawn for each `[alpha, beta)` pair. """ return _random_helper(_internal._random_gamma, _internal._sample_gamma, [alpha, beta], shape, dtype, kwargs) @@ -190,6 +231,14 @@ def negative_binomial(k=1, p=1, shape=_Null, dtype=_Null, **kwargs): `(x, y, m, n)`, where `m*n` samples are drawn for each `[k, p)` pair. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' + + Returns + ------- + Symbol + If input `shape` has dimensions, e.g., `(m, n)`, and `k` and + `p` are scalars, returned Symbol will resolve to shape `(m, n)`. If `k` + and `p` are Symbols with shape, e.g., `(x, y)`, returned Symbol will resolve + to shape `(x, y, m, n)`, where `m*n` samples are drawn for each `[k, p)` pair. """ return _random_helper(_internal._random_negative_binomial, _internal._sample_negative_binomial, @@ -218,6 +267,14 @@ def generalized_negative_binomial(mu=1, alpha=1, shape=_Null, dtype=_Null, **kwa `(x, y, m, n)`, where `m*n` samples are drawn for each `[mu, alpha)` pair. dtype : {'float16', 'float32', 'float64'}, optional Data type of output samples. Default is 'float32' + + Returns + ------- + Symbol + If input `shape` has dimensions, e.g., `(m, n)`, and `mu` and + `alpha` are scalars, returned Symbol will resolve to shape `(m, n)`. If `mu` + and `alpha` are Symbols with shape, e.g., `(x, y)`, returned Symbol will resolve + to shape `(x, y, m, n)`, where `m*n` samples are drawn for each `[mu, alpha)` pair. """ return _random_helper(_internal._random_generalized_negative_binomial, _internal._sample_generalized_negative_binomial, @@ -248,6 +305,22 @@ def multinomial(data, shape=_Null, get_prob=True, dtype='int32', **kwargs): dtype : str or numpy.dtype, optional Data type of the sample output array. The default is int32. Note that the data type of the log likelihood array is the same with that of `data`. + + Returns + ------- + Symbol + For input `data` with `n` dimensions and shape `(d1, d2, ..., dn-1, k)`, and input + `shape` with shape `(s1, s2, ..., sx)`, returns a Symbol that resovles to shape + `(d1, d2, ... dn-1, s1, s2, ..., sx)`. The `s1, s2, ... sx` dimensions of the + returned Symbol's resolved value will consist of 0-indexed values sampled from each + respective multinomial distribution provided in the `k` dimension of `data`. + + For the case `n`=1, and `x`=1 (one shape dimension), returned Symbol will resolve to + shape `(s1,)`. + + If `get_prob` is set to True, this function returns a Symbol that will resolve to a list of + outputs: `[ndarray_output, log_likelihood_output]`, where `log_likelihood_output` will resolve + to the same shape as the sampled outputs in ndarray_output. """ return _internal._sample_multinomial(data, shape, get_prob, dtype=dtype, **kwargs) @@ -264,6 +337,12 @@ def shuffle(data, **kwargs): ---------- data : NDArray Input data array. 
+
+    Returns
+    -------
+    Symbol
+        A new symbol representing the shuffled version of input `data`.
+
     Examples
     --------
     >>> data = mx.nd.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
@@ -302,6 +381,12 @@ def randint(low, high, shape=_Null, dtype=_Null, **kwargs):
         `high` are scalars, output shape will be `(m, n)`.
     dtype : {'int32', 'int64'}, optional
         Data type of output samples. Default is 'int32'
+
+    Returns
+    -------
+    Symbol
+        If input `shape` has dimensions, e.g., `(m, n)`, and `low` and
+        `high` are scalars, returned Symbol will resolve to shape `(m, n)`.
     """
     return _random_helper(_internal._random_randint, None,
                           [low, high], shape, dtype, kwargs)
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 530d72796..43de0c9d7 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -48,7 +48,7 @@
 __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json",
            "pow", "maximum", "minimum", "hypot", "eye", "zeros", "ones", "full", "arange",
-           "histogram"]
+           "histogram", "split_v2"]
 
 class Symbol(SymbolBase):
@@ -1855,6 +1855,14 @@ def split(self, *args, **kwargs):
         """
         return op.split(self, *args, **kwargs)
 
+    def split_v2(self, *args, **kwargs):
+        """Convenience fluent method for :py:func:`split_v2`.
+
+        The arguments are the same as for :py:func:`split_v2`, with
+        this array as data.
+        """
+        return split_v2(self, *args, **kwargs)
+
     def slice(self, *args, **kwargs):
         """Convenience fluent method for :py:func:`slice`.
 
@@ -2958,6 +2966,11 @@ def histogram(a, bins=10, range=None, **kwargs):
         Values outside the range are ignored. The first element of the range must be less than or
         equal to the second. range affects the automatic bin computation as well, the range will
         be equally divided by the number of bins.
+
+    Returns
+    -------
+    out : Symbol
+        The created Symbol
     """
     if isinstance(bins, Symbol):
         return _internal._histogram(data=a, bins=bins, **kwargs)
@@ -2967,4 +2980,44 @@
         return _internal._histogram(data=a, bin_cnt=bins, range=range, **kwargs)
     raise ValueError("bins argument should be either an integer or an NDArray")
 
+def split_v2(ary, indices_or_sections, axis=0, squeeze_axis=False):
+    """Split an array into multiple sub-arrays.
+
+    Parameters
+    ----------
+    ary : Symbol
+        Array to be divided into sub-arrays.
+    indices_or_sections : int or tuple of ints
+        If `indices_or_sections` is an integer, N, the array will be divided
+        into N equal arrays along `axis`. If such a split is not possible,
+        an error is raised.
+        If `indices_or_sections` is a tuple of sorted integers, the entries
+        indicate where along `axis` the array is split. For example,
+        ``(2, 3)`` would, for ``axis=0``, result in
+        - ary[:2]
+        - ary[2:3]
+        - ary[3:]
+        If an index exceeds the dimension of the array along `axis`,
+        an empty sub-array is returned correspondingly.
+    axis : int, optional
+        The axis along which to split, default is 0.
+    squeeze_axis: boolean, optional
+        Whether to squeeze the axis of sub-arrays or not, only useful when the size
+        of the sub-arrays is 1 on the `axis`. Default is False.
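A quick hedged sketch of the semantics (they mirror `numpy.split`; shown with the Symbol flavor added here, evaluated eagerly for illustration):

```python
import mxnet as mx

data = mx.sym.Variable('data')

# Integer: three equal parts along axis 0.
parts = mx.sym.split_v2(data, 3)
outs = parts.eval(ctx=mx.cpu(), data=mx.nd.arange(6))
# -> data[:2], data[2:4], data[4:]

# Tuple: split points at indices 2 and 3.
parts = mx.sym.split_v2(data, (2, 3))
outs = parts.eval(ctx=mx.cpu(), data=mx.nd.arange(6))
# -> data[:2], data[2:3], data[3:]
print([o.asnumpy().tolist() for o in outs])
```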
+
+    Returns
+    -------
+    out : Symbol
+        The created Symbol
+    """
+    indices = []
+    sections = 0
+    if isinstance(indices_or_sections, int):
+        sections = indices_or_sections
+    elif isinstance(indices_or_sections, tuple):
+        indices = [0] + list(indices_or_sections)
+    else:
+        raise ValueError('indices_or_sections must be either int or tuple of ints')
+    return _internal._split_v2(ary, indices, axis, squeeze_axis, sections)
+
 _set_symbol_class(Symbol)
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 0a4d17dc2..4138e4d2d 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -620,8 +620,11 @@ def _parse_location(sym, location, ctx, dtype=default_dtype()):
         *In either case, value of all the arguments must be provided.*
     ctx : Context
         Device context.
-    dtype: np.float16 or np.float32 or np.float64
-        Datatype for mx.nd.array.
+    dtype: "asnumpy" or np.float16 or np.float32 or np.float64
+        If dtype is "asnumpy" then the mx.nd.array created will have the same
+        type as the numpy array from which it is copied.
+        Otherwise, dtype is the explicit datatype for all mx.nd.array objects
+        created in this function.
 
     Returns
     -------
@@ -643,7 +646,7 @@ def _parse_location(sym, location, ctx, dtype=default_dtype()):
     ValueError: Symbol arguments and keys of the given location do not match.
     """
     assert isinstance(location, (dict, list, tuple))
-    assert dtype in (np.float16, np.float32, np.float64)
+    assert dtype == "asnumpy" or dtype in (np.float16, np.float32, np.float64)
     if isinstance(location, dict):
         if set(location.keys()) != set(sym.list_arguments()):
             raise ValueError("Symbol arguments and keys of the given location do not match."
                              % (str(set(sym.list_arguments())), str(set(location.keys()))))
     else:
         location = {k: v for k, v in zip(sym.list_arguments(), location)}
-    location = {k: mx.nd.array(v, ctx=ctx, dtype=dtype) if isinstance(v, np.ndarray) \
-                else v for k, v in location.items()}
+    location = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \
+                if isinstance(v, np.ndarray) else v for k, v in location.items()}
     return location
 
@@ -677,8 +680,11 @@ def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()):
         *In either case, all aux states of `sym` must be provided.*
     ctx : Context
         Device context.
-    dtype: np.float16 or np.float32 or np.float64
-        Datatype for mx.nd.array.
+    dtype: "asnumpy" or np.float16 or np.float32 or np.float64
+        If dtype is "asnumpy" then the mx.nd.array created will have the same
+        type as the numpy array from which it is copied.
+        Otherwise, dtype is the explicit datatype for all mx.nd.array objects
+        created in this function.
 
     Returns
     -------
@@ -702,7 +708,7 @@
     >>> _parse_aux_states(fc2, {'batchnorm0_moving_var': mean_states}, None)
     ValueError: Symbol aux_states names and given aux_states do not match.
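A short sketch of what the `"asnumpy"` sentinel changes (illustrative only; the dict contents are arbitrary):

```python
import numpy as np
import mxnet as mx

loc = {'a': np.zeros((2, 2), dtype=np.float16),
       'b': np.zeros((2, 2), dtype=np.float64)}

# Old behaviour: every array is forced to one explicit dtype.
forced = {k: mx.nd.array(v, dtype=np.float32) for k, v in loc.items()}
# "asnumpy" behaviour: each array keeps the dtype of its numpy source.
kept = {k: mx.nd.array(v, dtype=v.dtype) for k, v in loc.items()}

print(forced['a'].dtype, kept['a'].dtype)  # float32 vs. float16
```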
""" - assert dtype in (np.float16, np.float32, np.float64) + assert dtype == "asnumpy" or dtype in (np.float16, np.float32, np.float64) if aux_states is not None: if isinstance(aux_states, dict): if set(aux_states.keys()) != set(sym.list_auxiliary_states()): @@ -713,7 +719,8 @@ def _parse_aux_states(sym, aux_states, ctx, dtype=default_dtype()): elif isinstance(aux_states, (list, tuple)): aux_names = sym.list_auxiliary_states() aux_states = {k:v for k, v in zip(aux_names, aux_states)} - aux_states = {k: mx.nd.array(v, ctx=ctx, dtype=dtype) for k, v in aux_states.items()} + aux_states = {k: mx.nd.array(v, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + for k, v in aux_states.items()} return aux_states @@ -962,8 +969,11 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, Contains the mapping between names of auxiliary states and their values. ctx : Context, optional running context - dtype: np.float16 or np.float32 or np.float64 - Datatype for mx.nd.array. + dtype: "asnumpy" or np.float16 or np.float32 or np.float64 + If dtype is "asnumpy" then the mx.nd.array created will have the same + type as th numpy array from which it is copied. + Otherwise, dtype is the explicit datatype for all mx.nd.array objects + created in this function. equal_nan: Boolean if True, `nan` is a valid value for checking equivalency (ie `nan` == `nan`) @@ -979,7 +989,7 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, >>> ret_expected = np.array([[19, 22], [43, 50]]) >>> check_symbolic_forward(sym_dot, [mat1, mat2], [ret_expected]) """ - assert dtype in (np.float16, np.float32, np.float64) + assert dtype == "asnumpy" or dtype in (np.float16, np.float32, np.float64) if ctx is None: ctx = default_context() @@ -988,7 +998,8 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None, dtype=dtype) if isinstance(expected, dict): expected = [expected[k] for k in sym.list_outputs()] - args_grad_data = {k:mx.nd.empty(v.shape, ctx=ctx, dtype=dtype) for k, v in location.items()} + args_grad_data = {k:mx.nd.empty(v.shape, ctx=ctx, dtype=v.dtype if dtype == "asnumpy" else dtype) \ + for k, v in location.items()} executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) for g in executor.grad_arrays: diff --git a/scala-package/.gitignore b/scala-package/.gitignore index 22b12b31d..9bf785171 100644 --- a/scala-package/.gitignore +++ b/scala-package/.gitignore @@ -1,3 +1,4 @@ +target/ .flattened-pom.xml core/src/main/scala/org/apache/mxnet/NDArrayAPIBase.scala core/src/main/scala/org/apache/mxnet/NDArrayBase.scala diff --git a/scala-package/.mvn/wrapper/.gitignore b/scala-package/.mvn/wrapper/.gitignore new file mode 100644 index 000000000..576738f6a --- /dev/null +++ b/scala-package/.mvn/wrapper/.gitignore @@ -0,0 +1,2 @@ +maven-wrapper.jar + diff --git a/scala-package/.mvn/wrapper/MavenWrapperDownloader.java b/scala-package/.mvn/wrapper/MavenWrapperDownloader.java new file mode 100755 index 000000000..fa4f7b499 --- /dev/null +++ b/scala-package/.mvn/wrapper/MavenWrapperDownloader.java @@ -0,0 +1,110 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +*/ + +import java.net.*; +import java.io.*; +import java.nio.channels.*; +import java.util.Properties; + +public class MavenWrapperDownloader { + + /** + * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided. + */ + private static final String DEFAULT_DOWNLOAD_URL = + "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.4.2/maven-wrapper-0.4.2.jar"; + + /** + * Path to the maven-wrapper.properties file, which might contain a downloadUrl property to + * use instead of the default one. + */ + private static final String MAVEN_WRAPPER_PROPERTIES_PATH = + ".mvn/wrapper/maven-wrapper.properties"; + + /** + * Path where the maven-wrapper.jar will be saved to. + */ + private static final String MAVEN_WRAPPER_JAR_PATH = + ".mvn/wrapper/maven-wrapper.jar"; + + /** + * Name of the property which should be used to override the default download url for the wrapper. + */ + private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl"; + + public static void main(String args[]) { + System.out.println("- Downloader started"); + File baseDirectory = new File(args[0]); + System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath()); + + // If the maven-wrapper.properties exists, read it and check if it contains a custom + // wrapperUrl parameter. + File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH); + String url = DEFAULT_DOWNLOAD_URL; + if(mavenWrapperPropertyFile.exists()) { + FileInputStream mavenWrapperPropertyFileInputStream = null; + try { + mavenWrapperPropertyFileInputStream = new FileInputStream(mavenWrapperPropertyFile); + Properties mavenWrapperProperties = new Properties(); + mavenWrapperProperties.load(mavenWrapperPropertyFileInputStream); + url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url); + } catch (IOException e) { + System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'"); + } finally { + try { + if(mavenWrapperPropertyFileInputStream != null) { + mavenWrapperPropertyFileInputStream.close(); + } + } catch (IOException e) { + // Ignore ... 
+                    }
+                }
+            }
+        }
+        System.out.println("- Downloading from: " + url);
+
+        File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
+        if(!outputFile.getParentFile().exists()) {
+            if(!outputFile.getParentFile().mkdirs()) {
+                System.out.println(
+                        "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
+            }
+        }
+        System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
+        try {
+            downloadFileFromURL(url, outputFile);
+            System.out.println("Done");
+            System.exit(0);
+        } catch (Throwable e) {
+            System.out.println("- Error downloading");
+            e.printStackTrace();
+            System.exit(1);
+        }
+    }
+
+    private static void downloadFileFromURL(String urlString, File destination) throws Exception {
+        URL website = new URL(urlString);
+        ReadableByteChannel rbc;
+        rbc = Channels.newChannel(website.openStream());
+        FileOutputStream fos = new FileOutputStream(destination);
+        fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
+        fos.close();
+        rbc.close();
+    }
+
+}
diff --git a/scala-package/.mvn/wrapper/maven-wrapper.properties b/scala-package/.mvn/wrapper/maven-wrapper.properties
new file mode 100755
index 000000000..b6e678122
--- /dev/null
+++ b/scala-package/.mvn/wrapper/maven-wrapper.properties
@@ -0,0 +1 @@
+distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.6.0/apache-maven-3.6.0-bin.zip
\ No newline at end of file
diff --git a/scala-package/README.md b/scala-package/README.md
index 3859e5f32..be0fc41a5 100644
--- a/scala-package/README.md
+++ b/scala-package/README.md
@@ -84,7 +84,7 @@ Also, add the dependency which corresponds to your platform to the ```dependenci
     org.apache.mxnet
     mxnet-full_2.11-linux-x86_64-gpu
-    [1.5.0,)
+    [1.5.0-SNAPSHOT,)
 
@@ -96,7 +96,7 @@ Also, add the dependency which corresponds to your platform to the ```dependenci
     org.apache.mxnet
     mxnet-full_2.11-linux-x86_64-cpu
-    [1.5.0,)
+    [1.5.0-SNAPSHOT,)
 
@@ -107,11 +107,11 @@ Also, add the dependency which corresponds to your platform to the ```dependenci
     org.apache.mxnet
     mxnet-full_2.11-osx-x86_64-cpu
-    [1.5.0,)
+    [1.5.0-SNAPSHOT,)
 
-**Note:** ```[1.5.0,)<\version>``` indicates that we will fetch packages with version 1.5.0 or higher. This will always ensure that the pom.xml is able to fetch the latest and greatest jar files from Maven Snapshot repository.
+**Note:** ```[1.5.0-SNAPSHOT,)``` indicates that we will fetch packages with version 1.5.0-SNAPSHOT or higher, so the pom.xml can always fetch the latest jar files from the Maven Snapshot repository.
 
 Build From Source
 -----------------
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 7264c39e8..cb0bcba8c 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -42,6 +42,7 @@
 
+
       org.codehaus.mojo
       exec-maven-plugin
@@ -138,12 +139,6 @@
       INTERNAL
       provided
 
-
-      junit
-      junit
-      4.11
-      test
-
 
       commons-io
       commons-io
diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Context.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Context.scala
index ab44f434b..b04cd31ff 100644
--- a/scala-package/core/src/main/scala/org/apache/mxnet/Context.scala
+++ b/scala-package/core/src/main/scala/org/apache/mxnet/Context.scala
@@ -38,11 +38,12 @@
   }
 
   /**
-    * Constructing a context.
- - * @param deviceTypeName {'cpu', 'gpu'} String representing the device type - * @param deviceId (default=0) The device id of the device, needed for GPU - */ + * Constructing a context which is used to specify the device and device type that will + * be utilized by the engine. + * + * @param deviceTypeName {'cpu', 'gpu'} String representing the device type + * @param deviceId (default=0) The device id of the device, needed for GPU + */ class Context(deviceTypeName: String, val deviceId: Int = 0) extends Serializable { val deviceTypeid: Int = Context.devstr2type(deviceTypeName) @@ -61,9 +62,9 @@ class Context(deviceTypeName: String, val deviceId: Int = 0) extends Serializabl } /** - * Return device type of current context. - * @return device_type - */ + * Return device type of current context. + * @return device_type + */ def deviceType: String = Context.devtype2str(deviceTypeid) override def toString: String = { diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala b/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala index 77881ab94..0f756e240 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/Image.scala @@ -37,7 +37,7 @@ object Image { * @param flag Convert decoded image to grayscale (0) or color (1). * @param to_rgb Whether to convert decoded image * to mxnet's default RGB format (instead of opencv's default BGR). - * @return NDArray in HWC format + * @return NDArray in HWC format with DType [[DType.UInt8]] */ def imDecode(buf: Array[Byte], flag: Int, to_rgb: Boolean, @@ -56,7 +56,7 @@ object Image { /** * Same imageDecode with InputStream * @param inputStream the inputStream of the image - * @return NDArray in HWC format + * @return NDArray in HWC format with DType [[DType.UInt8]] */ def imDecode(inputStream: InputStream, flag: Int = 1, to_rgb: Boolean = true, @@ -78,7 +78,7 @@ object Image { * @param flag Convert decoded image to grayscale (0) or color (1). * @param to_rgb Whether to convert decoded image to mxnet's default RGB format * (instead of opencv's default BGR). 
- * @return org.apache.mxnet.NDArray in HWC format + * @return org.apache.mxnet.NDArray in HWC format with DType [[DType.UInt8]] */ def imRead(filename: String, flag: Option[Int] = None, to_rgb: Option[Boolean] = None, diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala index cb9788569..3a51222cc 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/MX_PRIMITIVES.scala @@ -82,4 +82,10 @@ object MX_PRIMITIVES { implicit def MX_DoubleToDouble(d: MX_Double) : Double = d.data + def isValidMxPrimitiveType(num : Any) : Boolean = { + num match { + case valid @ (_: Float | _: Double) => true + case _ => false + } + } } diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala index 163ed2682..ca2e986e7 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/NDArray.scala @@ -28,6 +28,7 @@ import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, ListBuffer} import scala.language.implicitConversions import scala.ref.WeakReference +import scala.util.Try /** * NDArray Object extends from NDArrayBase for abstract function signatures @@ -96,9 +97,11 @@ object NDArray extends NDArrayBase { case ndArr: Seq[NDArray @unchecked] => if (ndArr.head.isInstanceOf[NDArray]) (ndArr.toArray, ndArr.toArray.map(_.handle)) else throw new IllegalArgumentException( - "Unsupported out var type, should be NDArray or subclass of Seq[NDArray]") + s"""Unsupported out ${output.getClass} type, + | should be NDArray or subclass of Seq[NDArray]""".stripMargin) case _ => throw new IllegalArgumentException( - "Unsupported out var type, should be NDArray or subclass of Seq[NDArray]") + s"""Unsupported out ${output.getClass} type, + | should be NDArray or subclass of Seq[NDArray]""".stripMargin) } } else { (null, null) @@ -509,6 +512,61 @@ object NDArray extends NDArrayBase { array(sourceArr, shape, null) } + /** + * Create a new NDArray based on the structure of source Array + * @param sourceArr Array[Array...Array[MX_PRIMITIVE_TYPE]...] 
+    * @param ctx the context to pass in
+    * @return an NDArray with the same shape as the input
+    * @throws IllegalArgumentException if the data type is not valid
+    */
+  def toNDArray(sourceArr: Array[_], ctx : Context = null) : NDArray = {
+    val shape = shapeGetter(sourceArr)
+    val container = new Array[Any](shape.product)
+    flattenArray(sourceArr, container, 0, container.length - 1)
+    val finalArr = container(0) match {
+      case f: Float => array(container.map(_.asInstanceOf[Float]), Shape(shape), ctx)
+      case d: Double => array(container.map(_.asInstanceOf[Double]), Shape(shape), ctx)
+      case _ => throw new IllegalArgumentException(
+        s"Unsupported type ${container(0).getClass}, please check MX_PRIMITIVES for valid types")
+    }
+    finalArr
+  }
+
+  private def shapeGetter(sourceArr : Any) : ArrayBuffer[Int] = {
+    sourceArr match {
+        // e.g : Array[Double] the inner layer
+      case arr: Array[_] if MX_PRIMITIVES.isValidMxPrimitiveType(arr(0)) => {
+        ArrayBuffer[Int](arr.length)
+      }
+        // e.g : Array[Array...[]]
+      case arr: Array[_] => {
+        var arrBuffer = new ArrayBuffer[Int]()
+        if (!arr.isEmpty) arrBuffer = shapeGetter(arr(0))
+        for (idx <- arr.indices) {
+          require(arrBuffer == shapeGetter(arr(idx)))
+        }
+        arrBuffer.insert(0, arr.length)
+        arrBuffer
+      }
+      case _ => throw new IllegalArgumentException(s"Wrong type passed: ${sourceArr.getClass}")
+    }
+  }
+
+  private def flattenArray(sourceArr : Any, arr : Array[Any],
+                           start : Int, end : Int) : Unit = {
+    sourceArr match {
+      case arrValid: Array[_] if MX_PRIMITIVES.isValidMxPrimitiveType(arrValid(0)) => {
+        for (i <- arrValid.indices) arr(start + i) = arrValid(i)
+      }
+      case arrAny: Array[_] => {
+        val fragment = (end - start + 1) / arrAny.length
+        for (i <- arrAny.indices)
+          flattenArray(arrAny(i), arr, start + i * fragment, start + (i + 1) * fragment)
+      }
+      case _ => throw new IllegalArgumentException(s"Wrong type passed: ${sourceArr.getClass}")
+    }
+  }
+
   /**
    * Returns evenly spaced values within a given interval.
    * Values are generated within the half-open interval [`start`, `stop`). In other
@@ -667,16 +725,18 @@
     genericNDArrayFunctionInvoke("_crop_assign", args, kwargs)
   }
 
-  // TODO: imdecode
 }
 
 /**
-  * NDArray object in mxnet.
-  * NDArray is basic ndarray/Tensor like data structure in mxnet. <br />
-  *
-  * WARNING: it is your responsibility to clear this object through dispose().
-  *
-  */
+/**
+  * NDArray object in mxnet.
+  * NDArray is the basic ndarray/Tensor-like data structure in mxnet. <br />
+  *
+  * NOTE: NDArray is stored in native memory. Use NDArray in a try-with-resources construct
+  * or within a [[org.apache.mxnet.ResourceScope]] to have it disposed automatically. You can
+  * also control the lifetime of an NDArray explicitly by calling dispose manually. Failure
+  * to do this will result in leaking native memory.
+  *
+  */
 class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle,
                              val writable: Boolean = true,
                              addToCollector: Boolean = true) extends NativeResource {
@@ -694,6 +754,11 @@
   // we use weak reference to prevent gc blocking
   private[mxnet] val dependencies = mutable.HashMap.empty[Long, WeakReference[NDArray]]
 
+  private val lengthProperty = "mxnet.setNDArrayPrintLength"
+  private val layerProperty = "mxnet.setNDArrayPrintLayerLength"
+  private lazy val printLength = Try(System.getProperty(lengthProperty).toInt).getOrElse(1000)
+  private lazy val layerLength = Try(System.getProperty(layerProperty).toInt).getOrElse(10)
+
   def serialize(): Array[Byte] = {
     val buf = ArrayBuffer.empty[Byte]
     checkCall(_LIB.mxNDArraySaveRawBytes(handle, buf))
@@ -713,21 +778,22 @@
   }
 
   /**
-    * Dispose all NDArrays who help to construct this array. <br />
-    * e.g. (a * b + c).disposeDeps() will dispose a, b, c (including their deps) and a * b
-    * @return this array
-    */
+  /**
+    * Dispose all NDArrays that help to construct this array. <br />
+ * e.g. (a * b + c).disposeDeps() will dispose a, b, c (including their deps) and a * b + * @return this NDArray + */ def disposeDeps(): NDArray = { disposeDepsExcept() } /** - * Dispose all NDArrays who help to construct this array, excepts those in the arguments.
-    * e.g. (a * b + c).disposeDepsExcept(a, b)
-    * will dispose c and a * b.
-    * Note that a, b's dependencies will not be disposed either.
-    * @return this array
-    */
+  /**
+    * Dispose all NDArrays that help to construct this array, except those in the arguments. <br />
+    * e.g. (a * b + c).disposeDepsExcept(a, b)
+    * will dispose c and a * b.
+    * Note that a, b's dependencies will not be disposed either.
+    * @param arrs array of NDArrays
+    * @return this array
+    */
   def disposeDepsExcept(arrs: NDArray*): NDArray = {
     if (dependencies != null) {
       val excepts = mutable.HashSet.empty[Long]
@@ -763,6 +829,56 @@
     checkCall(_LIB.mxFloat64NDArraySyncCopyFromCPU(handle, source, source.length))
   }
 
+  /**
+    * Visualize the internal structure of NDArray
+    * @return String that shows the structure
+    */
+  override def toString: String = {
+    val abstractND = buildStringHelper(this, this.shape.length)
+    val otherInfo = s""
+    s"$abstractND\n$otherInfo"
+  }
+
+  /**
+    * Helper function to create formatted NDArray output
+    * The NDArray will be represented in a reduced version if too large
+    * @param nd NDArray as the input
+    * @param totalSpace total space of the lowest dimension
+    * @return String format of NDArray
+    */
+  private def buildStringHelper(nd : NDArray, totalSpace : Int) : String = {
+    var result = ""
+    val THRESHOLD = layerLength        // longest NDArray[NDArray[...]] to show in full
+    val ARRAYTHRESHOLD = printLength   // longest array to show in full
+    val shape = nd.shape
+    val space = totalSpace - shape.length
+    if (shape.length != 1) {
+      val (length, postfix) =
+        if (shape(0) > THRESHOLD) {
+          // reduced NDArray
+          (10, s"\n${" " * (space + 1)}... with length ${shape(0)}\n")
+        } else {
+          (shape(0), "")
+        }
+      for (num <- 0 until length) {
+        val output = buildStringHelper(nd.at(num), totalSpace)
+        result += s"$output\n"
+      }
+      result = s"${" " * space}[\n$result${" " * space}$postfix${" " * space}]"
+    } else {
+      if (shape(0) > ARRAYTHRESHOLD) {
+        // reduced Array: show the first and last ten entries (slice end is exclusive)
+        val front = nd.slice(0, 10)
+        val back = nd.slice(shape(0) - 10, shape(0))
+        result = s"""${" " * space}[${front.toArray.mkString(",")}
+             | ... ${back.toArray.mkString(",")}]""".stripMargin
+      } else {
+        result = s"${" " * space}[${nd.toArray.mkString(",")}]"
+      }
+    }
+    result
+  }
+
   /**
    * Return a sliced NDArray that shares memory with current one. <br />
* NDArray only support continuous slicing on axis 0 diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala index e690abba0..b205bbe47 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/io/NDArrayIter.scala @@ -63,7 +63,8 @@ class NDArrayIter(data: IndexedSeq[(DataDesc, NDArray)], dataName: String = "data", labelName: String = "label") { this(IO.initDataDesc(data, allowEmpty = false, dataName, if (data == null || data.isEmpty) MX_REAL_TYPE else data(0).dtype, Layout.UNDEFINED), - IO.initDataDesc(label, allowEmpty = true, labelName, MX_REAL_TYPE, Layout.UNDEFINED), + IO.initDataDesc(label, allowEmpty = true, labelName, + if (label == null || label.isEmpty) MX_REAL_TYPE else label(0).dtype, Layout.UNDEFINED), dataBatchSize, shuffle, lastBatchHandle) } @@ -175,7 +176,8 @@ class NDArrayIter(data: IndexedSeq[(DataDesc, NDArray)], private def _padData(ndArray: NDArray): NDArray = { val padNum = cursor + dataBatchSize - numData val shape = Shape(dataBatchSize) ++ ndArray.shape.slice(1, ndArray.shape.size) - val newArray = NDArray.zeros(shape) + // The new NDArray has to be created such that it inherits dtype from the passed in array + val newArray = NDArray.zeros(shape, dtype = ndArray.dtype) NDArrayCollector.auto().withScope { val batch = ndArray.slice(cursor, numData) val padding = ndArray.slice(0, padNum) diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala new file mode 100644 index 000000000..7d6f31e93 --- /dev/null +++ b/scala-package/core/src/main/scala/org/apache/mxnet/javaapi/Image.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mxnet.javaapi +// scalastyle:off +import java.awt.image.BufferedImage +// scalastyle:on +import java.io.InputStream + +object Image { + /** + * Decode image with OpenCV. + * Note: return image in RGB by default, instead of OpenCV's default BGR. + * @param buf Buffer containing binary encoded image + * @param flag Convert decoded image to grayscale (0) or color (1). + * @param toRGB Whether to convert decoded image + * to mxnet's default RGB format (instead of opencv's default BGR). 
+   * @return NDArray in HWC format with DType [[DType.UInt8]]
+   */
+  def imDecode(buf: Array[Byte], flag: Int, toRGB: Boolean): NDArray = {
+    org.apache.mxnet.Image.imDecode(buf, flag, toRGB, None)
+  }
+
+  def imDecode(buf: Array[Byte]): NDArray = {
+    imDecode(buf, 1, true)
+  }
+
+  /**
+   * Same as imDecode, but takes an InputStream
+   *
+   * @param inputStream the inputStream of the image
+   * @param flag Convert decoded image to grayscale (0) or color (1).
+   * @param toRGB Whether to convert decoded image to mxnet's default RGB format
+   *              (instead of opencv's default BGR).
+   * @return NDArray in HWC format with DType [[DType.UInt8]]
+   */
+  def imDecode(inputStream: InputStream, flag: Int, toRGB: Boolean): NDArray = {
+    org.apache.mxnet.Image.imDecode(inputStream, flag, toRGB, None)
+  }
+
+  def imDecode(inputStream: InputStream): NDArray = {
+    imDecode(inputStream, 1, true)
+  }
+
+  /**
+   * Read and decode image with OpenCV.
+   * Note: return image in RGB by default, instead of OpenCV's default BGR.
+   * @param filename Name of the image file to be loaded.
+   * @param flag Convert decoded image to grayscale (0) or color (1).
+   * @param toRGB Whether to convert decoded image to mxnet's default RGB format
+   *              (instead of opencv's default BGR).
+   * @return org.apache.mxnet.NDArray in HWC format with DType [[DType.UInt8]]
+   */
+  def imRead(filename: String, flag: Int, toRGB: Boolean): NDArray = {
+    org.apache.mxnet.Image.imRead(filename, Some(flag), Some(toRGB), None)
+  }
+
+  def imRead(filename: String): NDArray = {
+    imRead(filename, 1, true)
+  }
+
+  /**
+   * Resize image with OpenCV.
+   * @param src source image in NDArray
+   * @param w Width of resized image.
+   * @param h Height of resized image.
+   * @param interp Interpolation method (default=cv2.INTER_LINEAR).
+   * @return org.apache.mxnet.NDArray
+   */
+  def imResize(src: NDArray, w: Int, h: Int, interp: Integer): NDArray = {
+    val interpVal = if (interp == null) None else Some(interp.intValue())
+    org.apache.mxnet.Image.imResize(src, w, h, interpVal, None)
+  }
+
+  def imResize(src: NDArray, w: Int, h: Int): NDArray = {
+    imResize(src, w, h, null)
+  }
+
+  /**
+   * Do a fixed crop on the image
+   * @param src Src image in NDArray
+   * @param x0 starting x point
+   * @param y0 starting y point
+   * @param w width of the cropped region
+   * @param h height of the cropped region
+   * @return cropped NDArray
+   */
+  def fixedCrop(src: NDArray, x0: Int, y0: Int, w: Int, h: Int): NDArray = {
+    org.apache.mxnet.Image.fixedCrop(src, x0, y0, w, h)
+  }
+
+  /**
+   * Convert an NDArray image to a real image
+   * The time cost will increase if the image resolution is big
+   * @param src Source image file in RGB
+   * @return Buffered Image
+   */
+  def toImage(src: NDArray): BufferedImage = {
+    org.apache.mxnet.Image.toImage(src)
+  }
+}
diff --git a/scala-package/core/src/test/java/org/apache/mxnet/javaapi/ImageTest.java b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/ImageTest.java
new file mode 100644
index 000000000..0092744a2
--- /dev/null
+++ b/scala-package/core/src/test/java/org/apache/mxnet/javaapi/ImageTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mxnet.javaapi;
+
+import org.apache.commons.io.FileUtils;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import java.io.File;
+import java.net.URL;
+
+import static org.junit.Assert.assertArrayEquals;
+
+public class ImageTest {
+
+    private static String imLocation;
+
+    private static void downloadUrl(String url, String filePath, int maxRetry) throws Exception{
+        File tmpFile = new File(filePath);
+        Boolean success = false;
+        if (!tmpFile.exists()) {
+            while (maxRetry > 0 && !success) {
+                try {
+                    FileUtils.copyURLToFile(new URL(url), tmpFile);
+                    success = true;
+                } catch(Exception e){
+                    maxRetry -= 1;
+                }
+            }
+        } else {
+            success = true;
+        }
+        if (!success) throw new Exception(url + " Download failed!");
+    }
+
+    @BeforeClass
+    public static void downloadFile() throws Exception {
+        String tempDirPath = System.getProperty("java.io.tmpdir");
+        imLocation = tempDirPath + "/inputImages/Pug-Cookie.jpg";
+        downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg",
+                imLocation, 3);
+    }
+
+    @Test
+    public void testImageProcess() {
+        NDArray nd = Image.imRead(imLocation, 1, true);
+        assertArrayEquals(nd.shape().toArray(), new int[]{576, 1024, 3});
+        NDArray nd2 = Image.imResize(nd, 224, 224, null);
+        assertArrayEquals(nd2.shape().toArray(), new int[]{224, 224, 3});
+        NDArray cropped = Image.fixedCrop(nd, 0, 0, 224, 224);
+        Image.toImage(cropped);
+    }
+}
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
index d3969b0ce..698a2b53a 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/IOSuite.scala
@@ -237,7 +237,7 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
     val shape0 = Shape(Array(1000, 2, 2))
     val data = IndexedSeq(NDArray.ones(shape0), NDArray.zeros(shape0))
     val shape1 = Shape(Array(1000, 1))
-    val label = IndexedSeq(NDArray.ones(shape1))
+    val label = IndexedSeq(NDArray.ones(shape1, dtype = DType.Int32))
     val batchData0 = NDArray.ones(Shape(Array(128, 2, 2)))
     val batchData1 = NDArray.zeros(Shape(Array(128, 2, 2)))
     val batchLabel = NDArray.ones(Shape(Array(128, 1)))
@@ -254,6 +254,7 @@ class IOSuite extends FunSuite with BeforeAndAfterAll {
       assert(tBatch.data(0).toArray === batchData0.toArray)
       assert(tBatch.data(1).toArray === batchData1.toArray)
       assert(tBatch.label(0).toArray === batchLabel.toArray)
+      assert(tBatch.label(0).dtype == DType.Int32)
     }
 
     assert(batchCount === nBatch0)
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
index bc7a0a026..054300e95 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/NDArraySuite.scala
@@ -22,10 +22,14 @@
 import java.util.concurrent.atomic.AtomicInteger
 
 import org.apache.mxnet.NDArrayConversions._
 import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}
+import org.slf4j.LoggerFactory
+import
scala.collection.mutable.ArrayBuffer class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers { private val sequence: AtomicInteger = new AtomicInteger(0) + private val logger = LoggerFactory.getLogger(classOf[NDArraySuite]) + test("to java array") { val ndarray = NDArray.zeros(2, 2) assert(ndarray.toArray === Array(0f, 0f, 0f, 0f)) @@ -85,6 +89,84 @@ class NDArraySuite extends FunSuite with BeforeAndAfterAll with Matchers { assert(ndarray.toArray === Array(1f, 2f, 3f, 4f)) } + test("create NDArray based on Java Matrix") { + def arrayGen(num : Any) : Array[Any] = { + val array = num match { + case f: Float => + (for (_ <- 0 until 100) yield Array(1.0f, 1.0f, 1.0f, 1.0f)).toArray + case d: Double => + (for (_ <- 0 until 100) yield Array(1.0d, 1.0d, 1.0d, 1.0d)).toArray + case _ => throw new IllegalArgumentException(s"Unsupported Type ${num.getClass}") + } + Array( + Array( + array + ), + Array( + array + ) + ) + } + val floatData = 1.0f + var nd = NDArray.toNDArray(arrayGen(floatData)) + require(nd.shape == Shape(2, 1, 100, 4)) + val arr2 = Array(1.0f, 1.0f, 1.0f, 1.0f) + nd = NDArray.toNDArray(arr2) + require(nd.shape == Shape(4)) + val doubleData = 1.0d + nd = NDArray.toNDArray(arrayGen(doubleData)) + require(nd.shape == Shape(2, 1, 100, 4)) + require(nd.dtype == DType.Float64) + } + + test("test Visualize") { + var nd = NDArray.ones(Shape(1, 2, 1000, 1)) + var data : String = + """ + |[ + | [ + | [ + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | + | ... with length 1000 + | ] + | [ + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | [1.0] + | + | ... with length 1000 + | ] + | ] + |] + |""".stripMargin + require(nd.toString.split("\\s+").mkString == data.split("\\s+").mkString) + nd = NDArray.ones(Shape(1, 4)) + data = + """ + |[ + | [1.0,1.0,1.0,1.0] + |] + |""".stripMargin + require(nd.toString.split("\\s+").mkString == data.split("\\s+").mkString) + } + test("plus") { var ndzeros = NDArray.zeros(2, 1) var ndones = ndzeros + 1f diff --git a/scala-package/deploy/pom.xml b/scala-package/deploy/pom.xml index c51aa9a92..4e9da8908 100644 --- a/scala-package/deploy/pom.xml +++ b/scala-package/deploy/pom.xml @@ -19,10 +19,10 @@ + ${project.parent.basedir}/.. 
mxnet-full_2.11-${platform}-${flavor} 1.5.0-SNAPSHOT apache.snapshots.https - file://${project.build.directory}/repo @@ -117,6 +117,7 @@ ${project.description} + false ${repositoryId} ${repo_url} ${project.groupId} diff --git a/scala-package/examples/pom.xml b/scala-package/examples/pom.xml index 564102a9f..30ccfdcce 100644 --- a/scala-package/examples/pom.xml +++ b/scala-package/examples/pom.xml @@ -15,6 +15,7 @@ true + ${skipTests} diff --git a/scala-package/examples/scripts/benchmark/run_java_inference_bm.sh b/scala-package/examples/scripts/benchmark/run_java_inference_bm.sh index f426ddaaa..c62a7438d 100644 --- a/scala-package/examples/scripts/benchmark/run_java_inference_bm.sh +++ b/scala-package/examples/scripts/benchmark/run_java_inference_bm.sh @@ -20,7 +20,7 @@ set -e MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd) -CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/* +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* java -Xmx8G -Dmxnet.traceLeakedObjects=true -cp $CLASS_PATH \ org.apache.mxnetexamples.javaapi.benchmark.JavaBenchmark $@ diff --git a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExample.java b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExample.java index c9b4426f5..c5d209998 100644 --- a/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExample.java +++ b/scala-package/examples/src/main/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExample.java @@ -24,8 +24,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.imageio.ImageIO; -import java.awt.Graphics2D; import java.awt.image.BufferedImage; import java.io.BufferedReader; import java.io.File; @@ -47,76 +45,7 @@ public class PredictorExample { private String inputImagePath = "/images/dog.jpg"; final static Logger logger = LoggerFactory.getLogger(PredictorExample.class); - - /** - * Load the image from file to buffered image - * It can be replaced by loadImageFromFile from ObjectDetector - * @param inputImagePath input image Path in String - * @return Buffered image - */ - private static BufferedImage loadIamgeFromFile(String inputImagePath) { - BufferedImage buf = null; - try { - buf = ImageIO.read(new File(inputImagePath)); - } catch (IOException e) { - System.err.println(e); - } - return buf; - } - - /** - * Reshape the current image using ImageIO and Graph2D - * It can be replaced by reshapeImage from ObjectDetector - * @param buf Buffered image - * @param newWidth desired width - * @param newHeight desired height - * @return a reshaped bufferedImage - */ - private static BufferedImage reshapeImage(BufferedImage buf, int newWidth, int newHeight) { - BufferedImage resizedImage = new BufferedImage(newWidth, newHeight, BufferedImage.TYPE_INT_RGB); - Graphics2D g = resizedImage.createGraphics(); - g.drawImage(buf, 0, 0, newWidth, newHeight, null); - g.dispose(); - return resizedImage; - } - - /** - * Convert an image from a buffered image into pixels float array - * It can be replaced by bufferedImageToPixels from ObjectDetector - * @param buf buffered image - * @return Float array - */ - private static float[] imagePreprocess(BufferedImage buf) { - // Get height and width of the image - int w = buf.getWidth(); - int h = buf.getHeight(); - - // get an array of integer pixels in the 
default RGB color mode - int[] pixels = buf.getRGB(0, 0, w, h, null, 0, w); - - // 3 times height and width for R,G,B channels - float[] result = new float[3 * h * w]; - - int row = 0; - // copy pixels to array vertically - while (row < h) { - int col = 0; - // copy pixels to array horizontally - while (col < w) { - int rgb = pixels[row * w + col]; - // getting red color - result[0 * h * w + row * w + col] = (rgb >> 16) & 0xFF; - // getting green color - result[1 * h * w + row * w + col] = (rgb >> 8) & 0xFF; - // getting blue color - result[2 * h * w + row * w + col] = rgb & 0xFF; - col += 1; - } - row += 1; - } - buf.flush(); - return result; - } + private static NDArray$ NDArray = NDArray$.MODULE$; /** * Helper class to print the maximum prediction result @@ -170,11 +99,10 @@ public static void main(String[] args) { inputDesc.add(new DataDesc("data", inputShape, DType.Float32(), "NCHW")); Predictor predictor = new Predictor(inst.modelPathPrefix, inputDesc, context,0); // Prepare data - BufferedImage img = loadIamgeFromFile(inst.inputImagePath); - - img = reshapeImage(img, 224, 224); + NDArray img = Image.imRead(inst.inputImagePath, 1, true); + img = Image.imResize(img, 224, 224, null); // predict - float[][] result = predictor.predict(new float[][]{imagePreprocess(img)}); + float[][] result = predictor.predict(new float[][]{img.toArray()}); try { System.out.println("Predict with Float input"); System.out.println(printMaximumClass(result[0], inst.modelPathPrefix)); @@ -182,10 +110,10 @@ public static void main(String[] args) { System.err.println(e); } // predict with NDArray - NDArray nd = new NDArray( - imagePreprocess(img), - new Shape(new int[]{1, 3, 224, 224}), - Context.cpu()); + NDArray nd = img; + nd = NDArray.transpose(nd, new Shape(new int[]{2, 0, 1}), null)[0]; + nd = NDArray.expand_dims(nd, 0, null)[0]; + nd = nd.asType(DType.Float32()); List ndList = new ArrayList<>(); ndList.add(nd); List ndResult = predictor.predictWithNDArray(ndList); diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala index c1ff10c6c..dba343160 100644 --- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala +++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/Util.scala @@ -24,9 +24,9 @@ import org.apache.commons.io.FileUtils object Util { - def downloadUrl(url: String, filePath: String, maxRetry: Option[Int] = None) : Unit = { + def downloadUrl(url: String, filePath: String, maxRetry: Int = 3) : Unit = { val tmpFile = new File(filePath) - var retry = maxRetry.getOrElse(3) + var retry = maxRetry var success = false if (!tmpFile.exists()) { while (retry > 0 && !success) { diff --git a/scala-package/examples/src/test/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExampleTest.java b/scala-package/examples/src/test/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExampleTest.java new file mode 100644 index 000000000..30bc8db44 --- /dev/null +++ b/scala-package/examples/src/test/java/org/apache/mxnetexamples/javaapi/infer/predictor/PredictorExampleTest.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
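The rewritten `PredictorExample` above drops the hand-rolled `BufferedImage` loop in favor of the NDArray-based image API. Factored out of `main`, the new preprocessing reads roughly as below; this is a sketch reusing exactly the calls from the patch, and it assumes the javaapi `Image` helper plus the `NDArray$.MODULE$` handle that the example declares:

```java
import org.apache.mxnet.javaapi.*;

public class Preprocess {
    // Handle to the NDArray static functions, as declared in PredictorExample.
    private static NDArray$ NDArray = NDArray$.MODULE$;

    static NDArray toNCHW(String inputImagePath) {
        NDArray img = Image.imRead(inputImagePath, 1, true);  // decode as color, HWC layout
        img = Image.imResize(img, 224, 224, null);            // resize to the model input size
        NDArray nd = NDArray.transpose(img, new Shape(new int[]{2, 0, 1}), null)[0]; // HWC -> CHW
        nd = NDArray.expand_dims(nd, 0, null)[0];             // prepend the batch dim -> NCHW
        return nd.asType(DType.Float32());                    // decoded pixels are integer-typed
    }
}
```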
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mxnetexamples.javaapi.infer.predictor; + +import org.junit.BeforeClass; +import org.junit.Test; +import org.apache.mxnetexamples.Util; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; + +public class PredictorExampleTest { + + final static Logger logger = LoggerFactory.getLogger(PredictorExampleTest.class); + private static String modelPathPrefix = ""; + private static String inputImagePath = ""; + + @BeforeClass + public static void downloadFile() { + logger.info("Downloading resnet-18 model"); + + String tempDirPath = System.getProperty("java.io.tmpdir"); + logger.info("tempDirPath: {}", tempDirPath); + + String baseUrl = "https://s3.us-east-2.amazonaws.com/scala-infer-models"; + + Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-symbol.json", + tempDirPath + "/resnet18/resnet-18-symbol.json", 3); + Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-0000.params", + tempDirPath + "/resnet18/resnet-18-0000.params", 3); + Util.downloadUrl(baseUrl + "/resnet-18/synset.txt", + tempDirPath + "/resnet18/synset.txt", 3); + Util.downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg", + tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg", 3); + + modelPathPrefix = tempDirPath + File.separator + "resnet18/resnet-18"; + inputImagePath = tempDirPath + File.separator + + "inputImages/resnet18/Pug-Cookie.jpg"; + } + + @Test + public void testPredictor() { + PredictorExample example = new PredictorExample(); + String[] args = new String[]{ + "--model-path-prefix", modelPathPrefix, + "--input-image", inputImagePath + }; + example.main(args); + } + +} diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala index 14fb7b85e..2ccd38fc4 100644 --- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala +++ b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala @@ -49,14 +49,19 @@ System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { ctx = Context.gpu() } - LstmBucketing.runTraining(tempDirPath + "/RNN/sherlockholmes.train.txt", - tempDirPath + "/RNN/sherlockholmes.valid.txt", Array(ctx), 1) + if (!System.getenv().containsKey("CI")) { + LstmBucketing.runTraining(tempDirPath + "/RNN/sherlockholmes.train.txt", + tempDirPath + "/RNN/sherlockholmes.valid.txt", Array(ctx), 1) + } else { + logger.info("Skipping test on CI...") + } } test("Example CI: Test TrainCharRNN") { val tempDirPath = System.getProperty("java.io.tmpdir") if (System.getenv().containsKey("SCALA_TEST_ON_GPU") && - System.getenv("SCALA_TEST_ON_GPU").toInt == 1) { + System.getenv("SCALA_TEST_ON_GPU").toInt == 1 && + !System.getenv().containsKey("CI")) { val ctx = Context.gpu() TrainCharRnn.runTrainCharRnn(tempDirPath +
"/RNN/obama.txt", tempDirPath, ctx, 1) diff --git a/scala-package/infer/pom.xml b/scala-package/infer/pom.xml index 13ceebb83..565ac6e39 100644 --- a/scala-package/infer/pom.xml +++ b/scala-package/infer/pom.xml @@ -64,13 +64,5 @@ 1.10.19 test
- - junit junit 4.11 test - - diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala index 78b237a4a..7146156d7 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/ObjectDetector.scala @@ -28,7 +28,8 @@ import org.apache.mxnet.Context import scala.collection.mutable.ListBuffer /** - * A class for object detection tasks + * The ObjectDetector class helps to run object detection tasks where the goal + * is to find bounding boxes and corresponding labels for objects in an image. * * @param modelPathPrefix Path prefix from where to load the model artifacts. * These include the symbol, parameters, and synset.txt. diff --git a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala index 3014f8d97..05334e49a 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/ObjectDetector.scala @@ -44,8 +44,8 @@ import scala.language.implicitConversions */ class ObjectDetector private[mxnet] (val objDetector: org.apache.mxnet.infer.ObjectDetector){ - def this(modelPathPrefix: String, inputDescriptors: java.util.List[DataDesc], contexts: - java.util.List[Context], epoch: Int) + def this(modelPathPrefix: String, inputDescriptors: java.lang.Iterable[DataDesc], contexts: + java.lang.Iterable[Context], epoch: Int) = this { val informationDesc = JavaConverters.asScalaIteratorConverter(inputDescriptors.iterator) .asScala.toIndexedSeq map {a => a: org.apache.mxnet.DataDesc} @@ -79,7 +79,7 @@ class ObjectDetector private[mxnet] (val objDetector: org.apache.mxnet.infer.Obj * @return List of list of tuples of * (class, [probability, xmin, ymin, xmax, ymax]) */ - def objectDetectWithNDArray(input: java.util.List[NDArray], topK: Int): + def objectDetectWithNDArray(input: java.lang.Iterable[NDArray], topK: Int): java.util.List[java.util.List[ObjectDetectorOutput]] = { val ret = objDetector.objectDetectWithNDArray(convert(input.asScala.toIndexedSeq), Some(topK)) (ret map {a => (a map {e => new ObjectDetectorOutput(e._1, e._2)}).asJava}).asJava @@ -92,7 +92,7 @@ class ObjectDetector private[mxnet] (val objDetector: org.apache.mxnet.infer.Obj * @param topK Number of result elements to return, sorted by probability * @return List of list of tuples of (class, probability) */ - def imageBatchObjectDetect(inputBatch: java.util.List[BufferedImage], topK: Int): + def imageBatchObjectDetect(inputBatch: java.lang.Iterable[BufferedImage], topK: Int): java.util.List[java.util.List[ObjectDetectorOutput]] = { val ret = objDetector.imageBatchObjectDetect(inputBatch.asScala, Some(topK)) (ret map {a => (a map {e => new ObjectDetectorOutput(e._1, e._2)}).asJava}).asJava @@ -122,7 +122,7 @@ object ObjectDetector { org.apache.mxnet.infer.ImageClassifier.bufferedImageToPixels(resizedImage, inputImageShape) } - def loadInputBatch(inputImagePaths: java.util.List[String]): java.util.List[BufferedImage] = { + def loadInputBatch(inputImagePaths: java.lang.Iterable[String]): java.util.List[BufferedImage] = { org.apache.mxnet.infer.ImageClassifier .loadInputBatch(inputImagePaths.asScala.toList).toList.asJava } diff --git
a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala index 146fe9310..6c0871fae 100644 --- a/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala +++ b/scala-package/infer/src/main/scala/org/apache/mxnet/infer/javaapi/Predictor.scala @@ -40,8 +40,8 @@ import scala.collection.JavaConverters._ // JavaDoc description of class to be updated in https://issues.apache.org/jira/browse/MXNET-1178 class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor){ - def this(modelPathPrefix: String, inputDescriptors: java.util.List[DataDesc], - contexts: java.util.List[Context], epoch: Int) + def this(modelPathPrefix: String, inputDescriptors: java.lang.Iterable[DataDesc], + contexts: java.lang.Iterable[Context], epoch: Int) = this { val informationDesc = JavaConverters.asScalaIteratorConverter(inputDescriptors.iterator) .asScala.toIndexedSeq map {a => a: org.apache.mxnet.DataDesc} @@ -97,10 +97,10 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor) } /** - * Takes input as List of one dimensional arrays and creates the NDArray needed for inference + * Takes input as a List of one-dimensional Iterables and creates the NDArray needed for inference * The array will be reshaped based on the input descriptors. * - * @param input: A List of a one-dimensional array. + * @param input: A List of one-dimensional Iterables of DType Float. An additional List element is needed when the model has more than one input. * @return Indexed sequence array of outputs */ @@ -118,10 +118,10 @@ class Predictor private[mxnet] (val predictor: org.apache.mxnet.infer.Predictor) * This method is useful when the input is a batch of data * Note: User is responsible for managing allocation/deallocation of input/output NDArrays.
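With the constructors widened from `java.util.List` to `java.lang.Iterable` here, and `predictWithNDArray` widened the same way just below, any Java collection can be passed directly. A minimal sketch mirroring the new unit tests, using only signatures that appear in this patch; the model path is hypothetical:

```java
import org.apache.mxnet.infer.javaapi.Predictor;
import org.apache.mxnet.javaapi.*;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class IterableInputs {
    public static void main(String[] args) {
        List<DataDesc> descs = new ArrayList<>();
        descs.add(new DataDesc("data", new Shape(new int[]{1, 3, 224, 224}),
                               DType.Float32(), "NCHW"));
        List<Context> ctx = new ArrayList<>();
        ctx.add(new Context("cpu", 0));
        // Both arguments are Iterables now, so Lists, Sets, etc. are accepted.
        Predictor predictor = new Predictor("/tmp/resnet18/resnet-18", descs, ctx, 0);

        Set<NDArray> batch = new HashSet<>();   // a Set works as input too
        batch.add(new NDArray(new float[3 * 224 * 224],
                              new Shape(new int[]{1, 3, 224, 224}),
                              new Context("cpu", 0)));
        List<NDArray> out = predictor.predictWithNDArray(batch);
    }
}
```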
* - * @param input List of NDArrays + * @param input Iterable of NDArrays * @return Output of predictions as NDArrays */ - def predictWithNDArray(input: java.util.List[NDArray]): + def predictWithNDArray(input: java.lang.Iterable[NDArray]): java.util.List[NDArray] = { val ret = predictor.predictWithNDArray(convert(JavaConverters .asScalaIteratorConverter(input.iterator).asScala.toIndexedSeq)) diff --git a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java index a5e64911d..3219b5aac 100644 --- a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java +++ b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/ObjectDetectorTest.java @@ -29,7 +29,9 @@ import java.awt.image.BufferedImage; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; public class ObjectDetectorTest { @@ -92,6 +94,17 @@ public void testObjectDetectorWithBatchImage() { Assert.assertEquals(expectedResult, actualResult); } + @Test + public void testObjectDetectorWithIterableOfBatchImage() { + + Set batchImage = new HashSet<>(); + batchImage.add(inputImage); + Mockito.when(objectDetector.imageBatchObjectDetect(batchImage, topK)).thenReturn(expectedResult); + List> actualResult = objectDetector.imageBatchObjectDetect(batchImage, topK); + Mockito.verify(objectDetector, Mockito.times(1)).imageBatchObjectDetect(batchImage, topK); + Assert.assertEquals(expectedResult, actualResult); + } + @Test public void testObjectDetectorWithNDArrayInput() { @@ -103,4 +116,16 @@ public void testObjectDetectorWithNDArrayInput() { Mockito.verify(objectDetector, Mockito.times(1)).objectDetectWithNDArray(inputL, topK); Assert.assertEquals(expectedResult, actualResult); } + + @Test + public void testObjectDetectorWithIterableOfNDArrayInput() { + + NDArray inputArr = ObjectDetector.bufferedImageToPixels(inputImage, getTestShape()); + Set inputL = new HashSet<>(); + inputL.add(inputArr); + Mockito.when(objectDetector.objectDetectWithNDArray(inputL, 5)).thenReturn(expectedResult); + List> actualResult = objectDetector.objectDetectWithNDArray(inputL, topK); + Mockito.verify(objectDetector, Mockito.times(1)).objectDetectWithNDArray(inputL, topK); + Assert.assertEquals(expectedResult, actualResult); + } } diff --git a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java index e7a6c9652..0d83c74fe 100644 --- a/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java +++ b/scala-package/infer/src/test/java/org/apache/mxnet/infer/javaapi/PredictorTest.java @@ -25,9 +25,7 @@ import org.junit.Test; import org.mockito.Mockito; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; +import java.util.*; public class PredictorTest { @@ -80,6 +78,31 @@ public void testPredictWithNDArray() { Assert.assertEquals(expectedResult, actualOutput); } + @Test + public void testPredictWithIterablesNDArray() { + + float[] tmpArr = new float[224]; + for (int y = 0; y < 224; y++) + tmpArr[y] = (int) (Math.random() * 10); + + NDArray arr = new org.apache.mxnet.javaapi.NDArray(tmpArr, new Shape(new int[] {1, 1, 1, 224}), new Context("cpu", 0)); + + Set inputSet = new HashSet<>(); + inputSet.add(arr); + + NDArray expected = new NDArray(tmpArr, new Shape(new int[] {1, 1, 1, 224}), new 
Context("cpu", 0)); + List expectedResult = new ArrayList<>(); + expectedResult.add(expected); + + Mockito.when(mockPredictor.predictWithNDArray(inputSet)).thenReturn(expectedResult); + + List actualOutput = mockPredictor.predictWithNDArray(inputSet); + + Mockito.verify(mockPredictor, Mockito.times(1)).predictWithNDArray(inputSet); + + Assert.assertEquals(expectedResult, actualOutput); + } + @Test public void testPredictWithListOfFloatsAsInput() { List> input = new ArrayList<>(); diff --git a/scala-package/init/pom.xml b/scala-package/init/pom.xml index a0bb6be38..c514177db 100644 --- a/scala-package/init/pom.xml +++ b/scala-package/init/pom.xml @@ -60,6 +60,13 @@
+ + org.apache.maven.plugins + maven-deploy-plugin + + true + + diff --git a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala index 4dfd6eb04..fa3565b4f 100644 --- a/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala +++ b/scala-package/macros/src/main/scala/org/apache/mxnet/javaapi/JavaNDArrayMacro.scala @@ -96,9 +96,9 @@ private[mxnet] object JavaNDArrayMacro extends GeneratorBase { // add default out parameter argDef += s"out: org.apache.mxnet.javaapi.NDArray" if (useParamObject) { - impl += "if (po.getOut() != null) map(\"out\") = po.getOut()" + impl += "if (po.getOut() != null) map(\"out\") = po.getOut().nd" } else { - impl += "if (out != null) map(\"out\") = out" + impl += "if (out != null) map(\"out\") = out.nd" } val returnType = "Array[org.apache.mxnet.javaapi.NDArray]" // scalastyle:off diff --git a/scala-package/mvnw b/scala-package/mvnw new file mode 100755 index 000000000..5551fde8e --- /dev/null +++ b/scala-package/mvnw @@ -0,0 +1,286 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven2 Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" + # TODO classpath? +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." 
+fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper in projects that prohibit checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." + fi + jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.4.2/maven-wrapper-0.4.2.jar" + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + wget "$jarUrl" -O "$wrapperJarPath" + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... using curl" + fi + curl -o "$wrapperJarPath" "$jarUrl" + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + if [ -e "$javaClass" ]; then + if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." + fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." 
+ fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/scala-package/mvnw.cmd b/scala-package/mvnw.cmd new file mode 100755 index 000000000..48363fa60 --- /dev/null +++ b/scala-package/mvnw.cmd @@ -0,0 +1,161 @@ +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. +@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Maven2 Start Up Batch script +@REM +@REM Required ENV vars: +@REM JAVA_HOME - location of a JDK home dir +@REM +@REM Optional ENV vars +@REM M2_HOME - location of maven2's installed home dir +@REM MAVEN_BATCH_ECHO - set to 'on' to enable the echoing of the batch commands +@REM MAVEN_BATCH_PAUSE - set to 'on' to wait for a key stroke before ending +@REM MAVEN_OPTS - parameters passed to the Java VM when running Maven +@REM e.g. 
to debug Maven itself, use +@REM set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +@REM MAVEN_SKIP_RC - flag to disable loading of mavenrc files +@REM ---------------------------------------------------------------------------- + +@REM Begin all REM lines with '@' in case MAVEN_BATCH_ECHO is 'on' +@echo off +@REM set title of command window +title %0 +@REM enable echoing my setting MAVEN_BATCH_ECHO to 'on' +@if "%MAVEN_BATCH_ECHO%" == "on" echo %MAVEN_BATCH_ECHO% + +@REM set %HOME% to equivalent of $HOME +if "%HOME%" == "" (set "HOME=%HOMEDRIVE%%HOMEPATH%") + +@REM Execute a user defined script before this one +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPre +@REM check for pre script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_pre.bat" call "%HOME%\mavenrc_pre.bat" +if exist "%HOME%\mavenrc_pre.cmd" call "%HOME%\mavenrc_pre.cmd" +:skipRcPre + +@setlocal + +set ERROR_CODE=0 + +@REM To isolate internal variables from possible post scripts, we use another setlocal +@setlocal + +@REM ==== START VALIDATION ==== +if not "%JAVA_HOME%" == "" goto OkJHome + +echo. +echo Error: JAVA_HOME not found in your environment. >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +:OkJHome +if exist "%JAVA_HOME%\bin\java.exe" goto init + +echo. +echo Error: JAVA_HOME is set to an invalid directory. >&2 +echo JAVA_HOME = "%JAVA_HOME%" >&2 +echo Please set the JAVA_HOME variable in your environment to match the >&2 +echo location of your Java installation. >&2 +echo. +goto error + +@REM ==== END VALIDATION ==== + +:init + +@REM Find the project base dir, i.e. the directory that contains the folder ".mvn". +@REM Fallback to current working directory if not found. + +set MAVEN_PROJECTBASEDIR=%MAVEN_BASEDIR% +IF NOT "%MAVEN_PROJECTBASEDIR%"=="" goto endDetectBaseDir + +set EXEC_DIR=%CD% +set WDIR=%EXEC_DIR% +:findBaseDir +IF EXIST "%WDIR%"\.mvn goto baseDirFound +cd .. +IF "%WDIR%"=="%CD%" goto baseDirNotFound +set WDIR=%CD% +goto findBaseDir + +:baseDirFound +set MAVEN_PROJECTBASEDIR=%WDIR% +cd "%EXEC_DIR%" +goto endDetectBaseDir + +:baseDirNotFound +set MAVEN_PROJECTBASEDIR=%EXEC_DIR% +cd "%EXEC_DIR%" + +:endDetectBaseDir + +IF NOT EXIST "%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config" goto endReadAdditionalConfig + +@setlocal EnableExtensions EnableDelayedExpansion +for /F "usebackq delims=" %%a in ("%MAVEN_PROJECTBASEDIR%\.mvn\jvm.config") do set JVM_CONFIG_MAVEN_PROPS=!JVM_CONFIG_MAVEN_PROPS! %%a +@endlocal & set JVM_CONFIG_MAVEN_PROPS=%JVM_CONFIG_MAVEN_PROPS% + +:endReadAdditionalConfig + +SET MAVEN_JAVA_EXE="%JAVA_HOME%\bin\java.exe" +set WRAPPER_JAR="%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.jar" +set WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +set DOWNLOAD_URL="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.4.2/maven-wrapper-0.4.2.jar" +FOR /F "tokens=1,2 delims==" %%A IN (%MAVEN_PROJECTBASEDIR%\.mvn\wrapper\maven-wrapper.properties) DO ( + IF "%%A"=="wrapperUrl" SET DOWNLOAD_URL=%%B +) + +@REM Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +@REM This allows using the maven wrapper in projects that prohibit checking in binary data. +if exist %WRAPPER_JAR% ( + echo Found %WRAPPER_JAR% +) else ( + echo Couldn't find %WRAPPER_JAR%, downloading it ... 
+ echo Downloading from: %DOWNLOAD_URL% + powershell -Command "(New-Object Net.WebClient).DownloadFile('%DOWNLOAD_URL%', '%WRAPPER_JAR%')" + echo Finished downloading %WRAPPER_JAR% +) +@REM End of extension + +%MAVEN_JAVA_EXE% %JVM_CONFIG_MAVEN_PROPS% %MAVEN_OPTS% %MAVEN_DEBUG_OPTS% -classpath %WRAPPER_JAR% "-Dmaven.multiModuleProjectDirectory=%MAVEN_PROJECTBASEDIR%" %WRAPPER_LAUNCHER% %MAVEN_CONFIG% %* +if ERRORLEVEL 1 goto error +goto end + +:error +set ERROR_CODE=1 + +:end +@endlocal & set ERROR_CODE=%ERROR_CODE% + +if not "%MAVEN_SKIP_RC%" == "" goto skipRcPost +@REM check for post script, once with legacy .bat ending and once with .cmd ending +if exist "%HOME%\mavenrc_post.bat" call "%HOME%\mavenrc_post.bat" +if exist "%HOME%\mavenrc_post.cmd" call "%HOME%\mavenrc_post.cmd" +:skipRcPost + +@REM pause the script if MAVEN_BATCH_PAUSE is set to 'on' +if "%MAVEN_BATCH_PAUSE%" == "on" pause + +if "%MAVEN_TERMINATE_CMD%" == "on" exit %ERROR_CODE% + +exit /B %ERROR_CODE% diff --git a/scala-package/mxnet-demo/java-demo/README.md b/scala-package/mxnet-demo/java-demo/README.md index 73e9284a9..5dfbf14e8 100644 --- a/scala-package/mxnet-demo/java-demo/README.md +++ b/scala-package/mxnet-demo/java-demo/README.md @@ -1,16 +1,18 @@ # MXNet Java Sample Project This is an project created to use Maven-published Scala/Java package with two Java examples. ## Setup -You can use the `Makefile` to make the Java package. Simply do the following: -```Bash -make javademo +You are required to use Maven to build the package with the following commands: +``` +mvn package ``` -This will load the default parameter for all the environment variable. -If you want to run with GPU on Linux, just simply add `USE_CUDA=1` when you run the make file - +This command will pick the default values specified in the [pom](/~https://github.com/apache/incubator-mxnet/blob/master/scala-package/mxnet-demo/java-demo/pom.xml) file. + +Note: If you are planning to use GPU, please add `-Dmxnet.profile=linux-x86_64-gpu` + +### Use customized version set You can use the following instruction as an alternative to achieve the same result: -User are required to use `mvn package` to build the package, - which are shown below: +You may use `mvn package` to build the package, +using the following commands: ```Bash export SCALA_VERSION_PROFILE=2.11 export SCALA_PKG_PROFILE= @@ -71,9 +73,15 @@ If you want to test run on GPU, you can set a environment variable as follows: export SCALA_TEST_ON_GPU=1 ``` ## Clean up -Clean up for Maven package is simple, you can run the pre-configed `Makefile` as: +Clean up for Maven package is simple: ```Bash -make javaclean +mvn clean +``` + +## Convert to Eclipse project (Optional) +You can convert the maven project to the eclipse one by running the following command: +``` +mvn eclipse:eclipse ``` ## Q & A @@ -87,4 +95,4 @@ sudo apt install libopencv-imgcodecs3.4 Is there any other version available? You can find nightly release version from [here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~1.5.0-SNAPSHOT~~). -Please keep the same version in the Makefile or [above version](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo. +Please keep the same version in the pom file or [other versions in here](https://repository.apache.org/#nexus-search;gav~org.apache.mxnet~~~~) to run this demo. 
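Both wrapper scripts added earlier in this patch (`mvnw` and `mvnw.cmd`) bootstrap Maven by fetching `maven-wrapper.jar` when it is missing, via wget, curl, or PowerShell, with a Java class compiled on the fly as the last resort. A standalone sketch of that Java fallback (a hypothetical minimal `MavenWrapperDownloader`; the URL is the default baked into the scripts):

```java
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class WrapperDownload {
    public static void main(String[] args) throws Exception {
        String jarUrl = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.4.2/maven-wrapper-0.4.2.jar";
        Path dest = Paths.get(".mvn", "wrapper", "maven-wrapper.jar");
        Files.createDirectories(dest.getParent());
        try (InputStream in = new URL(jarUrl).openStream()) {
            // Stream the jar to its expected location, overwriting any partial download.
            Files.copy(in, dest, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}
```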
\ No newline at end of file diff --git a/scala-package/mxnet-demo/java-demo/bin/java_sample.sh b/scala-package/mxnet-demo/java-demo/bin/java_sample.sh old mode 100755 new mode 100644 index 4fb724aca..05a45d7cb --- a/scala-package/mxnet-demo/java-demo/bin/java_sample.sh +++ b/scala-package/mxnet-demo/java-demo/bin/java_sample.sh @@ -17,4 +17,4 @@ #!/bin/bash CURR_DIR=$(cd $(dirname $0)/../; pwd) CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/dependency/* -java -Xmx8G -cp $CLASSPATH mxnet.HelloWorld \ No newline at end of file +java -Xmx8G -cp $CLASSPATH mxnet.HelloWorld diff --git a/scala-package/mxnet-demo/java-demo/bin/run_od.sh b/scala-package/mxnet-demo/java-demo/bin/run_od.sh old mode 100755 new mode 100644 index abd0bf5b1..4370518dc --- a/scala-package/mxnet-demo/java-demo/bin/run_od.sh +++ b/scala-package/mxnet-demo/java-demo/bin/run_od.sh @@ -17,4 +17,4 @@ #!/bin/bash CURR_DIR=$(cd $(dirname $0)/../; pwd) CLASSPATH=$CLASSPATH:$CURR_DIR/target/*:$CLASSPATH:$CURR_DIR/target/dependency/* -java -Xmx8G -cp $CLASSPATH mxnet.ObjectDetection \ No newline at end of file +java -Xmx8G -cp $CLASSPATH mxnet.ObjectDetection diff --git a/scala-package/mxnet-demo/java-demo/pom.xml b/scala-package/mxnet-demo/java-demo/pom.xml index b7502a66c..05d04b9e6 100644 --- a/scala-package/mxnet-demo/java-demo/pom.xml +++ b/scala-package/mxnet-demo/java-demo/pom.xml @@ -8,6 +8,13 @@ 1.0-SNAPSHOT MXNet Java Demo + + 1.8 + 1.8 + [1.5.0-SNAPSHOT, ) + 2.11 + + ci-nightly @@ -21,7 +28,30 @@ [1.5.0-SNAPSHOT, ) - + + osx-x86_64 + + + mac + + + + osx-x86_64-cpu + + + + linux-x86_64 + + + unix + Linux + + + + linux-x86_64-cpu + + + @@ -30,12 +60,6 @@ - - 1.8 - 1.8 - [1.5.0-SNAPSHOT, ) - - org.apache.mxnet diff --git a/scala-package/mxnet-demo/scala-demo/Makefile b/scala-package/mxnet-demo/scala-demo/Makefile deleted file mode 100644 index 9f7f2840e..000000000 --- a/scala-package/mxnet-demo/scala-demo/Makefile +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
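For reference, `bin/java_sample.sh` above launches a class named `mxnet.HelloWorld` from the demo classpath. Its source is not part of this patch; a hypothetical minimal version of that entry point, just enough to confirm the native library loads, could look like:

```java
package mxnet;

import org.apache.mxnet.javaapi.Context;
import org.apache.mxnet.javaapi.NDArray;
import org.apache.mxnet.javaapi.Shape;

public class HelloWorld {
    public static void main(String[] args) {
        // Creating any NDArray forces libmxnet to load, so this doubles as a smoke test.
        NDArray nd = new NDArray(new float[]{1f, 1f, 1f, 1f},
                                 new Shape(new int[]{2, 2}),
                                 new Context("cpu", 0));
        System.out.println(nd);
    }
}
```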
- -SCALA_VERSION_PROFILE := 2.11 -SCALA_VERSION := 2.11.8 - -ifeq ($(OS),Windows_NT) - UNAME_S := Windows -else - UNAME_S := $(shell uname -s) -endif - -ifeq ($(UNAME_S), Windows) - # TODO: currently scala package does not support windows - SCALA_PKG_PROFILE := windows -else - ifeq ($(UNAME_S), Darwin) - SCALA_PKG_PROFILE := osx-x86_64-cpu - else - SCALA_PKG_PROFILE := linux-x86_64 - ifeq ($(USE_CUDA), 1) - SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-gpu - else - SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-cpu - endif - endif -endif - -scalademo: - (mvn package -Dmxnet.profile=$(SCALA_PKG_PROFILE) \ - -Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \ - -Dscala.version=$(SCALA_VERSION)) - -scala_ci_demo: - (mvn -Pci-nightly package -Dmxnet.profile=$(SCALA_PKG_PROFILE) \ - -Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \ - -Dscala.version=$(SCALA_VERSION)) - -scalaclean: - (mvn clean -Dmxnet.profile=$(SCALA_PKG_PROFILE) \ - -Dmxnet.scalaprofile=$(SCALA_VERSION_PROFILE) \ - -Dscala.version=$(SCALA_VERSION)) \ No newline at end of file diff --git a/scala-package/mxnet-demo/scala-demo/README.md b/scala-package/mxnet-demo/scala-demo/README.md index 1cc5a6d42..b994a196b 100644 --- a/scala-package/mxnet-demo/scala-demo/README.md +++ b/scala-package/mxnet-demo/scala-demo/README.md @@ -1,7 +1,15 @@ # MXNet Scala Sample Project This is an project created to use Maven-published Scala package with two Scala examples. ## Setup -User are required to use `mvn package` to build the package, +You are required to use maven to build the package, by running the following: +``` +mvn package +``` +This command will pick the default values specified in the pom file. + +Note: If you are planning to use GPU, please add `-Dmxnet.profile=linux-x86_64-gpu` + +### Use customized version set which are shown below: ```Bash export SCALA_VERSION_PROFILE=2.11 SCALA_VERSION=2.11.8 @@ -14,13 +22,6 @@ These environment variable (`SCALA_PKG_PROFILE`, `SCALA_VERSION_PROFILE`, `SCALA should be set before executing the line above. To obtain the most recent MXNet version, please click [here](https://mvnrepository.com/search?q=org.apache.mxnet) - -You can also use the `Makefile` as an alternative to do the same thing. Simply do the following: -```Bash -make scalademo -``` -This will load the default parameter for all the environment variable. - If you want to run with GPU on Linux, just simply add `USE_CUDA=1` when you run the make file ## Run ### Hello World @@ -54,9 +55,9 @@ If you want to test run on GPU, you can set a environment variable as follows: export SCALA_TEST_ON_GPU=1 ``` ## Clean up -Clean up for Maven package is simple, you can run the pre-configed `Makefile` as: +To clean up a Maven package, run the following: ```Bash -make scalaclean +mvn clean ``` ## Q & A diff --git a/scala-package/mxnet-demo/scala-demo/pom.xml b/scala-package/mxnet-demo/scala-demo/pom.xml index 1d89f9765..37d60d133 100644 --- a/scala-package/mxnet-demo/scala-demo/pom.xml +++ b/scala-package/mxnet-demo/scala-demo/pom.xml @@ -22,10 +22,35 @@ [1.5.0-SNAPSHOT, ) + + osx-x86_64 + + + mac + + + + osx-x86_64-cpu + + + + linux-x86_64 + + + unix + Linux + + + + linux-x86_64-cpu + + + 2.11 [1.3.1, ) + 2.11.8 diff --git a/scala-package/packageTest/README.md b/scala-package/packageTest/README.md index 3f1eeb842..e9980f353 100644 --- a/scala-package/packageTest/README.md +++ b/scala-package/packageTest/README.md @@ -7,7 +7,7 @@ This is an project created to run the test suite on a fully packaged mxnet jar. To setup the packageTest, you must first build your tests. 
To build the tests, follow these steps from the mxnet main directory: 1. Build MXNet and the scala package from source following the directions [here](https://mxnet.incubator.apache.org/install/scala_setup.html#source) -2. Build the tests by running `make scalatestcompile`. +2. Build the tests by running `mvn test-compile`. 3. Follow setup instructions below for your testing goal ## Running @@ -18,13 +18,13 @@ There are three different modes of operation for testing based on the location o If you have a jar file, you can install it to your maven cache repository(`~/.m2/repository`). This might be useful if you acquire the .jar file from elsewhere. To install, it is easiest to use `mvn install:install-file -Dfile= -DpomFile=`. If the pom file is not available, you can also run `mvn install:install-file -Dfile= -DgroupId= -DartifactId= -Dversion= -Dpackaging=`. With the full mxnet jar, this might look like `mvn install:install-file -Dfile= -DgroupId=org.apache.mxnet -DartifactId=mxnet-full_2.11-linux-x86_64-cpu -Dversion=1.3.0 -Dpackaging=jar`. -You can also run `make scalainstall` to install from a local build. +You can also run `mvn install` to install from a local build. After installing, run `make testinstall` in the package test directory to run the tests. Note that unless you also install an additional mxnetexamples jar, you can only run the unit tests. ### Test Local Deployment -To test the jars that would be produced by a deployment, you can run `make scaladeploylocal` from the main mxnet directory. This produces a local snapshot located at `scala-package/local-snapshot`. To test this local snapshot, run `make testlocal`. +To test the jars that would be produced by a deployment, you can run `mvn deploy` from the main mxnet directory. This produces a local snapshot located at `scala-package/deploy/target/repo`. To test this local snapshot, run `make testlocal`. It also installs the component packages needed for testing the examples in `scala-package/*/target/repo`. ### Remote Repository Snapshot @@ -36,11 +36,13 @@ Test the snapshot repo using `make testsnapshot` or a different repo using `make You are able to run unit tests, integration tests, or both using this utility. To run the unit tests, add the flag `UNIT=1` to make (e.g. `make testsnapshot UNIT=1`). Use `INTEGRATION=1` for integration tests. The default behavior is to run both the unit and integration tests. However, the integration tests require that the mxnet examples be installed in addition to the full mxnet package (see test mode instructions above). +For running on GPU, add the flag `USE_CUDA=1`. + An additional option, you can specify the mxnet version with `MXNET_VERSION=1.3.1-SNAPSHOT`. ## Cleaning Up -You can clean temporary files and target artifacts by running `make scalaclean`. +You can clean temporary files and target artifacts by running `make clean`. 
## Troubleshooting diff --git a/scala-package/packageTest/examples/pom.xml b/scala-package/packageTest/examples/pom.xml index e11be657e..070b78dda 100644 --- a/scala-package/packageTest/examples/pom.xml +++ b/scala-package/packageTest/examples/pom.xml @@ -21,6 +21,53 @@ false + + fromLocal + + + parent + file://${basedir}/../../target/repo + + true + + + + init + file://${basedir}/../../init/target/repo + + true + + + + macros + file://${basedir}/../../macros/target/repo + + true + + + + core + file://${basedir}/../../core/target/repo + + true + + + + infer + file://${basedir}/../../infer/target/repo + + true + + + + examples + file://${basedir}/../../examples/target/repo + + true + + + + @@ -39,8 +86,8 @@ org.apache.mxnet - mxnet-examples_${mxnet.scalaprofile} - ${mxnet.version} + mxnet-examples + INTERNAL test diff --git a/scala-package/packageTest/pom.xml b/scala-package/packageTest/pom.xml index f7d9e3b18..f6a16dd77 100644 --- a/scala-package/packageTest/pom.xml +++ b/scala-package/packageTest/pom.xml @@ -41,7 +41,7 @@ fromLocal - local-snapshot + full file://${basedir}/../deploy/target/repo true diff --git a/scala-package/pom.xml b/scala-package/pom.xml index 6665e953d..ca2fbbd8f 100644 --- a/scala-package/pom.xml +++ b/scala-package/pom.xml @@ -43,6 +43,7 @@ $ ${project.basedir}/.. true + file://${project.build.directory}/repo pom @@ -131,7 +132,7 @@ bash - -c 'mkdir -p ${project.build.directory}; if [[ $(ldd ${MXNET_DIR}/lib/libmxnet.so | grep libcuda.so | wc -l) == "0" ]]; then echo flavor=cpu > ${project.build.directory}/flavor.properties; else echo flavor=gpu > ${project.build.directory}/flavor.properties; fi' + -c 'mkdir -p ${project.build.directory}; if [[ $(ldd ${MXNET_DIR}/lib/libmxnet.so | grep libcuda | wc -l) == "0" ]]; then echo flavor=cpu > ${project.build.directory}/flavor.properties; else echo flavor=gpu > ${project.build.directory}/flavor.properties; fi' @@ -158,6 +159,34 @@ + + staging + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + + + + nightly + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + + @@ -337,7 +366,8 @@ org.apache.maven.plugins maven-deploy-plugin - true + false + deployrepo::default::${repo_url} @@ -383,6 +413,24 @@ test + + + org.scala-lang + scala-library + ${scala.version} + + + org.scala-lang + scala-reflect + ${scala.version} + + + junit + junit + 4.11 + test + + org.scala-lang diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 80bd60538..b436e8ca6 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -22,20 +22,6 @@ * \file c_api.cc * \brief C API of mxnet */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include @@ -43,6 +29,21 @@ #include #include #include +#include "dmlc/base.h" +#include "dmlc/logging.h" +#include "dmlc/io.h" +#include "dmlc/memory_io.h" +#include "dmlc/recordio.h" +#include "dmlc/omp.h" +#include "mxnet/base.h" +#include "mxnet/ndarray.h" +#include "mxnet/operator.h" +#include "mxnet/io.h" +#include "mxnet/c_api.h" +#include "mxnet/kvstore.h" +#include "mxnet/rtc.h" +#include "mxnet/storage.h" +#include "mxnet/mxfeatures.h" #include "./c_api_common.h" #include "../operator/custom/custom-inl.h" #include "../operator/tensor/matrix_op-inl.h" @@ -85,6 +86,13 @@ inline int MXAPIGetFunctionRegInfo(const FunRegType *e, } // NOTE: return value is added in API_END + +int MXHasFeature(const mx_uint feature, bool* out) { + API_BEGIN(); + *out = 
features::is_enabled(feature); + API_END(); +} + int MXRandomSeed(int seed) { API_BEGIN(); mxnet::RandomSeed(seed); diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index e2e53c726..66566ed70 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -645,8 +645,6 @@ int MXExecutorGetOptimizedSymbol(ExecutorHandle handle, API_END_HANDLE_ERROR(delete s); } - - int MXExecutorSetMonitorCallback(ExecutorHandle handle, ExecutorMonitorCallback callback, void* callback_handle) { @@ -658,6 +656,22 @@ int MXExecutorSetMonitorCallback(ExecutorHandle handle, callback_temp(name, handle, callback_handle_temp); }; Executor *exec = static_cast(handle); - exec->SetMonitorCallback(clbk); + exec->SetMonitorCallback(clbk, false); + API_END(); +} + +int MXExecutorSetMonitorCallbackEX(ExecutorHandle handle, + ExecutorMonitorCallback callback, + void* callback_handle, + bool monitor_all) { + API_BEGIN(); + ExecutorMonitorCallback callback_temp = callback; + void* callback_handle_temp = callback_handle; + std::function clbk + = [callback_temp, callback_handle_temp](const char *name, void* handle) { + callback_temp(name, handle, callback_handle_temp); + }; + Executor *exec = static_cast(handle); + exec->SetMonitorCallback(clbk, monitor_all); API_END(); } diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc index 6bb5a9656..51213fbc8 100644 --- a/src/c_api/c_api_profile.cc +++ b/src/c_api/c_api_profile.cc @@ -225,25 +225,26 @@ struct ProfileConfigParam : public dmlc::Parameter { int profile_process; DMLC_DECLARE_PARAMETER(ProfileConfigParam) { DMLC_DECLARE_FIELD(profile_all).set_default(false) - .describe("Profile all."); + .describe("Profile all. Default is False."); DMLC_DECLARE_FIELD(profile_symbolic).set_default(true) - .describe("Profile symbolic operators."); + .describe("Profile symbolic operators. Default is True."); DMLC_DECLARE_FIELD(profile_imperative).set_default(true) - .describe("Profile imperative operators."); + .describe("Profile imperative operators. Default is True."); DMLC_DECLARE_FIELD(profile_memory).set_default(true) - .describe("Profile memory."); + .describe("Profile memory. Default is True."); DMLC_DECLARE_FIELD(profile_api).set_default(true) - .describe("Profile C API."); + .describe("Profile C API. Default is True."); DMLC_DECLARE_FIELD(filename).set_default("profile.json") .describe("File name to write profiling info."); DMLC_DECLARE_FIELD(continuous_dump).set_default(true) - .describe("Periodically dump (and append) priofling data to file while running."); + .describe("Periodically dump (and append) profiling data to file while running. " + "Default is True."); DMLC_DECLARE_FIELD(dump_period).set_default(1.0f) .describe("When continuous dump is enabled, the period between subsequent " "profile info dumping."); DMLC_DECLARE_FIELD(aggregate_stats).set_default(false) .describe("Maintain aggregate stats, required for MXDumpAggregateStats. Note that " - "this can have anegative performance impact."); + "this can have a negative performance impact. 
Default is False."); DMLC_DECLARE_FIELD(profile_process) .add_enum("worker", static_cast(ProfileProcess::kWorker)) .add_enum("server", static_cast(ProfileProcess::kServer)) diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc index 9dc6b50ae..32a9a1ee2 100644 --- a/src/c_api/c_api_symbolic.cc +++ b/src/c_api/c_api_symbolic.cc @@ -22,12 +22,12 @@ * \file c_api_symbolic.cc * \brief C API of mxnet */ -#include -#include -#include -#include -#include -#include +#include "mxnet/base.h" +#include "mxnet/c_api.h" +#include "nnvm/c_api.h" +#include "nnvm/pass.h" +#include "nnvm/pass_functions.h" +#include "nnvm/symbolic.h" #if MXNET_USE_NGRAPH == 1 #include #endif @@ -675,7 +675,6 @@ int MXQuantizeSymbol(SymbolHandle sym_handle, g.attrs["excluded_nodes"] = std::make_shared(std::move(excluded_node_names)); g.attrs["offline_params"] = std::make_shared(std::move(offline)); g.attrs["quantized_dtype"] = std::make_shared(std::move(quantized_type)); - g.attrs["calib_quantize"] = std::make_shared(calib_quantize); g = ApplyPass(std::move(g), "QuantizeGraph"); s->outputs = g.outputs; *ret_sym_handle = s; @@ -692,10 +691,9 @@ int MXSetCalibTableToQuantizedSymbol(SymbolHandle qsym_handle, API_BEGIN(); nnvm::Symbol* sym = static_cast(qsym_handle); nnvm::Graph g = Symbol2Graph(*sym); - const std::string prefix = "quantized_"; std::unordered_map> calib_table; for (size_t i = 0; i < num_layers; ++i) { - calib_table.emplace(prefix+layer_names[i], std::make_pair(min_ranges[i], max_ranges[i])); + calib_table.emplace(layer_names[i], std::make_pair(min_ranges[i], max_ranges[i])); } g.attrs["calib_table"] = std::make_shared(std::move(calib_table)); g = ApplyPass(std::move(g), "SetCalibTableToQuantizedGraph"); diff --git a/src/engine/naive_engine.cc b/src/engine/naive_engine.cc index 152e62ca1..21cb7475b 100644 --- a/src/engine/naive_engine.cc +++ b/src/engine/naive_engine.cc @@ -212,7 +212,8 @@ class NaiveEngine final : public Engine { private: // callback to oncomplete - static void OnComplete(Engine *engine, void *param) { + static void OnComplete(Engine *engine, void *param, + const dmlc::Error* error) { static_cast(engine)->req_completed_ = true; } // whether action is completed diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h index 516e04bf5..8d44d9c15 100644 --- a/src/engine/stream_manager.h +++ b/src/engine/stream_manager.h @@ -67,7 +67,7 @@ RunContext StreamManager::GetRunContext( RunContext ret; switch (ctx.dev_mask()) { case cpu::kDevMask: - ret = RunContext{ctx, nullptr}; + ret = RunContext{ctx, nullptr, false}; break; case gpu::kDevMask: { #if MXNET_USE_CUDA @@ -85,7 +85,9 @@ RunContext StreamManager::GetRunContext( use_counter = counter; counter = (counter + 1) % kStreams; } - ret = RunContext{ctx, gpu_streams_.at(ctx.dev_id).at(use_counter)}; + ret = RunContext{ctx, + gpu_streams_.at(ctx.dev_id).at(use_counter), + false}; break; #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; @@ -103,7 +105,7 @@ RunContext StreamManager::GetIORunContext( RunContext ret; switch (ctx.dev_mask()) { case cpu::kDevMask: - ret = RunContext{ctx, nullptr}; + ret = RunContext{ctx, nullptr, false}; break; case gpu::kDevMask: { #if MXNET_USE_CUDA @@ -114,7 +116,7 @@ RunContext StreamManager::GetIORunContext( gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream(false, false, ctx.dev_id); } } - ret = RunContext{ctx, gpu_io_streams_.at(ctx.dev_id)}; + ret = RunContext{ctx, gpu_io_streams_.at(ctx.dev_id), false}; break; #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; diff --git 
a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc index 3a7587fef..6a6004011 100644 --- a/src/engine/threaded_engine.cc +++ b/src/engine/threaded_engine.cc @@ -478,10 +478,14 @@ inline void ThreadedEngine::ThrowException(ThreadedVar* threaded_var) { return; } -void ThreadedEngine::OnCompleteStatic( - Engine *engine, void *opr_block_) { +void ThreadedEngine::OnCompleteStatic(Engine *engine, void *opr_block_, + const dmlc::Error* error) { OprBlock *opr_block = static_cast(opr_block_); ThreadedOpr *threaded_opr = opr_block->opr; + if (error != nullptr) { + auto ex_p = std::make_exception_ptr(*error); + threaded_opr->opr_exception = std::make_shared(ex_p); + } if (opr_block->profiling && threaded_opr->opr_name) { // record operator end timestamp opr_block->opr_profile->stop(); diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h index ccfd09d64..4a2d4196d 100644 --- a/src/engine/threaded_engine.h +++ b/src/engine/threaded_engine.h @@ -403,6 +403,10 @@ class ThreadedEngine : public Engine { BulkStatus& bulk_status = *BulkStatusStore::Get(); std::swap(bulk_status.bulk_size, bulk_size); if (bulk_status.count >= bulk_status.bulk_size) BulkFlush(); + if (!bulk_status.functions) { + bulk_status.functions.reset(new std::vector()); + } + bulk_status.functions->reserve(bulk_size); return bulk_size; } @@ -416,7 +420,7 @@ class ThreadedEngine : public Engine { /*! \brief context of current ops */ Context ctx; /*! \brief current op functions */ - SyncFn fn; + std::shared_ptr> functions; /*! \brief constant variables */ std::vector const_vars; /*! \brief mutable variables */ @@ -465,21 +469,19 @@ class ThreadedEngine : public Engine { } } - static void OnCompleteStatic(Engine *engine, void *threaded_opr); + static void OnCompleteStatic(Engine *engine, void *threaded_opr, + const dmlc::Error* error); /*! 
\brief append an operator to bulk */ inline void BulkAppend(SyncFn exec_fn, Context exec_ctx, std::vector const& const_vars, std::vector const& mutable_vars) { BulkStatus& bulk_status = *BulkStatusStore::Get(); + if (!bulk_status.functions) { + bulk_status.functions.reset(new std::vector()); + } + bulk_status.functions->push_back(exec_fn); if (!bulk_status.count) { bulk_status.ctx = exec_ctx; - bulk_status.fn = std::move(exec_fn); - } else { - auto prev_fn = std::move(bulk_status.fn); - bulk_status.fn = [exec_fn, prev_fn](RunContext rctx) { - prev_fn(rctx); - exec_fn(rctx); - }; } ++bulk_status.count; @@ -496,13 +498,23 @@ class ThreadedEngine : public Engine { if (!bulk_status.count) return; bulk_status.count = 0; DeduplicateVarHandle(&bulk_status.const_vars, &bulk_status.mutable_vars); - SyncFn fn = std::move(bulk_status.fn); - this->PushAsync([fn](RunContext ctx, CallbackOnComplete on_complete) { - fn(ctx); + auto functions = bulk_status.functions; + this->PushAsync([functions](RunContext ctx, CallbackOnComplete on_complete) { + ctx.is_bulk = true; + for (auto& fn : *functions) { + fn(ctx); + } + ctx.is_bulk = false; + bool is_gpu = ctx.ctx.dev_mask() == gpu::kDevMask; + if (is_gpu) { + ctx.get_stream()->Wait(); + } on_complete(); }, bulk_status.ctx, bulk_status.const_vars, bulk_status.mutable_vars, FnProperty::kNormal, 0, "ImperativeBulk"); + bulk_status.functions.reset(new std::vector()); + bulk_status.functions->reserve(bulk_status.bulk_size); bulk_status.const_vars.clear(); bulk_status.mutable_vars.clear(); } diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc index 56122cda6..aa03a1104 100644 --- a/src/executor/attach_op_resource_pass.cc +++ b/src/executor/attach_op_resource_pass.cc @@ -55,8 +55,6 @@ void AttachOpResources( const auto op = inode.source->op(); const bool rsc_req = (fresource.count(op) != 0); const bool rsc_ex_req = (fresource_ex.count(op) != 0); - CHECK(!(rsc_req && rsc_ex_req)) - << "An operator could not register both ResourceRequestEx and ResourceRequest"; if (rsc_req || rsc_ex_req) { auto reqs = rsc_ex_req ? fresource_ex[op](inode.source->attrs, dev_masks[nid], @@ -64,20 +62,34 @@ void AttachOpResources( : fresource[op](inode.source->attrs); // Get the resource of temporal space. for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - if (cached_temp.count(ctx) != 0) { - requested.push_back(cached_temp.at(ctx)); - } else { - Resource r = ResourceManager::Get()->Request(ctx, req); - requested.push_back(r); - cached_temp[ctx] = r; + switch (req.type) { + case ResourceRequest::kTempSpace: { + // the scope is needed when there's new declaration of variable. 
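The BulkAppend/BulkFlush change above replaces closure nesting, where each appended op wrapped the previous lambda and the call depth grew with every append, with a flat vector of functions replayed in order at flush time. The shape of the refactor, as a generic Java illustration rather than MXNet code:

```java
import java.util.ArrayList;
import java.util.List;

class BulkSegment {
    private final List<Runnable> functions = new ArrayList<>();

    void append(Runnable fn) {
        functions.add(fn);   // O(1); no nested closure per appended op
    }

    void flush() {
        for (Runnable fn : functions) {
            fn.run();        // replay the whole segment in submission order
        }
        functions.clear();
    }
}
```

Keeping the ops in a list also lets the flush mark the whole run as bulk (`ctx.is_bulk`) and issue a single stream wait at the end on GPU, which is exactly what the new `BulkFlush` does.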
diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc index 56122cda6..aa03a1104 100644 --- a/src/executor/attach_op_resource_pass.cc +++ b/src/executor/attach_op_resource_pass.cc
@@ -55,8 +55,6 @@ void AttachOpResources( const auto op = inode.source->op(); const bool rsc_req = (fresource.count(op) != 0); const bool rsc_ex_req = (fresource_ex.count(op) != 0); - CHECK(!(rsc_req && rsc_ex_req)) - << "An operator could not register both ResourceRequestEx and ResourceRequest"; if (rsc_req || rsc_ex_req) { auto reqs = rsc_ex_req ? fresource_ex[op](inode.source->attrs, dev_masks[nid],
@@ -64,20 +62,34 @@ : fresource[op](inode.source->attrs); // Get the resource of temporal space. for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - if (cached_temp.count(ctx) != 0) { - requested.push_back(cached_temp.at(ctx)); - } else { - Resource r = ResourceManager::Get()->Request(ctx, req); - requested.push_back(r); - cached_temp[ctx] = r; + switch (req.type) { + case ResourceRequest::kTempSpace: { + // the scope is needed because the case declares new variables + if (cached_temp.count(ctx) != 0) { + requested.push_back(cached_temp.at(ctx)); + } else { + Resource r = ResourceManager::Get()->Request(ctx, req); + requested.push_back(r); + cached_temp[ctx] = r; + } + break; } - } else if (req.type == ResourceRequest::kRandom) { - requested.push_back(ResourceManager::Get()->Request(ctx, req)); - } else if (req.type == ResourceRequest::kParallelRandom) { - requested.push_back(ResourceManager::Get()->Request(ctx, req)); - } else { - LOG(FATAL) << "resource type not yet supported"; + case ResourceRequest::kRandom: { + requested.push_back(ResourceManager::Get()->Request(ctx, req)); + break; + } + case ResourceRequest::kParallelRandom: { + requested.push_back(ResourceManager::Get()->Request(ctx, req)); + break; + } +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + case ResourceRequest::kCuDNNDropoutDesc: { + requested.push_back(ResourceManager::Get()->Request(ctx, req)); + break; + } +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + default: + LOG(FATAL) << "resource type " << req.type << " is not yet supported"; } } CHECK(vdispatch[nid] != DispatchMode::kUndefined);
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 03413d03b..38b38c8cf 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc
@@ -101,9 +101,10 @@ void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) os << "Total " << 11 << " TempSpace resource requested\n"; } -void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback) { +void GraphExecutor::SetMonitorCallback(const MonitorCallback& callback, bool monitor_all) { CHECK(callback) << "invalid callback"; monitor_callback_ = callback; + monitor_all_ = monitor_all; } const std::vector<NDArray>& GraphExecutor::outputs() const {
@@ -1306,7 +1307,36 @@ void GraphExecutor::BulkInferenceOpSegs() { } } -void GraphExecutor::ExecuteMonCallback(size_t nid) { +void GraphExecutor::ExecuteMonInputCallback(size_t nid) { + static const auto& flist_inputs = + nnvm::Op::GetAttr<nnvm::FListInputNames>("FListInputNames"); + const auto& idx = graph_.indexed_graph(); + std::vector<std::string> input_names; + OpNode& opnode = op_nodes_[nid]; + const auto& inode = idx[nid]; + const auto& node = idx[nid].source; + if (flist_inputs.count(node->op())) { + input_names = flist_inputs[node->op()](node->attrs); + } else { + for (size_t i = 0; i < node->num_inputs(); ++i) { + input_names.emplace_back("input" + std::to_string(i)); + } + } + CHECK_EQ(opnode.exec->in_array.size(), input_names.size()); + for (size_t i = 0; i < opnode.exec->in_array.size(); ++i) { + if (node->inputs[i].node->is_variable()) { + // Monitor variable + NDArray *cpy = new NDArray(opnode.exec->in_array[i]); + std::string name = node->inputs[i].node->attrs.name; + this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy)); + } + NDArray *cpy = new NDArray(opnode.exec->in_array[i]); + std::string name = inode.source->attrs.name + "_" + input_names[i]; + this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy)); + } +} + +void GraphExecutor::ExecuteMonOutputCallback(size_t nid) { static const auto& flist_outputs = nnvm::Op::GetAttr<nnvm::FListOutputNames>("FListOutputNames"); const auto& idx = graph_.indexed_graph();
@@ -1356,6 +1386,10 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { if (inode.source->is_variable()) continue; OpNode& opnode = op_nodes_[nid]; if (op_nodes_[nid].skip_exec_node) continue; + // Monitor callbacks + if (monitor_callback_ && monitor_all_) { + ExecuteMonInputCallback(nid); + } opnode.exec->op_ctx.is_train = is_train; opnode.exec->op_ctx.need_grad = need_grad_; if (opnode.exec->exec_type() == ExecType::kCrossDeviceCopy) {
@@ -1374,7 +1408,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { } // Monitor callbacks if (monitor_callback_) { - ExecuteMonCallback(nid); + ExecuteMonOutputCallback(nid); } } }
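With monitor_all set, the executor now fires the callback for operator inputs before execution as well as for outputs afterwards. A compact sketch of how a callback with the (const char*, void*) shape seen above might be driven; Tensor and run_node are stand-ins, not MXNet types:

#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Tensor { std::string data; };
using MonitorCallback = std::function<void(const char*, void*)>;

void run_node(const std::string& op_name,
              const std::vector<Tensor*>& inputs, Tensor* output,
              const MonitorCallback& cb, bool monitor_all) {
  if (cb && monitor_all) {                       // report inputs before running the op
    for (size_t i = 0; i < inputs.size(); ++i) {
      std::string name = op_name + "_input" + std::to_string(i);
      cb(name.c_str(), static_cast<void*>(inputs[i]));
    }
  }
  // ... execute the operator here ...
  if (cb) cb((op_name + "_output").c_str(), static_cast<void*>(output));
}

int main() {
  Tensor x{"x"}, w{"w"}, y{"y"};
  run_node("fc", {&x, &w}, &y,
           [](const char* name, void* t) {
             std::cout << name << " -> " << static_cast<Tensor*>(t)->data << '\n';
           },
           /*monitor_all=*/true);
}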
diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index 7b2834d5c..342caa512 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h
@@ -68,7 +68,7 @@ class GraphExecutor : public Executor { const std::unordered_map<std::string, NDArray>& arg_grad_map() const override; const std::unordered_map<std::string, NDArray>& aux_state_map() const override; void Print(std::ostream &os) const override; // NOLINT(*) - void SetMonitorCallback(const MonitorCallback& callback) override; + void SetMonitorCallback(const MonitorCallback& callback, bool monitor_all = false) override; // Initialize the rest of attributes // after setting up arguments. void FinishInitGraph(nnvm::Symbol symbol, nnvm::Graph g,
@@ -213,8 +213,10 @@ class GraphExecutor : public Executor { * ret.opr Can be nullptr if creation failed. */ CachedSegOpr CreateCachedSegOpr(size_t topo_start, size_t topo_end); - // run the monitor callback for node `nid` - void ExecuteMonCallback(size_t nid); + // run the monitor callback for input of node `nid` + void ExecuteMonInputCallback(size_t nid); + // run the monitor callback for output of node `nid` + void ExecuteMonOutputCallback(size_t nid); // perform bulking and segmentation on an inference graph void BulkInferenceOpSegs(); // perform bulking and segmentation on a training graph
@@ -254,6 +256,8 @@ class GraphExecutor : public Executor { size_t num_forward_nodes_{0}; // monitor call back std::function<void(const char *, void *)> monitor_callback_{nullptr}; + // monitor both input and output from monitor call back + bool monitor_all_{false}; // whether to enable bulk execution bool prefer_bulk_execution_; // cached segment operator
diff --git a/src/executor/onnx_to_tensorrt.cc b/src/executor/onnx_to_tensorrt.cc index e3a4ae868..f7fbc8f81 100644 --- a/src/executor/onnx_to_tensorrt.cc +++ b/src/executor/onnx_to_tensorrt.cc
@@ -28,7 +28,7 @@ #include "./onnx_to_tensorrt.h" -#include <onnx/onnx.pb.h> +#include <onnx/onnx_pb.h> #include #include
@@ -100,8 +100,8 @@ nvinfer1::ICudaEngine* onnxToTrtCtx( } if ( !trt_parser->parse(onnx_model.c_str(), onnx_model.size()) ) { - int nerror = trt_parser->getNbErrors(); - for ( int i=0; i < nerror; ++i ) { + size_t nerror = trt_parser->getNbErrors(); + for ( size_t i=0; i < nerror; ++i ) { nvonnxparser::IParserError const* error = trt_parser->getError(i); if ( error->node() != -1 ) { ::ONNX_NAMESPACE::NodeProto const& node =
diff --git a/src/executor/tensorrt_pass.cc b/src/executor/tensorrt_pass.cc index d26704c35..762dc0de9 100644 --- a/src/executor/tensorrt_pass.cc +++ b/src/executor/tensorrt_pass.cc
@@ -31,7 +31,7 @@ #include #include #include -#include <onnx/onnx.pb.h> +#include <onnx/onnx_pb.h> #include "../operator/contrib/nnvm_to_onnx-inl.h" #include "./exec_pass.h"
diff --git a/src/executor/trt_graph_executor.cc b/src/executor/trt_graph_executor.cc index ec35fee98..85ce16885 100644 --- a/src/executor/trt_graph_executor.cc +++ b/src/executor/trt_graph_executor.cc
@@ -21,7 +21,7 @@ #include "trt_graph_executor.h" -#include <onnx/onnx.pb.h> +#include <onnx/onnx_pb.h> #include #include "./onnx_to_tensorrt.h" #include "../operator/contrib/tensorrt-inl.h"
@@ -133,7 +133,7 @@ void TrtGraphExecutor::Init(nnvm::Symbol symbol, } auto trt_groups = GetTrtCompatibleSubsets(g, shared_buffer); - for (auto trt_group : trt_groups) { + for (const auto &trt_group : trt_groups) { if (trt_group.size() > 1) { g = ReplaceSubgraph(std::move(g), trt_group, shared_buffer); g = ReinitGraph(std::move(g), default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes,
@@ -142,7 +142,6 @@ void TrtGraphExecutor::Init(nnvm::Symbol symbol, } } - InitArguments(g.indexed_graph(), g.GetAttr<nnvm::ShapeVector>("shape"), g.GetAttr<nnvm::DTypeVector>("dtype"), g.GetAttr<StorageTypeVector>("storage_type"),
@@ -188,7 +187,7 @@ void TrtGraphExecutor::InitArguments(const nnvm::IndexedGraph& idx, const uint32_t eid = idx.entry_id(nid, 0); const TShape& inferred_shape = inferred_shapes[eid]; const int inferred_dtype = inferred_dtypes[eid]; - const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; + const auto inferred_stype = (NDArrayStorageType) inferred_stypes[eid]; const std::string& arg_name = idx[nid].source->attrs.name; // aux_states if (mutable_nodes.count(nid)) {
@@ -427,7 +426,7 @@ Executor *TrtGraphExecutor::TensorRTBind(nnvm::Symbol symbol, std::unordered_map<std::string, NDArray> *shared_buffer, Executor *shared_exec) { auto exec = new exec::TrtGraphExecutor(); - exec->Init(symbol, default_ctx, group2ctx, + exec->Init(std::move(symbol), default_ctx, group2ctx, in_arg_ctxes, arg_grad_ctxes, aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, grad_req_types, param_names,
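Two small idioms recur in the TensorRT changes above: iterating by const reference to avoid copying each subgraph set, and std::move-ing a by-value nnvm::Symbol into Init so the caller pays at most one copy. A minimal sketch of the sink-argument idiom with plain std::string standing in for the symbol:

#include <iostream>
#include <string>
#include <utility>

struct Graph {
  std::string repr;
  void Init(std::string symbol) {   // take by value...
    repr = std::move(symbol);       // ...and move into place: at most one copy overall
  }
};

int main() {
  std::string s = "conv -> relu -> fc";
  Graph g;
  g.Init(std::move(s));             // the caller opts in to the move
  std::cout << g.repr << '\n';
}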
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index f4047d1bd..58ec4e65b 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc
@@ -583,14 +583,11 @@ void CachedOp::StaticInitExec( } size_t bulk_size = idx.num_nodes(); - std::unordered_set<size_t> excludes; if (recording || keep_fwd) { bulk_size = keep_fwd ? config_.backward_bulk_size : config_.forward_bulk_size; - for (const auto& i : idx.outputs()) excludes.insert(idx.entry_id(i)); - for (const auto& i : idx.input_nodes()) excludes.insert(idx.entry_id(i, 0)); } - CreateEngineOpSeg(idx, default_ctx, start_nid, end_nid, bulk_size, excludes, + CreateEngineOpSeg(idx, default_ctx, start_nid, end_nid, bulk_size, state.execs, skip_plus_node, &state.opr_segs); }
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index dbfb2b581..552fa4d0c 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h
@@ -241,6 +241,12 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs, requested.push_back(ResourceManager::Get()->Request(ctx, req)); write_vars.push_back(requested.back().var); break; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + case ResourceRequest::kCuDNNDropoutDesc: + requested.push_back(ResourceManager::Get()->Request(ctx, req)); + write_vars.push_back(requested.back().var); + break; +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 default: LOG(FATAL) << "resource type not yet supported"; }
@@ -406,7 +412,7 @@ inline void PushFCompute(const FCompute& fn, fn(attrs, opctx, input_blobs, tmp_req, output_blobs); // post-fcompute fallback, cast to original storage type CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu); - if (is_gpu) { + if (is_gpu && !rctx.is_bulk) { rctx.get_stream<gpu>()->Wait(); } }, ctx, read_vars, write_vars, FnProperty::kNormal,
@@ -929,7 +935,6 @@ inline void CreateEngineOpSeg( const size_t start_nid, const size_t end_nid, const size_t bulk_size, - const std::unordered_set<size_t>& excludes, const std::vector<std::shared_ptr<exec::OpExecutor> >& execs, const std::vector<bool> skip_plus_node, std::vector<EngineOprSeg> *opr_segs) {
@@ -945,13 +950,6 @@ inline void CreateEngineOpSeg( // Stop at async nodes and invalid node (due to input/output is not allocated) bool stop = is_async || !valid || seg_execs.size() >= bulk_size; - for (size_t i = 0; i < node.inputs.size() && !stop; ++i) { - if (excludes.count(idx.entry_id(node.inputs[i]))) stop = true; - } - auto num_outputs = node.source->num_outputs(); - for (size_t i = 0; i < num_outputs && !stop; ++i) { - if (excludes.count(idx.entry_id(nid, i))) stop = true; - } // Create opr segment for previous nodes. if (stop && nid > seg_start) {
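The `is_gpu && !rctx.is_bulk` guard is the other half of the bulking change in threaded_engine.h: operators executed inside a bulk segment skip their per-op stream synchronization, and the segment issues a single Wait at the end. A toy sketch of the effect, with a hypothetical Stream type:

#include <iostream>

struct Stream { void Wait() { std::cout << "stream sync\n"; } };
struct RunContext { Stream* stream; bool is_bulk = false; };

// Each op synchronizes only when it is NOT part of a bulk segment.
void op(RunContext& ctx, int id) {
  std::cout << "op " << id << '\n';
  if (!ctx.is_bulk) ctx.stream->Wait();
}

int main() {
  Stream s;
  RunContext ctx{&s};
  ctx.is_bulk = true;
  for (int i = 0; i < 3; ++i) op(ctx, i);  // no per-op syncs inside the segment
  ctx.is_bulk = false;
  ctx.stream->Wait();                      // one sync for the whole segment
}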
diff --git a/src/initialize.cc b/src/initialize.cc index de7edd1b1..8d0e3c304 100644 --- a/src/initialize.cc +++ b/src/initialize.cc
@@ -44,7 +44,11 @@ class LibraryInitializer { LibraryInitializer() { dmlc::InitLogging("mxnet"); #if MXNET_USE_SIGNAL_HANDLER && DMLC_LOG_STACK_TRACE - signal(SIGSEGV, SegfaultLogger); + struct sigaction sa; + sigaction(SIGSEGV, NULL, &sa); + if (sa.sa_handler == NULL) { + signal(SIGSEGV, SegfaultLogger); + } #endif // disable openmp for multithreaded workers
diff --git a/src/io/image_aug_default.cc b/src/io/image_aug_default.cc index f31664709..cd06de2b2 100644 --- a/src/io/image_aug_default.cc +++ b/src/io/image_aug_default.cc
@@ -97,8 +97,6 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentParam> { - /*! \brief random seed for augmentations */ - dmlc::optional<int> seed_aug; // declare parameters DMLC_DECLARE_PARAMETER(DefaultImageAugmentParam) {
@@ -188,8 +186,6 @@ struct DefaultImageAugmentParam : public dmlc::Parameter<DefaultImageAugmentParam> - DMLC_DECLARE_FIELD(seed_aug).set_default(dmlc::optional<int>()) - .describe("Random seed for augmentations."); } };
@@ -208,9 +204,7 @@ std::vector<std::string> ListDefaultAugParams() { class DefaultImageAugmenter : public ImageAugmenter { public: // constructor - DefaultImageAugmenter() { - seed_init_state = false; - } + DefaultImageAugmenter() {} void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override { std::vector<std::pair<std::string, std::string> > kwargs_left; kwargs_left = param_.InitAllowUnknown(kwargs);
@@ -250,10 +244,6 @@ class DefaultImageAugmenter : public ImageAugmenter { } cv::Mat Process(const cv::Mat &src, std::vector<float> *label, common::RANDOM_ENGINE *prnd) override { - if (!seed_init_state && param_.seed_aug.has_value()) { - prnd->seed(param_.seed_aug.value()); - seed_init_state = true; - } using mshadow::index_t; bool is_cropped = false;
@@ -558,7 +548,6 @@ class DefaultImageAugmenter : public ImageAugmenter { DefaultImageAugmentParam param_; /*! \brief list of possible rotate angle */ std::vector<int> rotate_list_; - bool seed_init_state; }; ImageAugmenter* ImageAugmenter::Create(const std::string& name) {
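The initialize.cc change above installs the segfault logger only when no SIGSEGV handler is already registered, so embedding applications keep their own handlers. A minimal POSIX sketch of the query-then-install pattern:

#include <cstdio>
#include <signal.h>

extern "C" void SegfaultLogger(int) { /* print a stack trace, then abort */ }

int main() {
  struct sigaction sa;
  sigaction(SIGSEGV, nullptr, &sa);   // null action pointer: query only, nothing changes
  if (sa.sa_handler == nullptr) {     // SIG_DFL, i.e. nobody installed a handler yet
    signal(SIGSEGV, SegfaultLogger);
  }
  std::puts("handler installed only if the slot was free");
}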
diff --git a/src/io/image_io.cc b/src/io/image_io.cc index b3f7c40b2..44fcdb832 100644 --- a/src/io/image_io.cc +++ b/src/io/image_io.cc
@@ -38,6 +38,7 @@ #include #include "../operator/elemwise_op_common.h" +#include "../operator/image/resize-inl.h" #if MXNET_USE_OPENCV #include
@@ -285,19 +286,8 @@ inline void Imresize(const nnvm::NodeAttrs& attrs, const std::vector<TBlob> &inputs, const std::vector<OpReqType> &req, const std::vector<TBlob> &outputs) { -#if MXNET_USE_OPENCV - CHECK_NE(inputs[0].type_flag_, mshadow::kFloat16) << "imresize doesn't support fp16"; - const int DTYPE[] = {CV_32F, CV_64F, -1, CV_8U, CV_32S}; - int cv_type = CV_MAKETYPE(DTYPE[inputs[0].type_flag_], inputs[0].shape_[2]); const auto& param = nnvm::get<ResizeParam>(attrs.parsed); - cv::Mat buf(inputs[0].shape_[0], inputs[0].shape_[1], cv_type, inputs[0].dptr_); - cv::Mat dst(outputs[0].shape_[0], outputs[0].shape_[1], cv_type, outputs[0].dptr_); - cv::resize(buf, dst, cv::Size(param.w, param.h), 0, 0, param.interp); - CHECK(!dst.empty()); - CHECK_EQ(static_cast<void*>(dst.ptr()), outputs[0].dptr_); -#else - LOG(FATAL) << "Build with USE_OPENCV=1 for image io."; -#endif // MXNET_USE_OPENCV + op::image::ResizeImpl(inputs, outputs, param.h, param.w, param.interp); }
diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h index a2324a4b5..10cd8ab4e 100644 --- a/src/io/image_iter_common.h +++ b/src/io/image_iter_common.h
@@ -125,12 +125,16 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> { bool verbose; /*! \brief partition the data into multiple parts */ int num_parts; - /*! \brief the index of the part will read*/ + /*! \brief the index of the part will read */ int part_index; - /*! \brief the size of a shuffle chunk*/ + /*! \brief device id used to create context for internal NDArray */ + int device_id; + /*! \brief the size of a shuffle chunk */ size_t shuffle_chunk_size; - /*! \brief the seed for chunk shuffling*/ + /*! \brief the seed for chunk shuffling */ int shuffle_chunk_seed; + /*! \brief random seed for augmentations */ + dmlc::optional<int> seed_aug; // declare parameters DMLC_DECLARE_PARAMETER(ImageRecParserParam) {
@@ -161,10 +165,17 @@ struct ImageRecParserParam : public dmlc::Parameter<ImageRecParserParam> { .describe("Virtually partition the data into these many parts."); DMLC_DECLARE_FIELD(part_index).set_default(0) .describe("The *i*-th virtual partition to be read."); + DMLC_DECLARE_FIELD(device_id).set_default(0) + .describe("The device id used to create context for internal NDArray. "\ + "Setting device_id to -1 will create Context::CPU(0). Setting " + "device_id to valid positive device id will create " + "Context::CPUPinned(device_id). Default is 0."); DMLC_DECLARE_FIELD(shuffle_chunk_size).set_default(0) .describe("The data shuffle buffer size in MB. Only valid if shuffle is true."); DMLC_DECLARE_FIELD(shuffle_chunk_seed).set_default(0) .describe("The random seed for shuffling"); + DMLC_DECLARE_FIELD(seed_aug).set_default(dmlc::optional<int>()) + .describe("Random seed for augmentations."); } };
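seed_aug is deliberately an optional<int> rather than an int with a magic default: "unset" and "seeded with 0" are different behaviors. A small sketch of the same pattern using std::optional in place of dmlc::optional (the helper name is illustrative):

#include <iostream>
#include <optional>
#include <random>

// Reseed deterministically per record when a seed is supplied; otherwise
// leave the engine's state alone, exactly like the iterator change below.
void maybe_reseed(std::mt19937* rng, std::optional<int> seed_aug, int record_idx) {
  if (seed_aug.has_value()) {
    rng->seed(record_idx + seed_aug.value());  // per-record seed => order-independent
  }
}

int main() {
  std::mt19937 rng(42);
  maybe_reseed(&rng, std::optional<int>{7}, /*record_idx=*/3);
  std::cout << rng() << '\n';  // identical on every run for record 3
}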
diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc index b567c7297..00c381986 100644 --- a/src/io/iter_image_recordio_2.cc +++ b/src/io/iter_image_recordio_2.cc
@@ -285,9 +285,14 @@ inline bool ImageRecordIOParser2<DType>::ParseNext(DataBatch *out) { shape_vec.push_back(param_.label_width); TShape label_shape(shape_vec.begin(), shape_vec.end()); - out->data.at(0) = NDArray(data_shape, Context::CPU(0), false, + auto ctx = Context::CPU(0); + auto dev_id = param_.device_id; + if (dev_id != -1) { + ctx = Context::CPUPinned(dev_id); + } + out->data.at(0) = NDArray(data_shape, ctx, false, mshadow::DataType<DType>::kFlag); - out->data.at(1) = NDArray(label_shape, Context::CPU(0), false, + out->data.at(1) = NDArray(label_shape, ctx, false, mshadow::DataType<DType>::kFlag); unit_size_[0] = param_.data_shape.Size(); unit_size_[1] = param_.label_width;
@@ -519,6 +524,13 @@ inline size_t ImageRecordIOParser2<DType>::ParseChunk(DType* data_dptr, real_t* cv::Mat res; rec.Load(blob.dptr, blob.size); cv::Mat buf(1, rec.content_size, CV_8U, rec.content); + + // If augmentation seed is supplied + // Re-seed RNG to guarantee reproducible results + if (param_.seed_aug.has_value()) { + prnds_[tid]->seed(idx + param_.seed_aug.value() + kRandMagic); + } + switch (param_.data_shape[0]) { case 1: #if MXNET_USE_LIBJPEG_TURBO
diff --git a/src/kvstore/kvstore_nccl.h b/src/kvstore/kvstore_nccl.h index d0f397cc6..a4ba53391 100644 --- a/src/kvstore/kvstore_nccl.h +++ b/src/kvstore/kvstore_nccl.h
@@ -342,7 +342,7 @@ class KVStoreNCCL : public KVStoreLocal { } else { auto& buf = merge_buf_[key]; int root = src.ctx().dev_id; - assert(root == buf.ctx().dev_id); + assert(root == buf.merged.ctx().dev_id); root_id = FindRootId(dst, root); // Check whether we got the same set of devices
diff --git a/src/mxfeatures.cc b/src/mxfeatures.cc new file mode 100644 index 000000000..7a435d7c8 --- /dev/null +++ b/src/mxfeatures.cc
@@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file mxfeatures.cc + * \brief check MXNet features including compile time support + */ + +#include "mxnet/mxfeatures.h" +#include <bitset> + +namespace mxnet { +namespace features { + +class FeatureSet { + public: + FeatureSet() : + feature_bits() { + // GPU + feature_bits.set(CUDA, MXNET_USE_CUDA); + feature_bits.set(CUDNN, MXNET_USE_CUDNN); + feature_bits.set(NCCL, MXNET_USE_NCCL); + feature_bits.set(CUDA_RTC, MXNET_ENABLE_CUDA_RTC); + feature_bits.set(TENSORRT, MXNET_USE_TENSORRT); + + // Check flags for example with gcc -msse3 -mavx2 -dM -E - < /dev/null | egrep "SSE|AVX" +#if __SSE__ + feature_bits.set(CPU_SSE); +#endif +#if __SSE2__ + feature_bits.set(CPU_SSE2); +#endif +#if __SSE3__ + feature_bits.set(CPU_SSE3); +#endif +#if __SSE4_1__ + feature_bits.set(CPU_SSE4_1); +#endif +#if __SSE4_2__ + feature_bits.set(CPU_SSE4_2); +#endif +#if __SSE4A__ + feature_bits.set(CPU_SSE4A); +#endif +#if __AVX__ + feature_bits.set(CPU_AVX); +#endif +#if __AVX2__ + feature_bits.set(CPU_AVX2); +#endif + + // CPU + feature_bits.set(OPENMP, MXNET_USE_OPENMP); + feature_bits.set(F16C, MXNET_USE_F16C); + + // Math + feature_bits.set(BLAS_OPEN, MXNET_USE_BLAS_OPEN); + feature_bits.set(BLAS_ATLAS, MXNET_USE_BLAS_ATLAS); + feature_bits.set(BLAS_MKL, MXNET_USE_BLAS_MKL); + feature_bits.set(BLAS_APPLE, MXNET_USE_BLAS_APPLE); + feature_bits.set(LAPACK, MXNET_USE_LAPACK); + feature_bits.set(MKLDNN, MXNET_USE_MKLDNN); + + // Image + feature_bits.set(OPENCV, MXNET_USE_OPENCV); + + // Misc + feature_bits.set(CAFFE, MXNET_USE_CAFFE); + feature_bits.set(DIST_KVSTORE, MXNET_USE_DIST_KVSTORE); + feature_bits.set(SIGNAL_HANDLER, MXNET_USE_SIGNAL_HANDLER); +#ifndef NDEBUG + feature_bits.set(DEBUG); +#endif + +#if USE_JEMALLOC == 1 + feature_bits.set(JEMALLOC); +#endif + } + bool is_enabled(const unsigned feat) const { + CHECK_LT(feat, MAX_FEATURES); + return feature_bits.test(feat); + } + + private: + std::bitset<MAX_FEATURES> feature_bits; +}; + +static FeatureSet featureSet; + +bool is_enabled(const unsigned feat) { + return featureSet.is_enabled(feat); +} + +} // namespace features +} // namespace mxnet
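The new mxfeatures.cc snapshots compile-time flags into a bitset so callers can query at runtime what the library was built with. A usage sketch, only buildable inside the MXNet tree; the enum constants and is_enabled come from the mxnet/mxfeatures.h header this file includes:

#include "mxnet/mxfeatures.h"
#include <iostream>

int main() {
  using namespace mxnet::features;
  // Each query reads one bit set in the FeatureSet constructor above.
  std::cout << "CUDA:   " << is_enabled(CUDA)   << '\n'
            << "MKLDNN: " << is_enabled(MKLDNN) << '\n'
            << "OPENCV: " << is_enabled(OPENCV) << '\n';
}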
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 6c4ce9c75..42fb66370 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc
@@ -170,16 +170,28 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { #if MXNET_USE_MKLDNN == 1 -NDArray::NDArray(const mkldnn::memory *mkldnn_mem, bool static_data) +NDArray::NDArray(mkldnn::memory::primitive_desc mem_pd) : storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { - auto mem_pd = mkldnn_mem->get_primitive_desc(); auto mem_desc = mem_pd.desc(); shape_ = TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims); dtype_ = get_mxnet_type(mem_desc.data.data_type); - auto data = TBlob(mkldnn_mem->get_data_handle(), shape_, cpu::kDevMask, dtype_); - ptr_ = std::make_shared<Chunk>(data, 0); + ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_); + ptr_->CheckAndAlloc(mem_pd.get_size()); ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mem_pd, ptr_->shandle.dptr); - ptr_->static_data = static_data; +} + +NDArray::NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem) + : storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { + auto mem_pd = mkldnn_mem->get_primitive_desc(); + auto mem_desc = mem_pd.desc(); + shape_ = TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims); + dtype_ = get_mxnet_type(mem_desc.data.data_type); + ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_); + ptr_->shandle.dptr = mkldnn_mem->get_data_handle(); + ptr_->shandle.size = mem_pd.get_size(); + ptr_->delay_alloc = false; + ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem); + ptr_->static_data = true; } NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const {
@@ -712,19 +724,16 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc & return ptr_->mkl_mem_->GetRaw(); } -void NDArray::UpdateMKLDNNMemDesc() { +void NDArray::UpdateMKLDNNMemDesc(mkldnn::memory::format format) { const mkldnn::memory *mem = GetMKLDNNData(); auto mem_desc = mem->get_primitive_desc().desc(); auto this_dtype = get_mkldnn_type(dtype()); - if (this_dtype != mem_desc.data.data_type) { - mkldnn::memory::desc data_md( - mkldnn::memory::dims(mem_desc.data.dims, - mem_desc.data.dims + mem_desc.data.ndims), - this_dtype, static_cast<mkldnn::memory::format>(mem_desc.data.format)); - mkldnn::memory::primitive_desc pd(data_md, CpuEngine::Get()->get_engine()); - ptr_->mkl_mem_.reset(new MKLDNNMemory(pd, ptr_->shandle.dptr)); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); - } + mkldnn::memory::desc data_md( + mkldnn::memory::dims(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims), + this_dtype, format); + mkldnn::memory::primitive_desc pd(data_md, CpuEngine::Get()->get_engine()); + ptr_->mkl_mem_.reset(new MKLDNNMemory(pd, ptr_->shandle.dptr)); + MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem()); } #endif
diff --git a/src/operator/contrib/bilinear_resize-inl.cuh b/src/operator/contrib/bilinear_resize-inl.cuh new file mode 100644 index 000000000..b8dacb1c4 --- /dev/null +++ b/src/operator/contrib/bilinear_resize-inl.cuh
@@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * Copyright (c) 2019 by Contributors + * \file bilinear_resize-inl.cuh + * \brief bilinear resize operator cuda implementation + * \author Hang Zhang, Jake Lee +*/ + +#ifndef MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ +#define MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ + +#include +#include + +namespace mxnet { +namespace op { + +using namespace mshadow; + +enum ImageLayout { + HWC, + NHWC, + NCHW +}; + +template<typename In, typename Out> +struct ScalarConvert { + static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; } +}; + +// The maximum number of threads in a block +static const unsigned MAX_BLOCK_SIZE = 512U; + +// Number of threads in a block given an input size up to MAX_BLOCK_SIZE +static unsigned getNumThreads(int nElem, const bool smaller) { + unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE}; + const int maxi = smaller ? 4 : 5; + for (int i = 0; i != maxi; ++i) { + if (static_cast<unsigned>(nElem) <= threadSizes[i]) { + return threadSizes[i]; + } + } + return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE; +} + +// caffe_gpu_interp2_kernel overloading with Tensor<xpu, 3, Dtype> +template<typename xpu, typename Dtype, typename Acctype> +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rheight, const Acctype rwidth, + const Tensor<xpu, 3, Dtype> data1, + Tensor<xpu, 3, Dtype> data2, + ImageLayout layout) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + const int channels = data1.size(2); + const int height1 = data1.size(0); + const int width1 = data1.size(1); + const int height2 = data2.size(0); + const int width2 = data2.size(1); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + for (int c = 0; c < channels; ++c) { + const Dtype val = data1[h1][w1][c]; + data2[h2][w2][c] = val; + } + return; + } + // + const Acctype h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + for (int c = 0; c < channels; ++c) { + const Acctype val = h0lambda * (w0lambda * data1[h1][w1][c] + + w1lambda * data1[h1][w1+w1p][c]) + + h1lambda * (w0lambda * data1[h1+h1p][w1][c] + + w1lambda * data1[h1+h1p][w1+w1p][c]); + data2[h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val); + } + } +} + +// caffe_gpu_interp2_kernel overloading with Tensor<xpu, 4, Dtype> +template<typename xpu, typename Dtype, typename Acctype> +__global__ void caffe_gpu_interp2_kernel(const int n, + const Acctype rheight, const Acctype rwidth, + const Tensor<xpu, 4, Dtype> data1, + Tensor<xpu, 4, Dtype> data2, + ImageLayout layout) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int batch_size = (layout == NHWC) ? data1.size(0) : data1.size(0); + int channels = (layout == NHWC) ? data1.size(3) : data1.size(1); + int height1 = (layout == NHWC) ? data1.size(1) : data1.size(2); + int width1 = (layout == NHWC) ? data1.size(2) : data1.size(3); + int height2 = (layout == NHWC) ? data2.size(1) : data2.size(2); + int width2 = (layout == NHWC) ? data2.size(2): data2.size(3); + + if (index < n) { + const int w2 = index % width2; // 0:width2-1 + const int h2 = index / width2; // 0:height2-1 + // special case: just copy + if (height1 == height2 && width1 == width2) { + const int h1 = h2; + const int w1 = w2; + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < channels; ++c) { + if (layout == NHWC) { + const Dtype val = data1[n][h1][w1][c]; + data2[n][h2][w2][c] = val; + } else { + const Dtype val = data1[n][c][h1][w1]; + data2[n][c][h2][w2] = val; + } + } + } + return; + } + // + const Acctype h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const Acctype h1lambda = h1r - h1; + const Acctype h0lambda = Acctype(1) - h1lambda; + // + const Acctype w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const Acctype w1lambda = w1r - w1; + const Acctype w0lambda = Acctype(1) - w1lambda; + + for (auto n = 0; n < batch_size; ++n) { + for (int c = 0; c < channels; ++c) { + if (layout == NHWC) { + const Acctype val = h0lambda * (w0lambda * data1[n][h1][w1][c] + + w1lambda * data1[n][h1][w1+w1p][c]) + + h1lambda * (w0lambda * data1[n][h1+h1p][w1][c] + + w1lambda * data1[n][h1+h1p][w1+w1p][c]); + data2[n][h2][w2][c] = ScalarConvert<Acctype, Dtype>::to(val); + } else { + const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] + + w1lambda * data1[n][c][h1][w1+w1p]) + + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] + + w1lambda * data1[n][c][h1+h1p][w1+w1p]); + data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val); + } + } + } + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_BILINEAR_RESIZE_CUH_ \ No newline at end of file
diff --git a/src/operator/contrib/bilinear_resize-inl.h b/src/operator/contrib/bilinear_resize-inl.h index ff3f794d1..5a653d8a1 100644 --- a/src/operator/contrib/bilinear_resize-inl.h +++ b/src/operator/contrib/bilinear_resize-inl.h
@@ -50,11 +50,17 @@ namespace op { struct BilinearSampleParam : public dmlc::Parameter<BilinearSampleParam> { int height; int width; + dmlc::optional<float> scale_height; + dmlc::optional<float> scale_width; DMLC_DECLARE_PARAMETER(BilinearSampleParam) { - DMLC_DECLARE_FIELD(height).set_range(1, 10000) - .describe("output height (required)"); - DMLC_DECLARE_FIELD(width).set_range(1, 10000) - .describe("output width (required)"); + DMLC_DECLARE_FIELD(height).set_default(1).set_range(1, 10000) + .describe("output height (required, but ignored if scale_height is defined)"); + DMLC_DECLARE_FIELD(width).set_default(1).set_range(1, 10000) + .describe("output width (required, but ignored if scale_width is defined)"); + DMLC_DECLARE_FIELD(scale_height).set_default(dmlc::optional<float>()) + .describe("sampling scale of the height (optional, ignores height if defined)"); + DMLC_DECLARE_FIELD(scale_width).set_default(dmlc::optional<float>()) + .describe("sampling scale of the width (optional, ignores width if defined)"); } };
@@ -129,8 +135,18 @@ static bool BilinearSampleOpInferShape(const nnvm::NodeAttrs& attrs, const BilinearSampleParam& param = nnvm::get<BilinearSampleParam>(attrs.parsed); TShape dshape(in_shape->at(0)); if (dshape.ndim() == 0) return false; - dshape[2] = param.height; - dshape[3] = param.width; + if (param.scale_height.has_value()) { + dshape[2] = static_cast<int>(param.scale_height.value() * in_shape->at(0)[2]); + } else { + dshape[2] = param.height; + } + + if (param.scale_width.has_value()) { + dshape[3] = static_cast<int>(param.scale_width.value() * in_shape->at(0)[3]); + } else { + dshape[3] = param.width; + } + out_shape->clear(); out_shape->push_back(dshape); return true;
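The kernels above implement standard bilinear interpolation. In the code's notation (h1p and w1p are the +1 offsets clamped at the border, h0lambda = 1 - h1lambda), each output pixel is

\[
o_{h_2,w_2} \;=\; \lambda_h^0\bigl(\lambda_w^0\, d_{h_1,\,w_1} + \lambda_w^1\, d_{h_1,\,w_1+w_{1p}}\bigr) \;+\; \lambda_h^1\bigl(\lambda_w^0\, d_{h_1+h_{1p},\,w_1} + \lambda_w^1\, d_{h_1+h_{1p},\,w_1+w_{1p}}\bigr)
\]

where \(h_1 = \lfloor r_h\, h_2 \rfloor\), \(\lambda_h^1 = r_h\, h_2 - h_1\), \(\lambda_h^0 = 1 - \lambda_h^1\), and the width terms are built the same way from \(r_w\). When input and output sizes match, every \(\lambda^1\) is zero and the computation degenerates to the plain copy handled by the special case at the top of each kernel.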
diff --git a/src/operator/contrib/bilinear_resize.cu b/src/operator/contrib/bilinear_resize.cu index f01c9c2fa..b0a4c4b31 100644 --- a/src/operator/contrib/bilinear_resize.cu +++ b/src/operator/contrib/bilinear_resize.cu
@@ -25,86 +25,13 @@ #include #include #include "bilinear_resize-inl.h" +#include "bilinear_resize-inl.cuh" namespace mxnet { namespace op { using namespace mshadow; -template<typename In, typename Out> -struct ScalarConvert { - static __host__ __device__ __forceinline__ Out to(const In v) { return (Out) v; } -}; - - -// The maximum number of threads in a block -static const unsigned MAX_BLOCK_SIZE = 512U; - -// Number of threads in a block given an input size up to MAX_BLOCK_SIZE -static unsigned getNumThreads(int nElem, const bool smaller) { - unsigned threadSizes[5] = {32, 64, 128, 256, MAX_BLOCK_SIZE}; - const int maxi = smaller ? 4 : 5; - for (int i = 0; i != maxi; ++i) { - if (static_cast<unsigned>(nElem) <= threadSizes[i]) { - return threadSizes[i]; - } - } - return smaller ? (MAX_BLOCK_SIZE >> 1) : MAX_BLOCK_SIZE; -} - -template<typename xpu, typename Dtype, typename Acctype> -__global__ void caffe_gpu_interp2_kernel(const int n, - const Acctype rheight, const Acctype rwidth, - const Tensor<xpu, 4, Dtype> data1, - Tensor<xpu, 4, Dtype> data2) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - const int batchsize = data1.size(0); - const int channels = data1.size(1); - const int height1 = data1.size(2); - const int width1 = data1.size(3); - const int height2 = data2.size(2); - const int width2 = data2.size(3); - - if (index < n) { - const int w2 = index % width2; // 0:width2-1 - const int h2 = index / width2; // 0:height2-1 - // special case: just copy - if (height1 == height2 && width1 == width2) { - const int h1 = h2; - const int w1 = w2; - for (int n = 0; n < batchsize ; n++) { - for (int c = 0; c < channels; ++c) { - const Dtype val = data1[n][c][h1][w1]; - data2[n][c][h2][w2] = val; - } - } - return; - } - // - const Acctype h1r = rheight * h2; - const int h1 = h1r; - const int h1p = (h1 < height1 - 1) ? 1 : 0; - const Acctype h1lambda = h1r - h1; - const Acctype h0lambda = Acctype(1) - h1lambda; - // - const Acctype w1r = rwidth * w2; - const int w1 = w1r; - const int w1p = (w1 < width1 - 1) ? 1 : 0; - const Acctype w1lambda = w1r - w1; - const Acctype w0lambda = Acctype(1) - w1lambda; - // - for (int n = 0; n < batchsize ; n++) { - for (int c = 0; c < channels; ++c) { - const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1] - + w1lambda * data1[n][c][h1][w1+w1p]) - + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1] - + w1lambda * data1[n][c][h1+h1p][w1+w1p]); - data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val); - } - } - } -} - // Backward (adjoint) operation 1 <- 2 (accumulates) template<typename Dtype, typename Acctype> __global__ void caffe_gpu_interp2_kernel_backward(const int n,
@@ -181,9 +108,10 @@ void SpatialUpSamplingBilinearUpdateOutput(mshadow::Stream<gpu> *s, dim3 blocks(static_cast<int>(num_kernels / num_threads) + 1); dim3 threads(num_threads); cudaStream_t stream = mshadow::Stream<gpu>::GetStream(s); + ImageLayout layout = NCHW; caffe_gpu_interp2_kernel<xpu, DType, AccReal> <<<blocks, threads, 0, stream>>>( - num_kernels, rheight, rwidth, idata, odata); + num_kernels, rheight, rwidth, idata, odata, layout); MSHADOW_CUDA_POST_KERNEL_CHECK(SpatialUpSamplingBilinearUpdateOutput); }
@@ -215,6 +143,5 @@ NNVM_REGISTER_OP(_contrib_BilinearResize2D) NNVM_REGISTER_OP(_backward_contrib_BilinearResize2D) .set_attr<FCompute>("FCompute<gpu>", BilinearSampleOpBackward<gpu>); - } // namespace op } // namespace mxnet
diff --git a/src/operator/contrib/erfinv-inl.h b/src/operator/contrib/erfinv-inl.h new file mode 100644 index 000000000..8d718ade6 --- /dev/null +++ b/src/operator/contrib/erfinv-inl.h
@@ -0,0 +1,105 @@ +/* + * Copyright (c) 2014 Indiana University + * All rights reserved. + * Written by Prof. Gary L. Pavlis, Dept. of Geol. Sci., + * Indiana University, Bloomington, IN + * This software is licensed under the New BSD license: + * Redistribution and use in source and binary forms, + * with or without modification, are permitted provided + * that the following conditions are met: + * Redistributions of source code must retain the above + * copyright notice, this list of conditions and the + * following disclaimer. + * Redistributions in binary form must reproduce the + * above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or + * other materials provided with the distribution. + * Neither the name of Indiana University nor + * the names of its contributors may be used to endorse + * or promote products derived from this software without + * specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/* + * The next function is taken from + * /~https://github.com/antelopeusersgroup/antelope_contrib/blob/master/lib/location/libgenloc/erfinv.c. + * Output was modified to be inf or -inf when input is 1 or -1. + */ +#ifndef MXNET_OPERATOR_CONTRIB_ERFINV_INL_H_ +#define MXNET_OPERATOR_CONTRIB_ERFINV_INL_H_ + +#define _USE_MATH_DEFINES + +#include <mxnet/base.h> +#include <limits> +#include "math.h" + +namespace mxnet { +namespace op { +namespace mshadow_op { + +/*! \brief inverse gauss error function */ +struct erfinv : public mxnet_op::tunable { + template<typename DType> + MSHADOW_XINLINE static DType Map(DType v) { + /* Function to calculate inverse error function. Rational approximation + is used to generate an initial approximation, which is then improved to + full accuracy by two steps of Newton's method. Code is a direct + translation of the erfinv m file in matlab version 2.0. + Author: Gary L. Pavlis, Indiana University + Date: February 1996 + */ + const double central_range = 0.7; + double y = static_cast<double>(v); + double y_fab = std::fabs(y); + /*working variables */ + double x = 0.0; + double z, num, dem; + /* coefficients in rational expansion */ + double a[4]={ 0.886226899, -1.645349621, 0.914624893, -0.140543331}; + double b[4]={-2.118377725, 1.442710462, -0.329097515, 0.012229801}; + double c[4]={-1.970840454, -1.624906493, 3.429567803, 1.641345311}; + double d[2]={ 3.543889200, 1.637067800}; + if (y_fab > 1.0) { + /* This needs IEEE constant*/ + return DType(std::numeric_limits<double>::quiet_NaN()); + } else if (y_fab == 1.0) { + return DType((std::copysign(1.0, y))*std::numeric_limits<double>::infinity()); + } else if (y_fab <= central_range) { + z = y*y; + num = (((a[3]*z + a[2])*z + a[1])*z + a[0]); + dem = ((((b[3]*z + b[2])*z + b[1])*z +b[0])*z + 1.0); + x = y*num/dem; + } else { + z = std::sqrt(-std::log((1.0-y_fab)/2.0)); + num = ((c[3]*z + c[2])*z + c[1])*z + c[0]; + dem = (d[1]*z + d[0])*z + 1.0; + x = (std::copysign(1.0, y))*num/dem; + } + /* Two steps of Newton-Raphson correction */ + x = x - (std::erf(x) - y)/((2.0/std::sqrt(M_PI))*std::exp(-x*x)); + x = x - (std::erf(x) - y)/((2.0/std::sqrt(M_PI))*std::exp(-x*x)); + + return DType(x); + } +}; + +} // namespace mshadow_op +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONTRIB_ERFINV_INL_H_
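The two correction steps at the end are Newton iterations on \(f(x) = \operatorname{erf}(x) - y\), using \(\operatorname{erf}'(x) = \tfrac{2}{\sqrt{\pi}}\,e^{-x^2}\):

\[
x_{n+1} \;=\; x_n \;-\; \frac{\operatorname{erf}(x_n) - y}{\tfrac{2}{\sqrt{\pi}}\, e^{-x_n^2}}
\]

The rational approximation supplies a starting point close enough that two steps reach full double precision.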
diff --git a/src/operator/contrib/gradient_multiplier_op.cc b/src/operator/contrib/gradient_multiplier_op.cc new file mode 100644 index 000000000..47f891ef8 --- /dev/null +++ b/src/operator/contrib/gradient_multiplier_op.cc
@@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file gradient_multiplier_op.cc + * \brief + * \author Istvan Fehervari +*/ +#include "../tensor/elemwise_unary_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector<int> *in_attrs, + std::vector<int> *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const auto in_stype = in_attrs->at(0); + auto &out_stype = out_attrs->at(0); + bool dispatched = false; + if (!dispatched && (in_stype == kDefaultStorage)) { + // dense -> dense + dispatched = storage_type_assign(&out_stype, kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); + } + if (!dispatched && in_stype == kRowSparseStorage) { + // row sparse -> row sparse + dispatched = storage_type_assign(&out_stype, kRowSparseStorage, + dispatch_mode, DispatchMode::kFComputeEx); + // FComputeEx can handle dns output on cpu, too + if (dev_mask == cpu::kDevMask && out_stype == kDefaultStorage) { + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + dispatched = true; + } + } + if (!dispatched && in_stype == kCSRStorage) { + // csr -> csr + dispatched = storage_type_assign(&out_stype, kCSRStorage, + dispatch_mode, DispatchMode::kFComputeEx); + // FComputeEx can handle dns output on cpu, too + if (dev_mask == cpu::kDevMask && out_stype == kDefaultStorage) { + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + dispatched = true; + } + } + if (!dispatched) { + dispatched = dispatch_fallback(out_attrs, dispatch_mode); + } + return dispatched; +} + +MXNET_OPERATOR_REGISTER_UNARY(_contrib_gradientmultiplier) +.describe(R"code(This operator implements the gradient multiplier function. +In forward pass it acts as an identity transform. During backpropagation it +multiplies the gradient from the subsequent level by a scalar factor lambda and passes it to +the preceding layer. +)code" ADD_FILELINE) +.set_attr_parser([](NodeAttrs* attrs) { + attrs->parsed = std::stod(attrs->dict["scalar"]); + }) +.set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) +.set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>) +.set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>) +.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_contrib_backward_gradientmultiplier"}) +.set_attr<nnvm::FInplaceIdentity>("FInplaceIdentity", + [](const NodeAttrs& attrs){ + return std::vector<bool>{true}; + }) +.add_argument("scalar", "float", "lambda multiplier"); + +MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_contrib_backward_gradientmultiplier) +.set_attr<nnvm::TIsBackward>("TIsBackward", true) +.set_attr<FInferStorageType>("FInferStorageType", BinaryScalarStorageType) +.set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Compute<cpu, mshadow_op::mul>) +.set_attr<FComputeEx>("FComputeEx<cpu>", BinaryScalarOp::ComputeEx<cpu, mshadow_op::mul>);

} // namespace op
} // namespace mxnet
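The operator's semantics in equations: the forward pass is the identity, and the backward pass scales the incoming gradient by the scalar \(\lambda\),

\[
y = x, \qquad \frac{\partial L}{\partial x} = \lambda\,\frac{\partial L}{\partial y}.
\]

With \(\lambda < 0\) this is the gradient-reversal construction used in domain-adversarial training, which is the usual motivation for such a layer.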
diff --git a/src/operator/contrib/gradient_multiplier_op.cu b/src/operator/contrib/gradient_multiplier_op.cu new file mode 100644 index 000000000..7159cea98 --- /dev/null +++ b/src/operator/contrib/gradient_multiplier_op.cu
@@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2018 by Contributors + * \file gradient_multiplier_op.cu + * \brief + * \author Istvan Fehervari +*/ +#include "../tensor/elemwise_unary_op.h" +#include "../tensor/elemwise_binary_scalar_op.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_gradientmultiplier) +.set_attr<FComputeEx>("FComputeEx<gpu>", UnaryOp::IdentityComputeEx<gpu>) +.set_attr<FCompute>("FCompute<gpu>", UnaryOp::IdentityCompute<gpu>); + +NNVM_REGISTER_OP(_contrib_backward_gradientmultiplier) +.set_attr<FCompute>("FCompute<gpu>", BinaryScalarOp::Compute<gpu, mshadow_op::mul>) +.set_attr<FComputeEx>("FComputeEx<gpu>", BinaryScalarOp::ComputeEx<gpu, mshadow_op::mul>); + +} // namespace op +} // namespace mxnet
diff --git a/src/operator/contrib/nnvm_to_onnx-inl.h b/src/operator/contrib/nnvm_to_onnx-inl.h index 011ffe6b7..0994f7e63 100644 --- a/src/operator/contrib/nnvm_to_onnx-inl.h +++ b/src/operator/contrib/nnvm_to_onnx-inl.h
@@ -37,7 +37,7 @@ #include #include -#include <onnx/onnx.pb.h> +#include <onnx/onnx_pb.h> #include #include
@@ -70,7 +70,7 @@ struct ONNXParam : public dmlc::Parameter<ONNXParam> { nnvm_to_onnx::InferenceMap_t output_map; ::onnx::ModelProto onnx_pb_graph; - ONNXParam() {} + ONNXParam() = default; ONNXParam(const ::onnx::ModelProto& onnx_graph, const nnvm_to_onnx::InferenceMap_t& input_map,
@@ -104,14 +104,14 @@ std::unordered_map GetOutputLookup(const nnvm::IndexedGraph& ig); void ConvertPlaceholder( const std::string& node_name, const std::unordered_map<std::string, TShape>& placeholder_shapes, - GraphProto* const graph_proto); + GraphProto* graph_proto); -void ConvertConstant(GraphProto* const graph_proto, + void ConvertConstant(GraphProto* graph_proto, const std::string& node_name, - std::unordered_map<std::string, NDArray>* const shared_buffer); + std::unordered_map<std::string, NDArray>* shared_buffer); -void ConvertOutput(op::nnvm_to_onnx::InferenceMap_t* const trt_output_map, - GraphProto* const graph_proto, +void ConvertOutput(op::nnvm_to_onnx::InferenceMap_t* trt_output_map, + GraphProto* graph_proto, const std::unordered_map::iterator& out_iter, const std::string& node_name, const nnvm::Graph& g,
@@ -169,7 +169,7 @@ void ConvertElementwiseAdd(NodeProto *node_proto, ONNXParam ConvertNnvmGraphToOnnx( const nnvm::Graph &g, - std::unordered_map<std::string, NDArray> *const shared_buffer); + std::unordered_map<std::string, NDArray>* shared_buffer); static const std::unordered_map converter_map = { {"Convolution", ConvertConvolution},
diff --git a/src/operator/contrib/nnvm_to_onnx.cc b/src/operator/contrib/nnvm_to_onnx.cc index 784384e94..58a465455 100644 --- a/src/operator/contrib/nnvm_to_onnx.cc +++ b/src/operator/contrib/nnvm_to_onnx.cc
@@ -62,15 +62,22 @@ namespace nnvm_to_onnx { op::ONNXParam ConvertNnvmGraphToOnnx( const nnvm::Graph& g, std::unordered_map<std::string, NDArray>* const shared_buffer) { - op::ONNXParam onnx_param; - op::nnvm_to_onnx::NameToIdx_t onnx_input_map; - op::nnvm_to_onnx::InferenceMap_t onnx_output_map; + + static std::atomic_ulong subgraph_count = { 0 }; + + op::ONNXParam onnx_param; + op::nnvm_to_onnx::NameToIdx_t onnx_input_map; + op::nnvm_to_onnx::InferenceMap_t onnx_output_map; const nnvm::IndexedGraph& ig = g.indexed_graph(); const auto& storage_types = g.GetAttr<StorageTypeVector>("storage_type"); const auto& dtypes = g.GetAttr<DTypeVector>("dtype"); const auto& shape_inputs = g.GetAttr<ShapeVector>("shape_inputs"); + // TODO(kellens): At the moment this check always passes no matter the weight dtypes used in your + // graph. We should first iterate over datatypes by name and ensure they're valid types + // (fp16 or fp32) and that they're uniform. Then ensure later conversions set tensor types + // correctly in ONNX. for (auto& e : storage_types) { if (e != mshadow::kFloat32) { LOG(FATAL) << "ONNX converter does not support types other than float32 "
@@ -79,9 +86,23 @@ op::ONNXParam ConvertNnvmGraphToOnnx( } ModelProto model_proto; - // Need to determine IR versions and features to support - model_proto.set_ir_version(static_cast<int64>(2)); + + // We're currently serializing our models in ONNX 3, opset 8 as it is best supported by the + // currently linked version of the onnx-tensorrt library. + // More information on ONNX versions and opsets can be found at: + // /~https://github.com/onnx/onnx/blob/master/docs/IR.md + + auto opset_proto = model_proto.add_opset_import(); + const int64 onnx_opset = 8; + const int64 onnx_major_version = 3; + + // Declare our ONNX versions in our protobuf model. + opset_proto->set_version(onnx_opset); + model_proto.set_ir_version(onnx_major_version); + GraphProto* graph_proto = model_proto.mutable_graph(); + auto subgraph_name_id = subgraph_count.fetch_add(1); + graph_proto->set_name("MXNetTRTSubgraph" + std::to_string(subgraph_name_id)); std::unordered_map<std::string, TShape> placeholder_shapes = GetPlaceholderShapes(shape_inputs, ig);
@@ -176,6 +197,20 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, // const bool no_bias = conv_param.no_bias; const dmlc::optional<int> layout = conv_param.layout; + // dilations + AttributeProto* const dilations = node_proto->add_attribute(); + dilations->set_name("dilations"); + dilations->set_type(AttributeProto::INTS); + for (const dim_t kval : dilate) { + dilations->add_ints(static_cast<int64>(kval)); + } + + // group + AttributeProto* const group = node_proto->add_attribute(); + group->set_name("group"); + group->set_type(AttributeProto::INT); + group->set_i(static_cast<int64>(num_group)); + // kernel shape AttributeProto* const kernel_shape = node_proto->add_attribute(); kernel_shape->set_name("kernel_shape"); kernel_shape->set_type(AttributeProto::INTS);
@@ -195,14 +230,6 @@ void ConvertConvolution(NodeProto* node_proto, const NodeAttrs& attrs, pads->add_ints(static_cast<int64>(kval)); } - // dilations - AttributeProto* const dilations = node_proto->add_attribute(); - dilations->set_name("dilations"); - dilations->set_type(AttributeProto::INTS); - for (const dim_t kval : dilate) { - dilations->add_ints(static_cast<int64>(kval)); - } - // strides AttributeProto* const strides = node_proto->add_attribute(); strides->set_name("strides"); strides->set_type(AttributeProto::INTS); for (const dim_t kval : stride) { strides->add_ints(static_cast<int64>(kval)); } - - // group - AttributeProto* const group = node_proto->add_attribute(); - group->set_name("group"); - group->set_type(AttributeProto::INT); - group->set_i(static_cast<int64>(num_group)); } // end ConvertConvolution
void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs,
@@ -242,7 +263,7 @@ void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, AttributeProto* const kernel_shape = node_proto->add_attribute(); kernel_shape->set_name("kernel_shape"); kernel_shape->set_type(AttributeProto::INTS); - for (int kval : kernel) { + for (dim_t kval : kernel) { kernel_shape->add_ints(static_cast<int64>(kval)); }
@@ -250,15 +271,19 @@ void ConvertPooling(NodeProto* node_proto, const NodeAttrs& attrs, AttributeProto* const pads = node_proto->add_attribute(); pads->set_name("pads"); pads->set_type(AttributeProto::INTS); - for (int kval : pad) { - pads->add_ints(static_cast<int64>(kval)); + + // Convert from MXNet symmetric pads to ONNX non-symmetric by running through padding twice. + for (int i = 0; i < 2; i++) { + for (dim_t kval : pad) { + pads->add_ints(static_cast<int64>(kval)); + } } // strides AttributeProto* const strides = node_proto->add_attribute(); strides->set_name("strides"); strides->set_type(AttributeProto::INTS); - for (int kval : stride) { + for (dim_t kval : stride) { strides->add_ints(static_cast<int64>(kval)); }
@@ -315,11 +340,6 @@ void ConvertFullyConnected(NodeProto* node_proto, const NodeAttrs& attrs, beta->set_type(AttributeProto::FLOAT); beta->set_f(1.0f); - AttributeProto* const broadcast = node_proto->add_attribute(); - broadcast->set_name("broadcast"); - broadcast->set_type(AttributeProto::INT); - broadcast->set_i(1); - AttributeProto* const transA = node_proto->add_attribute(); transA->set_name("transA"); transA->set_type(AttributeProto::INT);
@@ -371,11 +391,6 @@ void ConvertBatchNorm(NodeProto* node_proto, const NodeAttrs& attrs, epsilon->set_type(AttributeProto::FLOAT); epsilon->set_f(static_cast<float>(param.eps)); - AttributeProto* const is_test = node_proto->add_attribute(); - is_test->set_name("is_test"); - is_test->set_type(AttributeProto::INT); - is_test->set_i(1); - AttributeProto* const momentum = node_proto->add_attribute(); momentum->set_name("momentum"); momentum->set_type(AttributeProto::FLOAT);
@@ -384,31 +399,16 @@ void ConvertBatchNorm(NodeProto* node_proto, const NodeAttrs& attrs, AttributeProto* const spatial = node_proto->add_attribute(); spatial->set_name("spatial"); spatial->set_type(AttributeProto::INT); - spatial->set_i(1); - - AttributeProto* const consumed = node_proto->add_attribute(); - consumed->set_name("consumed_inputs"); - consumed->set_type(AttributeProto::INTS); - - for (int i = 0; i < 5; i++) { - int val = (i < 3) ? 0 : 1; - consumed->add_ints(static_cast<int64>(val)); - } + // MXNet computes mean and variance per feature for batchnorm. Enabling spatial mode + // (default in ONNX3) implies running batchnorm on all spatial features so we need to explicitly + // disable this for MXNet's BatchNorm. + spatial->set_i(0); } void ConvertElementwiseAdd(NodeProto* node_proto, const NodeAttrs& /*attrs*/, const nnvm::IndexedGraph& /*ig*/, const array_view& /*inputs*/) { node_proto->set_op_type("Add"); - AttributeProto* const axis = node_proto->add_attribute(); - axis->set_name("axis"); - axis->set_type(AttributeProto::INT); - axis->set_i(1); - - AttributeProto* const broadcast = node_proto->add_attribute(); - broadcast->set_name("broadcast"); - broadcast->set_type(AttributeProto::INT); - broadcast->set_i(0); // 1 } std::unordered_map<std::string, TShape> GetPlaceholderShapes(
@@ -461,32 +461,40 @@ void ConvertPlaceholder( void ConvertConstant( GraphProto* const graph_proto, const std::string& node_name, std::unordered_map<std::string, NDArray>* const shared_buffer) { - NodeProto* const node_proto = graph_proto->add_node(); - node_proto->set_name(node_name); - node_proto->add_output(node_name); - node_proto->set_op_type("Constant"); + TensorProto* const initializer_proto = graph_proto->add_initializer(); + + // Create initializer for constants + initializer_proto->set_name(node_name); + // TODO(kellens): convert to fp16 if needed.
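A note on the pads conversion above: MXNet stores one pad per spatial axis, while ONNX expects explicit begin and end pads, so the symmetric pair (p_h, p_w) becomes [p_h, p_w, p_h, p_w] by emitting the list twice. A tiny self-contained sketch of that mapping:

#include <cstdint>
#include <cstdio>
#include <vector>

// MXNet-style symmetric pads (one value per axis) -> ONNX-style
// [x1_begin, x2_begin, x1_end, x2_end], produced by emitting the list twice.
std::vector<int64_t> to_onnx_pads(const std::vector<int64_t>& pad) {
  std::vector<int64_t> pads;
  for (int i = 0; i < 2; ++i)
    for (int64_t p : pad) pads.push_back(p);
  return pads;
}

int main() {
  for (int64_t p : to_onnx_pads({1, 2})) std::printf("%lld ", (long long)p);  // 1 2 1 2
}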
+ initializer_proto->set_data_type(TensorProto_DataType_FLOAT); const NDArray nd = shared_buffer->find(node_name)->second; const TBlob& blob = nd.data(); const TShape shape = blob.shape_; - const int32_t size = shape.Size(); + for (auto& dim : shape) { + initializer_proto->add_dims(static_cast<int64>(dim)); + } + + auto size = shape.Size(); + // TODO(kellens): Note hard coded float32 size assumed. std::shared_ptr<float> shared_data_ptr(new float[size]); float* const data_ptr = shared_data_ptr.get(); nd.SyncCopyToCPU(static_cast<void*>(data_ptr), size); - AttributeProto* const tensor_attr = node_proto->add_attribute(); - tensor_attr->set_name("value"); - tensor_attr->set_type(AttributeProto::TENSOR); - - TensorProto* const tensor_proto = tensor_attr->mutable_t(); - tensor_proto->set_data_type(TensorProto_DataType_FLOAT); - for (auto& dim : shape) { - tensor_proto->add_dims(static_cast<int64>(dim)); + for (size_t blob_idx = 0; blob_idx < size; ++blob_idx) { + initializer_proto->add_float_data(data_ptr[blob_idx]); } - for (int blob_idx = 0; blob_idx < size; ++blob_idx) { - tensor_proto->add_float_data(data_ptr[blob_idx]); + // Create inputs for constants. + ValueInfoProto* const input_proto = graph_proto->add_input(); + input_proto->set_name(node_name); + + // TODO(kellens): (fp16 support) + input_proto->mutable_type()->mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); + for (auto& dim : shape) { + auto new_dim = input_proto->mutable_type()->mutable_tensor_type()->mutable_shape()->add_dim(); + new_dim->set_dim_value(static_cast<int64>(dim)); } }
diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h index 7c450b77c..cc8e4db40 100644 --- a/src/operator/cudnn_rnn-inl.h +++ b/src/operator/cudnn_rnn-inl.h
@@ -699,7 +699,7 @@ class CuDNNRNNOp : public Operator { if (param_.p > 0) { CUDNN_CALL(cudnnDropoutGetStatesSize(s->dnn_handle_, &dropout_byte_)); dropout_size_ = dropout_byte_ / sizeof(DType); - dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU()); + dropout_states_ = Storage::Get()->Alloc(dropout_byte_, Context::GPU(s->dev_id)); } else { dropout_states_ = {}; dropout_byte_ = 0;
@@ -764,7 +764,7 @@ class CuDNNRNNOp : public Operator { &reserve_space_byte_)); workspace_size_ = workspace_byte_ / sizeof(DType); // Allocate the reserve space - reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU()); + reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id)); // Check that number of params are correct size_t cudnn_param_size;
diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h index c64ed28ec..448016341 100644 --- a/src/operator/image/image_random-inl.h +++ b/src/operator/image/image_random-inl.h
@@ -26,30 +26,45 @@ #define MXNET_OPERATOR_IMAGE_IMAGE_RANDOM_INL_H_ -#include #include -#include #include #include -#include #include +#include +#include "mxnet/base.h" #include "../mxnet_op.h" #include "../operator_common.h" +#if MXNET_USE_OPENCV + #include +#endif // MXNET_USE_OPENCV namespace mxnet { namespace op { namespace image { +// There are no parameters for this operator. +// Hence, no parameter registration.
+ +// Shape and Type inference for image to tensor operator inline bool ToTensorShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), 1U); CHECK_EQ(out_attrs->size(), 1U); + TShape &shp = (*in_attrs)[0]; if (!shp.ndim()) return false; - CHECK_EQ(shp.ndim(), 3) - << "Input image must have shape (height, width, channels), but got " << shp; - SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({shp[2], shp[0], shp[1]})); + + CHECK((shp.ndim() == 3) || (shp.ndim() == 4)) + << "Input image must have shape (height, width, channels), or " + << "(N, height, width, channels) but got " << shp; + if (shp.ndim() == 3) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({shp[2], shp[0], shp[1]})); + } else if (shp.ndim() == 4) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({shp[0], shp[3], shp[1], shp[2]})); + } + return true; } @@ -62,55 +77,119 @@ inline bool ToTensorType(const nnvm::NodeAttrs& attrs, return (*in_attrs)[0] != -1; } -void ToTensor(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - CHECK_EQ(req[0], kWriteTo) - << "`to_tensor` does not support inplace"; +// Operator Implementation + +template +struct totensor_forward { + template + MSHADOW_XINLINE static void Map(uint32_t c, float* out_data, const DType* in_data, + const int length, const int channel, const int step, + const float normalize_factor = 255.0f) { + #pragma omp parallel for + for (int i = 0; i < length; ++i) { + KERNEL_ASSIGN(out_data[step + c*length + i], req, + (in_data[step + i*channel + c]) / normalize_factor); + } + } +}; - int length = inputs[0].shape_[0] * inputs[0].shape_[1]; - int channel = inputs[0].shape_[2]; +template +void ToTensorImpl(const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs, + const std::vector &req, + const int length, + const uint32_t channel, + const int step = 0) { + mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { - float* output = outputs[0].dptr(); - DType* input = inputs[0].dptr(); + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + float* output = outputs[0].dptr(); + DType* input = inputs[0].dptr(); + mxnet_op::Kernel, xpu>::Launch( + s, channel, output, input, length, channel, step); + }); + }); +} - for (int l = 0; l < length; ++l) { - for (int c = 0; c < channel; ++c) { - output[c*length + l] = static_cast(input[l*channel + c]) / 255.0f; - } +template +void ToTensorOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + + CHECK_EQ(req[0], kWriteTo) + << "`to_tensor` does not support inplace updates"; + + // 3D Input - (h, w, c) + if (inputs[0].ndim() == 3) { + const int length = inputs[0].shape_[0] * inputs[0].shape_[1]; + const uint32_t channel = inputs[0].shape_[2]; + ToTensorImpl(ctx, inputs, outputs, req, length, channel); + } else if (inputs[0].ndim() == 4) { + // 4D input (n, h, w, c) + const int batch_size = inputs[0].shape_[0]; + const int length = inputs[0].shape_[1] * inputs[0].shape_[2]; + const uint32_t channel = inputs[0].shape_[3]; + const int step = channel * length; + + #pragma omp parallel for + for (auto n = 0; n < batch_size; ++n) { + ToTensorImpl(ctx, inputs, outputs, req, length, channel, n*step); } - }); + } } struct NormalizeParam : public dmlc::Parameter { nnvm::Tuple mean; nnvm::Tuple 
std; + DMLC_DECLARE_PARAMETER(NormalizeParam) { DMLC_DECLARE_FIELD(mean) - .describe("Sequence of mean for each channel."); + .set_default(nnvm::Tuple {0.0f, 0.0f, 0.0f, 0.0f}) + .describe("Sequence of means for each channel. " + "Default value is 0."); DMLC_DECLARE_FIELD(std) - .describe("Sequence of standard deviations for each channel."); + .set_default(nnvm::Tuple {1.0f, 1.0f, 1.0f, 1.0f}) + .describe("Sequence of standard deviations for each channel. " + "Default value is 1."); } }; -inline bool NormalizeShape(const nnvm::NodeAttrs& attrs, +// Shape and Type inference for image Normalize operator + +// Shape inference +inline bool NormalizeOpShape(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { const NormalizeParam ¶m = nnvm::get(attrs.parsed); + const auto& dshape = (*in_attrs)[0]; if (!dshape.ndim()) return false; - CHECK_EQ(dshape.ndim(), 3) - << "Input tensor must have shape (channels, height, width), but got " - << dshape; - auto nchannels = dshape[0]; - CHECK(nchannels == 3 || nchannels == 1) + CHECK((dshape.ndim() == 3) || (dshape.ndim() == 4)) + << "Input tensor must have shape (channels, height, width), or " + << "(N, channels, height, width), but got " << dshape; + + uint32_t nchannels; + if (dshape.ndim() == 3) { + nchannels = dshape[0]; + CHECK(nchannels == 3 || nchannels == 1) << "The first dimension of input tensor must be the channel dimension with " << "either 1 or 3 elements, but got input with shape " << dshape; - CHECK(param.mean.ndim() == 1 || param.mean.ndim() == nchannels) + } else if (dshape.ndim() == 4) { + nchannels = dshape[1]; + CHECK(nchannels == 3 || nchannels == 1) + << "The second dimension of input tensor must be the channel dimension with " + << "either 1 or 3 elements, but got input with shape " << dshape; + } + + CHECK((param.mean.ndim() == 1) || (param.mean.ndim() == nchannels)) << "Invalid mean for input with shape " << dshape << ". 
mean must have either 1 or " << nchannels << " elements, but got " << param.mean; @@ -123,28 +202,216 @@ inline bool NormalizeShape(const nnvm::NodeAttrs& attrs, return true; } -void Normalize(const nnvm::NodeAttrs &attrs, +// Type Inference +inline bool NormalizeOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + return out_attrs->at(0) != -1; +} + +template +struct normalize_forward { + template + MSHADOW_XINLINE static void Map(uint32_t c, DType* out_data, const DType* in_data, + const float mean_d0, const float mean_d1, const float mean_d2, + const float std_d0, const float std_d1, const float std_d2, + const int length, const int step) { + float mean, std; + switch (c) { + case 0 : mean = mean_d0; + std = std_d0; + break; + case 1 : mean = mean_d1; + std = std_d1; + break; + case 2 : mean = mean_d2; + std = std_d2; + break; + } + #pragma omp parallel for + for (int i = 0; i < length; ++i) { + KERNEL_ASSIGN(out_data[step + c*length + i], req, + (in_data[step + c*length + i] - mean) / std); + } + } +}; + +template +void NormalizeImpl(const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs, + const std::vector &req, + const float mean_d0, const float mean_d1, + const float mean_d2, const float std_d0, + const float std_d1, const float std_d2, + const int length, + const uint32_t channel, + const int step = 0) { + mshadow::Stream *s = ctx.get_stream(); + + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + DType* input = inputs[0].dptr(); + DType* output = outputs[0].dptr(); + mxnet_op::Kernel, xpu>::Launch( + s, channel, output, input, mean_d0, mean_d1, mean_d2, + std_d0, std_d1, std_d2, length, step); + }); + }); +} + +template +void NormalizeOpForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const NormalizeParam ¶m = nnvm::get(attrs.parsed); - int nchannels = inputs[0].shape_[0]; - int length = inputs[0].shape_[1] * inputs[0].shape_[2]; + // Note: We need mean and std_dev in the kernel. + // It is costly (device copy) to pass it as vector, for gpu kernel. + // Hence, passing it as below for performance. + float mean_d0, mean_d1, mean_d2; + float std_d0, std_d1, std_d2; + + // Mean and Std can be 1 or 3 D only. + if (param.mean.ndim() == 1) { + mean_d0 = mean_d1 = mean_d2 = param.mean[0]; + } else { + mean_d0 = param.mean[0]; + mean_d1 = param.mean[1]; + mean_d2 = param.mean[2]; + } - MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { - DType* input = inputs[0].dptr(); - DType* output = outputs[0].dptr(); + if (param.std.ndim() == 1) { + std_d0 = std_d1 = std_d2 = param.std[0]; + } else { + std_d0 = param.std[0]; + std_d1 = param.std[1]; + std_d2 = param.std[2]; + } - for (int i = 0; i < nchannels; ++i) { - DType mean = param.mean[param.mean.ndim() > 1 ? i : 0]; - DType std = param.std[param.std.ndim() > 1 ? 
i : 0]; - for (int j = 0; j < length; ++j) { - output[i*length + j] = (input[i*length + j] - mean) / std; - } + // 3D input (c, h, w) + if (inputs[0].ndim() == 3) { + const int length = inputs[0].shape_[1] * inputs[0].shape_[2]; + const uint32_t channel = inputs[0].shape_[0]; + NormalizeImpl(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2, + std_d0, std_d1, std_d2, length, channel); + } else if (inputs[0].ndim() == 4) { + // 4D input (n, c, h, w) + const int batch_size = inputs[0].shape_[0]; + const int length = inputs[0].shape_[2] * inputs[0].shape_[3]; + const uint32_t channel = inputs[0].shape_[1]; + const int step = channel * length; + + #pragma omp parallel for + for (auto n = 0; n < batch_size; ++n) { + NormalizeImpl(ctx, inputs, outputs, req, mean_d0, mean_d1, mean_d2, + std_d0, std_d1, std_d2, length, channel, n*step); } - }); + } +} + +// Backward function +template +struct normalize_backward { + template + MSHADOW_XINLINE static void Map(uint32_t c, DType* in_grad, const DType* out_grad, + const float std_d0, const float std_d1, const float std_d2, + const int length, const int step) { + // d/dx{(x - mean) / std_dev} => (1 / std_dev) + float std_dev; + switch (c) { + case 0 : std_dev = std_d0; + break; + case 1 : std_dev = std_d1; + break; + case 2 : std_dev = std_d2; + break; + } + + #pragma omp parallel for + for (int i = 0; i < length; ++i) { + KERNEL_ASSIGN(in_grad[step + c*length + i], req, + out_grad[step + c*length + i] * (1.0 / std_dev)); + } + } +}; + +template +void NormalizeBackwardImpl(const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs, + const std::vector &req, + const float std_d0, const float std_d1, const float std_d2, + const int length, + const uint32_t channel, + const int step = 0) { + mshadow::Stream *s = ctx.get_stream(); + + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { + DType* out_grad = inputs[0].dptr(); + DType* in_grad = outputs[0].dptr(); + mxnet_op::Kernel, xpu>::Launch( + s, channel, in_grad, out_grad, std_d0, std_d1, std_d2, length, step); + }); + }); +} + +template +void NormalizeOpBackward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + + const NormalizeParam ¶m = nnvm::get(attrs.parsed); + float std_d0, std_d1, std_d2; + + // Std can be 1 or 3 D only + if (param.std.ndim() == 1) { + std_d0 = std_d1 = std_d2 = param.std[0]; + } else { + std_d0 = param.std[0]; + std_d1 = param.std[1]; + std_d2 = param.std[2]; + } + + // Note: inputs[0] is out_grad + const TBlob& in_data = inputs[1]; + + // 3D input (c, h, w) + if (in_data.ndim() == 3) { + const int length = in_data.shape_[1] * in_data.shape_[2]; + const uint32_t channel = in_data.shape_[0]; + NormalizeBackwardImpl(ctx, inputs, outputs, req, std_d0, std_d1, std_d2, length, channel); + } else if (in_data.ndim() == 4) { + // 4D input (n, c, h, w) + const int batch_size = in_data.shape_[0]; + const int length = in_data.shape_[2] * in_data.shape_[3]; + const uint32_t channel = in_data.shape_[1]; + const int step = channel * length; + + #pragma omp parallel for + for (auto n = 0; n < batch_size; ++n) { + NormalizeBackwardImpl(ctx, inputs, outputs, req, + std_d0, std_d1, std_d2, length, + channel, n*step); + } + } } template @@ -190,7 +457,7 @@ void FlipImpl(const TShape &shape, DType *src, DType *dst) { } } -void FlipLeftRight(const 
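NormalizeOpForward and NormalizeOpBackward above reduce to simple per-channel affine math: y = (x - mean) / std on the way forward, and a constant 1/std scaling of the incoming gradient on the way back. A standalone sketch (hypothetical names, assuming float data, not part of the patch):

#include <iostream>
#include <vector>

// Per-channel normalize and its gradient, mirroring normalize_forward /
// normalize_backward: y = (x - mean) / std, dL/dx = dL/dy * (1 / std).
void NormalizeChannel(const std::vector<float>& x, float mean, float stddev,
                      std::vector<float>* y) {
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) (*y)[i] = (x[i] - mean) / stddev;
}

void NormalizeChannelGrad(const std::vector<float>& dy, float stddev,
                          std::vector<float>* dx) {
  dx->resize(dy.size());
  for (size_t i = 0; i < dy.size(); ++i) (*dx)[i] = dy[i] * (1.0f / stddev);
}

int main() {
  std::vector<float> x = {0.f, 0.5f, 1.f}, y, dx;
  NormalizeChannel(x, /*mean=*/0.5f, /*stddev=*/2.0f, &y);    // -> -0.25 0 0.25
  NormalizeChannelGrad({1.f, 1.f, 1.f}, 2.0f, &dx);           // -> 0.5 0.5 0.5
  std::cout << y[0] << ' ' << dx[0] << '\n';
}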
nnvm::NodeAttrs &attrs, +inline void FlipLeftRight(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -202,7 +469,7 @@ void FlipLeftRight(const nnvm::NodeAttrs &attrs, }); } -void FlipTopBottom(const nnvm::NodeAttrs &attrs, +inline void FlipTopBottom(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -214,7 +481,7 @@ void FlipTopBottom(const nnvm::NodeAttrs &attrs, }); } -void RandomFlipLeftRight( +inline void RandomFlipLeftRight( const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -235,7 +502,7 @@ void RandomFlipLeftRight( }); } -void RandomFlipTopBottom( +inline void RandomFlipTopBottom( const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, @@ -287,7 +554,7 @@ inline void AdjustBrightnessImpl(const float& alpha_b, }); } -void RandomBrightness(const nnvm::NodeAttrs &attrs, +inline void RandomBrightness(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -405,7 +672,7 @@ inline void RandomSaturation(const nnvm::NodeAttrs &attrs, AdjustSaturationImpl(alpha_s, ctx, inputs, req, outputs); } -void RGB2HLSConvert(const float& src_r, +inline void RGB2HLSConvert(const float& src_r, const float& src_g, const float& src_b, float *dst_h, @@ -443,7 +710,7 @@ void RGB2HLSConvert(const float& src_r, *dst_s = s; } -void HLS2RGBConvert(const float& src_h, +inline void HLS2RGBConvert(const float& src_h, const float& src_l, const float& src_s, float *dst_r, @@ -494,7 +761,7 @@ void HLS2RGBConvert(const float& src_h, *dst_r = r * 255.f; } -void AdjustHueImpl(float alpha, +inline void AdjustHueImpl(float alpha, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -521,7 +788,7 @@ void AdjustHueImpl(float alpha, }); } -void RandomHue(const nnvm::NodeAttrs &attrs, +inline void RandomHue(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -554,7 +821,7 @@ struct RandomColorJitterParam : public dmlc::Parameter { } }; -void RandomColorJitter(const nnvm::NodeAttrs &attrs, +inline void RandomColorJitter(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -623,7 +890,7 @@ struct RandomLightingParam : public dmlc::Parameter { } }; -void AdjustLightingImpl(const nnvm::Tuple& alpha, +inline void AdjustLightingImpl(const nnvm::Tuple& alpha, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -658,7 +925,7 @@ void AdjustLightingImpl(const nnvm::Tuple& alpha, }); } -void AdjustLighting(const nnvm::NodeAttrs &attrs, +inline void AdjustLighting(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, @@ -668,7 +935,7 @@ void AdjustLighting(const nnvm::NodeAttrs &attrs, AdjustLightingImpl(param.alpha, ctx, inputs, req, outputs); } -void RandomLighting(const nnvm::NodeAttrs &attrs, +inline void RandomLighting(const nnvm::NodeAttrs &attrs, const OpContext &ctx, const std::vector &inputs, const std::vector &req, diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc index 26f520bb8..810bffbdd 100644 --- a/src/operator/image/image_random.cc +++ b/src/operator/image/image_random.cc @@ -39,31 +39,155 @@ DMLC_REGISTER_PARAMETER(RandomLightingParam); DMLC_REGISTER_PARAMETER(RandomColorJitterParam); NNVM_REGISTER_OP(_image_to_tensor) -.describe(R"code()code" ADD_FILELINE) 
+.describe(R"code(Converts an image NDArray of shape (H x W x C) or (N x H x W x C) +with values in the range [0, 255] to a tensor NDArray of shape (C x H x W) or (N x C x H x W) +with values in the range [0, 1) + +Example: + .. code-block:: python + image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8) + to_tensor(image) + [[[ 0.85490197 0.72156864] + [ 0.09019608 0.74117649] + [ 0.61960787 0.92941177] + [ 0.96470588 0.1882353 ]] + [[ 0.6156863 0.73725492] + [ 0.46666667 0.98039216] + [ 0.44705883 0.45490196] + [ 0.01960784 0.8509804 ]] + [[ 0.39607844 0.03137255] + [ 0.72156864 0.52941179] + [ 0.16470589 0.7647059 ] + [ 0.05490196 0.70588237]]] + + + image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8) + to_tensor(image) + [[[[0.11764706 0.5803922 ] + [0.9411765 0.10588235] + [0.2627451 0.73333335] + [0.5647059 0.32156864]] + [[0.7176471 0.14117648] + [0.75686276 0.4117647 ] + [0.18431373 0.45490196] + [0.13333334 0.6156863 ]] + [[0.6392157 0.5372549 ] + [0.52156866 0.47058824] + [0.77254903 0.21568628] + [0.01568628 0.14901961]]] + [[[0.6117647 0.38431373] + [0.6784314 0.6117647 ] + [0.69411767 0.96862745] + [0.67058825 0.35686275]] + [[0.21960784 0.9411765 ] + [0.44705883 0.43529412] + [0.09803922 0.6666667 ] + [0.16862746 0.1254902 ]] + [[0.6156863 0.9019608 ] + [0.35686275 0.9019608 ] + [0.05882353 0.6509804 ] + [0.20784314 0.7490196 ]]]] + +)code" ADD_FILELINE) .set_num_inputs(1) .set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) .set_attr("FInferShape", ToTensorShape) .set_attr("FInferType", ToTensorType) -.set_attr("FCompute", ToTensor) +.set_attr("FCompute", ToTensorOpForward) .set_attr("FGradient", ElemwiseGradUseNone{ "_copy" }) -.add_argument("data", "NDArray-or-Symbol", "The input."); +.add_argument("data", "NDArray-or-Symbol", "Input ndarray"); NNVM_REGISTER_OP(_image_normalize) -.describe(R"code()code" ADD_FILELINE) +.describe(R"code(Normalize an tensor of shape (C x H x W) or (N x C x H x W) with mean and + standard deviation. + + Given mean `(m1, ..., mn)` and std `(s\ :sub:`1`\ , ..., s\ :sub:`n`)` for `n` channels, + this transform normalizes each channel of the input tensor with: + +.. math:: + + output[i] = (input[i] - m\ :sub:`i`\ ) / s\ :sub:`i` + + If mean or std is scalar, the same value will be applied to all channels. + + Default value for mean is 0.0 and stand deviation is 1.0. + +Example: + + .. 
code-block:: python + image = mx.nd.random.uniform(0, 1, (3, 4, 2)) + normalize(image, mean=(0, 1, 2), std=(3, 2, 1)) + [[[ 0.18293785 0.19761486] + [ 0.23839645 0.28142193] + [ 0.20092112 0.28598186] + [ 0.18162774 0.28241724]] + [[-0.2881726 -0.18821815] + [-0.17705294 -0.30780914] + [-0.2812064 -0.3512327 ] + [-0.05411351 -0.4716435 ]] + [[-1.0363373 -1.7273437 ] + [-1.6165586 -1.5223348 ] + [-1.208275 -1.1878313 ] + [-1.4711051 -1.5200229 ]]] + + + image = mx.nd.random.uniform(0, 1, (2, 3, 4, 2)) + normalize(image, mean=(0, 1, 2), std=(3, 2, 1)) + [[[[ 0.18934818 0.13092826] + [ 0.3085322 0.27869293] + [ 0.02367868 0.11246539] + [ 0.0290431 0.2160573 ]] + [[-0.4898908 -0.31587923] + [-0.08369008 -0.02142242] + [-0.11092162 -0.42982462] + [-0.06499392 -0.06495637]] + [[-1.0213816 -1.526392 ] + [-1.2008414 -1.1990893 ] + [-1.5385206 -1.4795225 ] + [-1.2194707 -1.3211205 ]]] + [[[ 0.03942481 0.24021089] + [ 0.21330701 0.1940066 ] + [ 0.04778443 0.17912441] + [ 0.31488964 0.25287187]] + [[-0.23907584 -0.4470462 ] + [-0.29266903 -0.2631998 ] + [-0.3677222 -0.40683383] + [-0.11288315 -0.13154092]] + [[-1.5438497 -1.7834496 ] + [-1.431566 -1.8647819 ] + [-1.9812102 -1.675859 ] + [-1.3823645 -1.8503251 ]]]] + +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) .set_num_inputs(1) .set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", NormalizeShape) -.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", NormalizeOpShape) +.set_attr("FInferType", NormalizeOpType) +.set_attr("FCompute", NormalizeOpForward) .set_attr("FInplaceOption", - [](const NodeAttrs& attrs){ + [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; }) -.set_attr("FCompute", Normalize) -.set_attr("FGradient", ElemwiseGradUseNone{ "_copy" }) -.add_argument("data", "NDArray-or-Symbol", "The input.") +.set_attr("FGradient", ElemwiseGradUseIn{ "_backward_image_normalize"}) +.add_argument("data", "NDArray-or-Symbol", "Input ndarray") .add_arguments(NormalizeParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_image_normalize) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FCompute", NormalizeOpBackward); + MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_left_right) .describe(R"code()code" ADD_FILELINE) .set_attr("FCompute", FlipLeftRight); diff --git a/src/operator/image/image_random.cu b/src/operator/image/image_random.cu new file mode 100644 index 000000000..5f9aff27e --- /dev/null +++ b/src/operator/image/image_random.cu @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +/*! 
+* \file image_random.cu +* \brief GPU Implementation of image transformation operators +*/ +#include "./image_random-inl.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { +namespace image { + +NNVM_REGISTER_OP(_image_to_tensor) +.set_attr("FCompute", ToTensorOpForward); + +NNVM_REGISTER_OP(_image_normalize) +.set_attr("FCompute", NormalizeOpForward); + +NNVM_REGISTER_OP(_backward_image_normalize) +.set_attr("FCompute", NormalizeOpBackward); + +} // namespace image +} // namespace op +} // namespace mxnet diff --git a/src/operator/image/image_utils.h b/src/operator/image/image_utils.h new file mode 100644 index 000000000..a7155345c --- /dev/null +++ b/src/operator/image/image_utils.h @@ -0,0 +1,59 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ + +/*! + * Copyright (c) 2019 by Contributors + * \file image_utils.h + * \brief the image operator utility function implementation + * \author Jake Lee + */ + +#ifndef MXNET_OPERATOR_IMAGE_IMAGE_UTILS_H_ +#define MXNET_OPERATOR_IMAGE_IMAGE_UTILS_H_ + +#include +#if MXNET_USE_OPENCV + #include +#endif // MXNET_USE_OPENCV + +namespace mxnet { +namespace op { +namespace image { + +enum ImageLayout {H, W, C}; +enum BatchImageLayout {N, kH, kW, kC}; + +struct SizeParam { + int height; + int width; + SizeParam() { + height = 0; + width = 0; + } + SizeParam(int height_, int width_) { + height = height_; + width = width_; + } +}; // struct SizeParam + +} // namespace image +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_IMAGE_IMAGE_UTILS_H_ diff --git a/src/operator/image/resize-inl.h b/src/operator/image/resize-inl.h new file mode 100644 index 000000000..3e1310068 --- /dev/null +++ b/src/operator/image/resize-inl.h @@ -0,0 +1,218 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ +/*! 
+* \file resize-inl.h +* \brief image resize operator using opencv; only supports bilinear resize +* \author Jake Lee +*/ +#ifndef MXNET_OPERATOR_IMAGE_RESIZE_INL_H_ +#define MXNET_OPERATOR_IMAGE_RESIZE_INL_H_ + +#include +#include + +#include "../mxnet_op.h" +#include "../operator_common.h" +#include "image_utils.h" + +#if MXNET_USE_OPENCV + #include +#endif // MXNET_USE_OPENCV + +namespace mxnet { +namespace op { +namespace image { + +using namespace mshadow; + +#if MXNET_USE_CUDA +template +void ResizeImplCUDA(Stream *s, + const T input, + const T output); +#endif // MXNET_USE_CUDA + +struct ResizeParam : public dmlc::Parameter { + nnvm::Tuple size; + bool keep_ratio; + int interp; + DMLC_DECLARE_PARAMETER(ResizeParam) { + DMLC_DECLARE_FIELD(size) + .set_default(nnvm::Tuple()) + .describe("Size of new image. Could be (width, height) or (size)"); + DMLC_DECLARE_FIELD(keep_ratio) + .describe("Whether to resize the short edge or both edges to `size`, " + "if size is given as an integer.") + .set_default(false); + DMLC_DECLARE_FIELD(interp) + .set_default(1) + .describe("Interpolation method for resizing. By default uses bilinear interpolation. " + "Options are INTER_NEAREST - a nearest-neighbor interpolation, " + "INTER_LINEAR - a bilinear interpolation, " + "INTER_AREA - resampling using pixel area relation, " + "INTER_CUBIC - a bicubic interpolation over 4x4 pixel neighborhood, " + "INTER_LANCZOS4 - a Lanczos interpolation over 8x8 pixel neighborhood. " + "Note that the GPU version only supports bilinear interpolation (1), " + "and the result on cpu would be slightly different from gpu: " + "the cpu path uses the opencv resize function, which tends to align centers, " + "while the gpu path uses contrib.bilinearResize2D, which aligns corners."); + } +}; +// Handle the keep_ratio param +inline SizeParam GetHeightAndWidth(int data_h, + int data_w, + const ResizeParam& param) { + CHECK((param.size.ndim() == 1) || (param.size.ndim() == 2)) + << "Input size dimension must be 1 or 2, but got " + << param.size.ndim(); + int resized_h; + int resized_w; + if (param.size.ndim() == 1) { + CHECK_GT(param.size[0], 0) + << "Input size should be greater than 0, but got " + << param.size[0]; + if (!param.keep_ratio) { + resized_h = param.size[0]; + resized_w = param.size[0]; + } else { + if (data_h > data_w) { + resized_w = param.size[0]; + resized_h = static_cast(data_h * resized_w / data_w); + } else { + resized_h = param.size[0]; + resized_w = static_cast(data_w * resized_h / data_h); + } + } + } else { + CHECK_GT(param.size[0], 0) + << "Input width should be greater than 0, but got " + << param.size[0]; + CHECK_GT(param.size[1], 0) + << "Input height should be greater than 0, but got " + << param.size[1]; + resized_h = param.size[1]; + resized_w = param.size[0]; + } + return SizeParam(resized_h, resized_w); +} + +inline bool ResizeShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // input attrs should only be (h, w, c) or (n, h, w, c) + CHECK((in_attrs->at(0).ndim() == 3U) || (in_attrs->at(0).ndim() == 4U)) + << "Input image dimension should be 3 or 4 but got " + << in_attrs->at(0).ndim(); + const auto& ishape = (*in_attrs)[0]; + const ResizeParam& param = nnvm::get(attrs.parsed); + SizeParam size; + if (ishape.ndim() == 3) { + size = GetHeightAndWidth(ishape[H], ishape[W], param); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({size.height, size.width, ishape[C]})); + } else { + size = GetHeightAndWidth(ishape[kH], ishape[kW], param); + SHAPE_ASSIGN_CHECK(*out_attrs, 0, + TShape({ishape[N],
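GetHeightAndWidth above implements the keep_ratio contract: a two-element size is taken as (width, height), while a single value either resizes both edges or only the short edge. A standalone restatement (hypothetical helper, integer math as in the source, not part of the patch):

#include <cassert>
#include <iostream>
#include <utility>
#include <vector>

// `size` holds either one value (optionally resizing only the short edge)
// or (width, height). Returns (height, width).
std::pair<int, int> ResizedHW(int data_h, int data_w,
                              const std::vector<int>& size, bool keep_ratio) {
  assert(size.size() == 1 || size.size() == 2);
  if (size.size() == 2) return {size[1], size[0]};           // (w, h) -> (h, w)
  if (!keep_ratio) return {size[0], size[0]};
  if (data_h > data_w)                                       // short edge is w
    return {data_h * size[0] / data_w, size[0]};
  return {size[0], data_w * size[0] / data_h};               // short edge is h
}

int main() {
  auto hw = ResizedHW(400, 200, {100}, /*keep_ratio=*/true); // -> (200, 100)
  std::cout << hw.first << 'x' << hw.second << '\n';
}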
size.height, size.width, ishape[kC]})); } return true; } +inline void ResizeImpl(const std::vector &inputs, + const std::vector &outputs, + const int height, + const int width, + const int interp, + const int input_index = 0, + const int output_index = 0) { +#if MXNET_USE_OPENCV + CHECK_NE(inputs[0].type_flag_, mshadow::kFloat16) << "opencv image mat doesn't support fp16"; + CHECK((inputs[0].type_flag_ != mshadow::kInt32) && (inputs[0].type_flag_ != mshadow::kInt64)) + << "opencv resize doesn't support int32, int64"; + // map the mshadow dtype and channel count to an opencv matrix element type + const int DTYPE[] = {CV_32F, CV_64F, -1, CV_8U, CV_32S}; + if (inputs[0].ndim() == 3) { + const int cv_type = CV_MAKETYPE(DTYPE[inputs[0].type_flag_], inputs[0].shape_[C]); + cv::Mat buf(inputs[0].shape_[H], inputs[0].shape_[W], cv_type, inputs[0].dptr_); + cv::Mat dst(outputs[0].shape_[H], outputs[0].shape_[W], cv_type, outputs[0].dptr_); + cv::resize(buf, dst, cv::Size(width, height), 0, 0, interp); + CHECK(!dst.empty()); + CHECK_EQ(static_cast(dst.ptr()), outputs[0].dptr_); + } else { + const int cv_type = CV_MAKETYPE(DTYPE[inputs[0].type_flag_], inputs[0].shape_[kC]); + MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, { + cv::Mat buf(inputs[0].shape_[kH], inputs[0].shape_[kW], cv_type, + inputs[0].dptr() + input_index); + cv::Mat dst(outputs[0].shape_[kH], outputs[0].shape_[kW], cv_type, + outputs[0].dptr() + output_index); + cv::resize(buf, dst, cv::Size(width, height), 0, 0, interp); + CHECK(!dst.empty()); + CHECK_EQ(static_cast(dst.ptr()), outputs[0].dptr() + output_index); + }); + } +#else + LOG(FATAL) << "Build with USE_OPENCV=1 for image resize operator."; +#endif // MXNET_USE_OPENCV +} + +template +inline void Resize(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(outputs.size(), 1U); + const ResizeParam& param = nnvm::get(attrs.parsed); + SizeParam size; + if (std::is_same::value) { +#if MXNET_USE_CUDA + CHECK(param.interp == 1) << "interp should be 1 for using Resize on GPU."; + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (inputs[0].ndim() == 3) { + Tensor input = inputs[0].get(s); + Tensor output = outputs[0].get(s); + ResizeImplCUDA, float>(s, input, output); + } else { + Tensor input = inputs[0].get(s); + Tensor output = outputs[0].get(s); + ResizeImplCUDA, float>(s, input, output); + } + }); +#endif // MXNET_USE_CUDA + } else if (inputs[0].ndim() == 3) { + size = GetHeightAndWidth(inputs[0].shape_[H], inputs[0].shape_[W], param); + ResizeImpl(inputs, outputs, size.height, size.width, param.interp); + } else { + size = GetHeightAndWidth(inputs[0].shape_[kH], inputs[0].shape_[kW], param); + const auto batch_size = inputs[0].shape_[N]; + const auto input_step = inputs[0].shape_[kH] * inputs[0].shape_[kW] * inputs[0].shape_[kC]; + const auto output_step = size.height * size.width * inputs[0].shape_[kC]; + #pragma omp parallel for + for (auto i = 0; i < batch_size; ++i) { + ResizeImpl(inputs, outputs, size.height, size.width, + param.interp, i * input_step, i * output_step); + } + } +} + +} // namespace image +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_IMAGE_RESIZE_INL_H_ diff --git a/src/operator/image/resize.cc b/src/operator/image/resize.cc new file mode 100644 index 000000000..d3b28f080 --- /dev/null +++ b/src/operator/image/resize.cc @@ -0,0 +1,83 @@ +/* +* Licensed to the Apache Software Foundation (ASF)
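The key trick in ResizeImpl above is wrapping the operator's pre-allocated buffers in cv::Mat headers so cv::resize writes straight into the output tensor's memory with no extra copy (that is what the CHECK_EQ on dst.ptr() verifies). A minimal sketch of that pattern, assuming OpenCV is available (not part of the patch):

#include <cstdint>
#include <iostream>
#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>

int main() {
  const int in_h = 4, in_w = 4, out_h = 2, out_w = 2, channels = 3;
  std::vector<uint8_t> src_buf(in_h * in_w * channels, 128);
  std::vector<uint8_t> dst_buf(out_h * out_w * channels);

  // cv::Mat headers over external memory: no allocation, no copy.
  cv::Mat src(in_h, in_w, CV_MAKETYPE(CV_8U, channels), src_buf.data());
  cv::Mat dst(out_h, out_w, CV_MAKETYPE(CV_8U, channels), dst_buf.data());
  cv::resize(src, dst, cv::Size(out_w, out_h), 0, 0, cv::INTER_LINEAR);

  // dst still aliases dst_buf: the result landed in our buffer.
  std::cout << (dst.ptr() == dst_buf.data()) << ' ' << int(dst_buf[0]) << '\n';
}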
under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ +/*! + * Copyright (c) 2019 by Contributors + * \file resize.cc + * \brief resize operator cpu + * \author Jake Lee +*/ +#include +#include "./resize-inl.h" +#include "../operator_common.h" +#include "../elemwise_op_common.h" + +namespace mxnet { +namespace op { +namespace image { + +DMLC_REGISTER_PARAMETER(ResizeParam); + +NNVM_REGISTER_OP(_image_resize) +.describe(R"code(Resize an image NDArray of shape (H x W x C) or (N x H x W x C) +to the given size +Example: + .. code-block:: python + image = mx.nd.random.uniform(0, 255, (4, 2, 3)).astype(dtype=np.uint8) + mx.nd.image.resize(image, (3, 3)) + [[[124 111 197] + [158 80 155] + [193 50 112]] + + [[110 100 113] + [134 165 148] + [157 231 182]] + + [[202 176 134] + [174 191 149] + [147 207 164]]] + + image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8) + mx.nd.image.resize(image, (2, 2)) + [[[[ 59 133 80] + [187 114 153]] + + [[ 38 142 39] + [207 131 124]]] + + + [[[117 125 136] + [191 166 150]] + + [[129 63 113] + [182 109 48]]]] + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", ResizeShape) +.set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FCompute", Resize) +.set_attr("FGradient", ElemwiseGradUseNone{ "_copy" }) +.add_argument("data", "NDArray-or-Symbol", "The input.") +.add_arguments(ResizeParam::__FIELDS__()); + +} // namespace image +} // namespace op +} // namespace mxnet diff --git a/src/operator/image/resize.cu b/src/operator/image/resize.cu new file mode 100644 index 000000000..f045f3b23 --- /dev/null +++ b/src/operator/image/resize.cu @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file bilinear_resize.cu + * \brief bilinear resize operator + * \author Hang Zhang, Jake Lee +*/ +#include +#include "./resize-inl.h" +#include "../contrib/bilinear_resize-inl.cuh" + +namespace mxnet { +namespace op { +namespace image { + +using namespace mshadow; + +template +void ResizeImplCUDA(mshadow::Stream *s, + const T input, + const T output) { + int outputHeight; + int outputWidth; + int inputHeight; + int inputWidth; + mxnet::op::ImageLayout layout; + if (std::is_same>::value) { + layout = HWC; + outputHeight = output.size(0); + outputWidth = output.size(1); + inputHeight = input.size(0); + inputWidth = input.size(1); + } else { + layout = NHWC; + outputHeight = output.size(1); + outputWidth = output.size(2); + inputHeight = input.size(1); + inputWidth = input.size(2); + } + const AccReal rheight = (outputHeight > 1) ? (AccReal)(inputHeight - 1)/ + (outputHeight - 1) : AccReal(0); + const AccReal rwidth = (outputWidth > 1) ? (AccReal)(inputWidth - 1)/ + (outputWidth - 1) : AccReal(0); + const int num_kernels = outputHeight * outputWidth; + const int num_threads = getNumThreads(inputHeight * inputWidth, false); + dim3 blocks(static_cast(num_kernels / num_threads) + 1); + dim3 threads(num_threads); + cudaStream_t stream = mshadow::Stream::GetStream(s); + caffe_gpu_interp2_kernel + <<>>( + num_kernels, rheight, rwidth, input, output, layout); + MSHADOW_CUDA_POST_KERNEL_CHECK(caffe_gpu_interp2_kernel); +} + +NNVM_REGISTER_OP(_image_resize) +.set_attr("FCompute", Resize); + +} // namespace image +} // namespace op +} // namespace mxnet diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h index 0b20a0263..f56436b8f 100644 --- a/src/operator/mshadow_op.h +++ b/src/operator/mshadow_op.h @@ -31,6 +31,7 @@ #include "math_functions-inl.h" #include "special_functions-inl.h" #include "./operator_tune.h" +#include "./contrib/erfinv-inl.h" #ifdef __CUDACC__ #include @@ -169,6 +170,8 @@ struct softrelu : public mxnet_op::tunable { MXNET_UNARY_MATH_OP(softrelu_grad, -math::expm1(-a)); +MXNET_UNARY_MATH_OP(erfinv_grad, 0.5 * math::sqrt(PI) * math::exp(math::sqr(erfinv::Map(a)))); + MXNET_UNARY_MATH_OP(erf_grad, 2.0 / math::sqrt(PI) * math::exp(-(a * a))); MXNET_SIMPLE_UNARY_MATH_OP(erf); diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 9668c227b..2a828994f 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -39,12 +39,15 @@ #include "../random/sampler.h" #include "../tensor/elemwise_binary_broadcast_op.h" -#if defined(USE_MKL) && defined(_OPENMP) +#define MXNET_USE_MKL_DROPOUT defined(USE_MKL) && defined(_OPENMP) && !defined(__CUDACC__) +#if MXNET_USE_MKL_DROPOUT #include #include #include -#endif // USE_MKL && _OPENMP +#endif // MXNET_USE_MKL_DROPOUT + +#define MXNET_USE_CUDNN_DROPOUT MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 namespace dropout { enum DropoutOpInputs {kData}; @@ -62,6 +65,7 @@ struct DropoutParam : public dmlc::Parameter { float p; int mode; TShape axes; + dmlc::optional cudnn_off; DMLC_DECLARE_PARAMETER(DropoutParam) { DMLC_DECLARE_FIELD(p).set_default(0.5) .set_range(0, 1) @@ -73,12 +77,15 @@ struct DropoutParam : public dmlc::Parameter { .describe("Whether to only turn on dropout during training or to also turn on for inference."); DMLC_DECLARE_FIELD(axes).set_default(TShape()) .describe("Axes for variational dropout kernel."); + DMLC_DECLARE_FIELD(cudnn_off).set_default(dmlc::optional(true)) + .describe("Whether to turn off cudnn in dropout operator. 
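ResizeImplCUDA above precomputes the bilinear scale factors with the (in - 1) / (out - 1) convention, so the first and last output samples land exactly on the first and last input pixels (corner alignment). A standalone sketch of the coordinate mapping this produces (hypothetical helper, not part of the patch):

#include <iostream>

// Source coordinate for a destination index under align-corners mapping:
// src = dst_index * (in - 1) / (out - 1), with r = 0 when out == 1.
float SourceCoord(int dst_index, int in_size, int out_size) {
  const float r = (out_size > 1)
      ? static_cast<float>(in_size - 1) / (out_size - 1)
      : 0.0f;
  return r * dst_index;
}

int main() {
  // Upscaling 4 -> 7: endpoints map to 0 and 3, interior points interpolate.
  for (int i = 0; i < 7; ++i) std::cout << SourceCoord(i, 4, 7) << ' ';
  std::cout << '\n';   // 0 0.5 1 1.5 2 2.5 3
}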
" + "This option is ignored if axes is specified."); } }; // struct DropoutParam template class DropoutOp { -#if defined(USE_MKL) && defined(_OPENMP) +#if MXNET_USE_MKL_DROPOUT static void BernoulliGenerate(common::random::RandGenerator gen, int n, double p, int* r) { typename RandGenerator::Impl genImpl(&gen, 1); @@ -100,86 +107,55 @@ class DropoutOp { } } } - - // MKL forward pass - static bool MSHADOW_CINLINE MKLForward(mshadow::Stream *s, RandGenerator *pgen, - const double pkeep, - const std::vector &in_data, - const std::vector &out_data) { + static inline bool MKLAvailable() { // BernoulliGenerate expects an array int, so for types smaller than int, the mask buffer // will be too small, so we can;t use MKL in those cases - if (sizeof(DType) >= sizeof(int)) { - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); - Tensor data = in_data[dropout::kData].FlatTo2D(s); - Tensor out = out_data[dropout::kOut].FlatTo2D(s); - DType *outptr = out.dptr_; - DType *dataptr = data.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0] * mask.shape_[1]; - BernoulliGenerate(*pgen, count, pkeep, maskptr); - const float pk_1 = 1.0f / pkeep; + return sizeof(DType) >= sizeof(int); + } + + // MKL forward pass + inline void MKLForward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &out_data) { + Stream *s = ctx.get_stream(); + RandGenerator *pgen = ctx.requested[0].get_parallel_random(); + CHECK_NOTNULL(pgen); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor data = in_data[dropout::kData].FlatTo2D(s); + Tensor out = out_data[dropout::kOut].FlatTo2D(s); + DType *outptr = out.dptr_; + DType *dataptr = data.dptr_; + auto maskptr = reinterpret_cast(mask.dptr_); + int count = mask.shape_[0] * mask.shape_[1]; + BernoulliGenerate(*pgen, count, this->pkeep_, maskptr); + const float pk_1 = 1.0f / this->pkeep_; #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - outptr[i] = dataptr[i] * maskptr[i] * pk_1; - } - return true; + for (int i = 0; i < count; ++i) { + outptr[i] = dataptr[i] * maskptr[i] * pk_1; } - return false; } // MKL backward pass - static bool MSHADOW_CINLINE MKLBackward(mshadow::Stream *s, const double pkeep, - const std::vector &in_grad, - const std::vector &out_data, - const std::vector &out_grad) { - if (sizeof(DType) >= sizeof(int)) { - Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); - Tensor mask = out_data[dropout::kMask].FlatTo2D(s); - Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); - DType *ingradptr = gdata.dptr_; - const DType *outgradptr = grad.dptr_; - auto maskptr = reinterpret_cast(mask.dptr_); - int count = mask.shape_[0] * mask.shape_[1]; - const float pk_1 = 1.0f / pkeep; + inline void MKLBackward(const OpContext &ctx, + const std::vector &in_grad, + const std::vector &out_data, + const std::vector &out_grad) { + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); + Tensor mask = out_data[dropout::kMask].FlatTo2D(s); + Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + DType *ingradptr = gdata.dptr_; + const DType *outgradptr = grad.dptr_; + auto maskptr = reinterpret_cast(mask.dptr_); + int count = mask.shape_[0] * mask.shape_[1]; + const float pk_1 = 1.0f / this->pkeep_; #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int i = 0; i < count; ++i) { - ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; - } - return true; + for (int i = 0; i < count; 
++i) { + ingradptr[i] = outgradptr[i] * maskptr[i] * pk_1; } - return false; } -#ifdef __CUDACC__ - // GPU never uses MKL - static bool MSHADOW_CINLINE MKLForward(mshadow::Stream *s, RandGenerator *pgen, - const double pkeep, - const std::vector &in_data, - const std::vector &out_data) { - return false; - } - static bool MSHADOW_CINLINE MKLBackward(mshadow::Stream *s, const double pkeep, - const std::vector &in_grad, - const std::vector &out_data, - const std::vector &out_grad) { - return false; - } -#endif // __CUDACC__ - -#else // #if defined(USE_MKL) && defined(_OPENMP) - static bool MSHADOW_CINLINE MKLForward(mshadow::Stream *s, RandGenerator *pgen, - const double pkeep, - const std::vector &in_data, - const std::vector &out_data) { - return false; - } - static bool MSHADOW_CINLINE MKLBackward(mshadow::Stream *s, const double pkeep, - const std::vector &in_grad, - const std::vector &out_data, - const std::vector &out_grad) { - return false; - } -#endif // #if defined(USE_MKL) && defined(_OPENMP) +#endif // #if MXNET_USE_MKL_DROPOUT public: /*! @@ -227,52 +203,181 @@ class DropoutOp { } }; - void Init(const DropoutParam ¶m) { + explicit DropoutOp(const DropoutParam ¶m, Context ctx) { this->pkeep_ = 1.0f - param.p; this->mode_ = static_cast(param.mode); this->axes_ = param.axes; + this->dropout_passthrough_ = true; +#if MXNET_USE_CUDNN_DROPOUT + this->cudnn_off_ = param.cudnn_off && param.cudnn_off.value(); + this->ctx_ = ctx; + if (ctx.dev_type == kGPU && this->pkeep_ > 0 && !this->cudnn_off_) { + dtype_ = mshadow::DataType::kCudnnFlag; + CUDNN_CALL(cudnnCreateTensorDescriptor(&x_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&y_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dx_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&dy_desc_)); + CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_)); + } +#endif // MXNET_USE_CUDNN_DROPOUT + } + + ~DropoutOp() { +#if MXNET_USE_CUDNN_DROPOUT + if (this->ctx_.dev_type == kGPU && this->pkeep_ > 0 && !this->cudnn_off_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(x_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(y_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dx_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(dy_desc_)); + CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_)); + } +#endif // MXNET_USE_CUDNN_DROPOUT } +#if MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + inline bool CuDNNAvailable() { + return this->pkeep_ > 0 && !this->cudnn_off_; + } + + inline void CuDNNForward(const OpContext &ctx, + const TBlob &in, + const TBlob &mask, + const TBlob &out) { + Stream *s = ctx.get_stream(); + + // set dropout state. + ctx.requested[0].get_cudnn_dropout_desc(&dropout_desc_, s, 1.0f - this->pkeep_, seed_); + + // describe input/output tensor + int dim[4], stride[4]; + dim[0] = 1; + dim[1] = 1; + dim[2] = 1; + dim[3] = out.Size(); + stride[0] = out.Size(); + stride[1] = out.Size(); + stride[2] = out.Size(); + stride[3] = 1; + CUDNN_CALL(cudnnSetTensorNdDescriptor(x_desc_, + dtype_, + 4, + dim, + stride)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(y_desc_, + dtype_, + 4, + dim, + stride)); + + // perform dropout with cudnn + CUDNN_CALL(cudnnDropoutGetReserveSpaceSize(x_desc_, &dropout_reserve_byte_)); + // cudnn uses bits to record the positions that are dropped, so reserve bytes is always + // 1/8 of input size. 
+ CHECK_GE(mask.Size() * sizeof(DType), dropout_reserve_byte_) << + "The size of the mask space is smaller than the required cudnn reserved space."; + CUDNN_CALL(cudnnDropoutForward(s->dnn_handle_, + dropout_desc_, + x_desc_, + in.dptr(), + y_desc_, + out.dptr(), + mask.dptr(), + dropout_reserve_byte_)); + } + + inline void CuDNNBackward(const OpContext &ctx, + const TBlob &out_grad, + const TBlob &mask, + const TBlob &in_grad) { + Stream *s = ctx.get_stream(); + + // describe input/output tensor + int dim[4], stride[4]; + dim[0] = 1; + dim[1] = 1; + dim[2] = 1; + dim[3] = in_grad.Size(); + stride[0] = in_grad.Size(); + stride[1] = in_grad.Size(); + stride[2] = in_grad.Size(); + stride[3] = 1; + CUDNN_CALL(cudnnSetTensorNdDescriptor(dy_desc_, + dtype_, + 4, + dim, + stride)); + CUDNN_CALL(cudnnSetTensorNdDescriptor(dx_desc_, + dtype_, + 4, + dim, + stride)); + + // perform dropout with cudnn + CUDNN_CALL(cudnnDropoutBackward(s->dnn_handle_, + dropout_desc_, + dy_desc_, + out_grad.dptr(), + dx_desc_, + in_grad.dptr(), + mask.dptr(), + dropout_reserve_byte_)); + } +#endif // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data) { + this->dropout_passthrough_ = true; if (req[dropout::kOut] != kNullOp) { CHECK_EQ(in_data.size(), 1U); if (ctx.is_train) { CHECK_EQ(out_data.size(), 2U); } Stream *s = ctx.get_stream(); + const TBlob &in = in_data[dropout::kData]; const TBlob &out = out_data[dropout::kOut]; - if (ctx.is_train || this->mode_ == dropout::kAlways) { - RandGenerator *pgen = ctx.requested[0].get_parallel_random(); - CHECK_NOTNULL(pgen); - if (this->axes_.ndim() != 0 || !MKLForward(s, pgen, this->pkeep_, in_data, out_data)) { - const TBlob &mask = out_data[dropout::kMask]; + const TBlob &mask = out_data[dropout::kMask]; + if (this->pkeep_ < 1 && (ctx.is_train || this->mode_ == dropout::kAlways)) { + this->dropout_passthrough_ = false; + if (this->axes_.ndim() == 0) { +#if MXNET_USE_MKL_DROPOUT + if (MKLAvailable()) { + MKLForward(ctx, in_data, out_data); + return; + } +#endif // MXNET_USE_MKL_DROPOUT +#if MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + if (CuDNNAvailable()) { + CuDNNForward(ctx, in, mask, out); + return; + } +#endif // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + RandGenerator *pgen = ctx.requested[0].get_parallel_random(); + CHECK_NOTNULL(pgen); CHECK(req[dropout::kOut] != kAddTo); - if (this->axes_.ndim() == 0) { - // standard case for dropout - LaunchRNG(s, pgen, out.Size(), + LaunchRNG(s, pgen, out.Size(), out.dptr(), mask.dptr(), - in_data[dropout::kData].dptr(), + in.dptr(), this->pkeep_); - return; - } - + return; + } else { + RandGenerator *pgen = ctx.requested[0].get_parallel_random(); + CHECK_NOTNULL(pgen); // initialize the mask LaunchRNG(s, pgen, mask.Size(), mask.dptr(), this->pkeep_); // broadcast mul TShape new_lshape, new_rshape, new_oshape; - int ndim = BinaryBroadcastShapeCompact(in_data[dropout::kData].shape_, + int ndim = BinaryBroadcastShapeCompact(in.shape_, mask.shape_, out.shape_, &new_lshape, &new_rshape, &new_oshape); if (!ndim) { MXNET_ASSIGN_REQ_SWITCH(req[dropout::kOut], Req, { mxnet_op::Kernel, xpu>::Launch( - s, out.Size(), out.dptr(), in_data[dropout::kData].dptr(), + s, out.Size(), out.dptr(), in.dptr(), mask.dptr()); }); } else { @@ -284,21 +389,16 @@ class DropoutOp { mshadow_op::mul>, xpu>:: template LaunchEx(s, new_oshape.Size(), req[dropout::kOut], lstride, rstride, oshape, - in_data[dropout::kData].dptr(), + in.dptr(), 
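The rewritten Forward path encodes a fixed backend preference: pass through unless dropout is actually active, then prefer MKL, then cuDNN, then the native RNG kernel, with shared-axes dropout always taking the native broadcast path. A standalone restatement of that selection order (hypothetical helper; ignores mode=always for brevity; not part of the patch):

#include <iostream>
#include <string>

std::string PickDropoutBackend(bool is_train, float pkeep, bool has_axes,
                               bool mkl_ok, bool cudnn_ok) {
  if (pkeep >= 1.0f || !is_train) return "passthrough (identity copy)";
  if (has_axes) return "native RNG + broadcast multiply";
  if (mkl_ok) return "MKL";
  if (cudnn_ok) return "cuDNN";
  return "native RNG kernel";
}

int main() {
  std::cout << PickDropoutBackend(true, 0.5f, false, false, true) << '\n';  // cuDNN
  std::cout << PickDropoutBackend(false, 0.5f, false, true, true) << '\n';  // passthrough
}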
mask.dptr(), out.dptr()); }); } } } else { - const TBlob& data = in_data[dropout::kData]; - if (req[dropout::kOut] == kWriteTo) { - mxnet_op::copy(s, out, data); - } else { - MXNET_ASSIGN_REQ_SWITCH(req[dropout::kOut], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, out.Size(), out.dptr(), data.dptr()); - }); - } + MXNET_ASSIGN_REQ_SWITCH(req[dropout::kOut], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, out.Size(), out.dptr(), in.dptr()); + }); } } } @@ -311,20 +411,32 @@ class DropoutOp { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - if (ctx.is_train || mode_ == dropout::kAlways) { - if (this->axes_.ndim() != 0 || !MKLBackward(s, this->pkeep_, in_grad, out_data, out_grad)) { - const TBlob &gdata = in_grad[dropout::kData]; - const TBlob &grad = out_grad[dropout::kOut]; - const TBlob &mask = out_data[dropout::kMask]; - if (this->axes_.ndim() == 0) { - // standard case for dropout - CHECK_EQ(grad.Size(), mask.Size()); - MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, gdata.Size(), gdata.dptr(), grad.dptr(), mask.dptr()); - }); + if (!this->dropout_passthrough_) { + this->dropout_passthrough_ = true; + const TBlob &gdata = in_grad[dropout::kData]; + const TBlob &grad = out_grad[dropout::kOut]; + const TBlob &mask = out_data[dropout::kMask]; + if (this->axes_.ndim() == 0) { +#if MXNET_USE_MKL_DROPOUT + if (MKLAvailable()) { + MKLBackward(ctx, in_grad, out_data, out_grad); + return; + } +#endif // MXNET_USE_MKL_DROPOUT +#if MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + if (CuDNNAvailable()) { + CuDNNBackward(ctx, grad, mask, gdata); return; } +#endif // MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__) + // standard case for dropout + CHECK_EQ(grad.Size(), mask.Size()); + MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, gdata.Size(), gdata.dptr(), grad.dptr(), mask.dptr()); + }); + return; + } else { // broadcast mul TShape new_lshape, new_rshape, new_oshape; int ndim = BinaryBroadcastShapeCompact(grad.shape_, @@ -350,14 +462,10 @@ class DropoutOp { } else { const TBlob& gdata = in_grad[dropout::kData]; const TBlob& grad = out_grad[dropout::kOut]; - if (req[dropout::kData] == kWriteTo) { - mxnet_op::copy(s, gdata, grad); - } else { - MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, gdata.Size(), gdata.dptr(), grad.dptr()); - }); - } + MXNET_ASSIGN_REQ_SWITCH(req[dropout::kData], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, gdata.Size(), gdata.dptr(), grad.dptr()); + }); } } @@ -366,30 +474,57 @@ class DropoutOp { real_t pkeep_; /*! \brief Dropout mode */ dropout::DropoutOpMode mode_; + /*! \brief Axes on which dropout mask is shared in the form of broadcast multiply */ TShape axes_; + /*!
\brief Flag to record whether forward is executed in pass-through mode */ + bool dropout_passthrough_; +#if MXNET_USE_CUDNN_DROPOUT + bool cudnn_off_; + Context ctx_; + cudnnDataType_t dtype_; + cudnnDropoutDescriptor_t dropout_desc_; + uint64_t seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) + size_t dropout_reserve_byte_; + cudnnTensorDescriptor_t x_desc_, y_desc_, dx_desc_, dy_desc_; +#endif // MXNET_USE_CUDNN_DROPOUT }; // class DropoutOp +static OpStatePtr CreateDropoutState(const nnvm::NodeAttrs &attrs, + const Context ctx, + const std::vector &in_shapes, + const std::vector &in_types) { + const DropoutParam& param = nnvm::get(attrs.parsed); + OpStatePtr state; + MSHADOW_REAL_TYPE_SWITCH(in_types[dropout::kData], DType, { + if (ctx.dev_type == kGPU) { + state = OpStatePtr::Create>(param, ctx); + } else { + state = OpStatePtr::Create>(param, ctx); + } + return state; + }); + LOG(FATAL) << "should never reach here"; + return OpStatePtr(); // should never reach here +} + template -void DropoutCompute(const nnvm::NodeAttrs& attrs, +void DropoutCompute(const OpStatePtr& state, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const DropoutParam& param = nnvm::get(attrs.parsed); MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - DropoutOp op; - op.Init(param); + DropoutOp& op = state.get_state>(); op.Forward(ctx, inputs, req, outputs); }); } template -void DropoutGradCompute(const nnvm::NodeAttrs& attrs, +void DropoutGradCompute(const OpStatePtr& state, const OpContext& ctx, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - const DropoutParam& param = nnvm::get(attrs.parsed); CHECK_EQ(inputs.size(), 2U); CHECK_EQ(outputs.size(), 1); CHECK_EQ(req.size(), 1); @@ -399,12 +534,13 @@ void DropoutGradCompute(const nnvm::NodeAttrs& attrs, out_data[dropout::kMask] = inputs[1]; MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - DropoutOp op; - op.Init(param); + DropoutOp& op = state.get_state>(); op.Backward(ctx, out_grads, out_data, req, outputs); }); } } // namespace op } // namespace mxnet + +#undef MXNET_USE_MKL_DROPOUT #endif // MXNET_OPERATOR_NN_DROPOUT_INL_H_ diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index 3021e0105..d6cbeb4e5 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -119,25 +119,43 @@ Example:: for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); return true; }) -.set_attr("FCompute", DropoutCompute) +.set_attr("FCreateOpState", CreateDropoutState) +.set_attr("FStatefulCompute", DropoutCompute) .set_attr("FGradient", DropoutGrad{"_backward_Dropout"}) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ ResourceRequest::kParallelRandom }; -}) +.set_attr("FResourceRequestEx", + [](const NodeAttrs& attrs, const int dev_mask, const DispatchMode dispatch_mode) { + std::vector request; + const DropoutParam& param = nnvm::get(attrs.parsed); + if (param.p == 0) return request; + if (dev_mask == kGPU) { +#if MXNET_USE_CUDNN_DROPOUT + // if cudnn is used, parallel random is not needed. 
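CreateDropoutState above switches Dropout to the stateful-operator interface, so resources such as the cuDNN descriptors created in the constructor survive across invocations instead of being rebuilt on every call. A schematic sketch of the lifecycle this buys (stand-in class, not the real MXNet API):

#include <iostream>
#include <memory>

// One instance per graph node, reused for every forward/backward call:
// expensive setup runs once in the constructor, teardown once in the destructor.
class StatefulDropout {
 public:
  explicit StatefulDropout(float pkeep) : pkeep_(pkeep) {
    std::cout << "allocate descriptors once (pkeep=" << pkeep_ << ")\n";
  }
  ~StatefulDropout() { std::cout << "free descriptors once\n"; }
  void Forward() { std::cout << "forward reuses cached state\n"; }

 private:
  float pkeep_;
};

int main() {
  auto state = std::make_shared<StatefulDropout>(0.5f);  // like FCreateOpState
  state->Forward();                                      // like FStatefulCompute
  state->Forward();                                      // same instance reused
}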
+ if (1.0f - param.p > 0 + && !(param.cudnn_off && param.cudnn_off.value()) + && param.axes.ndim() == 0) { + request.emplace_back(ResourceRequest::kCuDNNDropoutDesc); + return request; + } +#endif + } + request.emplace_back(ResourceRequest::kParallelRandom); + return request; + }) .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") .add_arguments(DropoutParam::__FIELDS__()); NNVM_REGISTER_OP(_backward_Dropout) .set_num_outputs(1) +.set_attr("TIsLayerOpBackward", true) .set_attr("TIsBackward", true) .set_attr_parser(ParamParser) .set_attr("FInplaceOption", [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) -.set_attr("FCompute", DropoutGradCompute); +.set_attr("FStatefulCompute", DropoutGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 832490b08..20c5714dd 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,10 +30,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(Dropout) -.set_attr("FCompute", DropoutCompute); +.set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) -.set_attr("FCompute", DropoutGradCompute); +.set_attr("FStatefulCompute", DropoutGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h index 18ef3f3e7..f770c4aba 100644 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -190,6 +190,9 @@ static int GetTypeSize(int dtype) { } static inline size_t GetArraySize(const NDArray &arr) { + if (arr.IsMKLDNNData()) { + return arr.GetMKLDNNData()->get_primitive_desc().get_size(); + } return arr.shape().Size() * GetTypeSize(arr.dtype()); } @@ -238,26 +241,25 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) { return ret; } -inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) { +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = -1) { + int ndim = arr.shape().ndim(); mkldnn::memory::dims dims(ndim); + dtype = (dtype == -1) ? arr.dtype() : dtype; for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; - return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), - mkldnn::memory::format::any}; -} - -inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { - return GetMemDesc(arr, arr.shape().ndim()); + return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format::any}; } inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, - int num_groups) { - auto ndim = arr.shape().ndim(); - mkldnn::memory::dims tz = mkldnn::memory::dims{0}; + int num_groups, + bool quantized = false) { + int dtype = quantized ? 
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 18ef3f3e7..f770c4aba 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -190,6 +190,9 @@ static int GetTypeSize(int dtype) {
 }

 static inline size_t GetArraySize(const NDArray &arr) {
+  if (arr.IsMKLDNNData()) {
+    return arr.GetMKLDNNData()->get_primitive_desc().get_size();
+  }
   return arr.shape().Size() * GetTypeSize(arr.dtype());
 }

@@ -238,26 +241,25 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
   return ret;
 }

-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) {
+inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = -1) {
+  int ndim = arr.shape().ndim();
   mkldnn::memory::dims dims(ndim);
+  dtype = (dtype == -1) ? arr.dtype() : dtype;
   for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
-  return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()),
-                              mkldnn::memory::format::any};
-}
-
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) {
-  return GetMemDesc(arr, arr.shape().ndim());
+  return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format::any};
 }

 inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
-                                                 int num_groups) {
-  auto ndim = arr.shape().ndim();
-  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
+                                                 int num_groups,
+                                                 bool quantized = false) {
+  int dtype = quantized ? mshadow::kInt8 : arr.dtype();
   if (num_groups == 1) {
-    return GetMemDesc(arr);
+    return GetMemDesc(arr, dtype);
   } else {
+    auto ndim = arr.shape().ndim();
     CHECK((ndim == 3) || (ndim == 4))
         << "MKL-DNN weight currently supports 3d and 4d layout";
+    auto tz = mkldnn::memory::dims{0};
     const int N = 0, H = 2, W = 3, C = 1;
     if (ndim == 3) {
       tz = mkldnn::memory::dims{
@@ -269,8 +271,7 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
           static_cast<int>(arr.shape()[C]), static_cast<int>(arr.shape()[H]),
           static_cast<int>(arr.shape()[W])};
     }
-    return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()),
-                                mkldnn::memory::format::any};
+    return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format::any};
   }
 }

@@ -447,6 +448,8 @@ static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
   }
 }

+const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups);
+
 const mkldnn::memory *GetWeights(const NDArray &arr,
                                  const mkldnn::memory::primitive_desc &target_pd,
                                  int num_groups);
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index ccb9d7ec0..d40c40668 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -229,61 +229,49 @@ void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
   }
 }

-const mkldnn::memory *GetWeights(const NDArray &arr,
-                                 const mkldnn::memory::primitive_desc &target_pd,
-                                 int num_groups) {
-  const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd);
-  // If the weight array already uses the target layout, simply return it
-  // directly.
-  if (mem)
-    return mem;
-
-  mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype());
-  mkldnn::memory::dims tz = mkldnn::memory::dims{0};
-  mkldnn::memory::format format = mkldnn::memory::format::format_undef;
+const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
+  const auto type = get_mkldnn_type(arr.dtype());
+  auto tz = mkldnn::memory::dims{0};
+  auto format = mkldnn::memory::format::format_undef;
   auto engine = CpuEngine::Get()->get_engine();
   const int O = 0, I = 1, H = 2, W = 3;
   if (arr.shape().ndim() == 2) {
-    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
-                              static_cast<int>(arr.shape()[I])};
+    tz = mkldnn::memory::dims{static_cast<int>(arr.shape()[O]), static_cast<int>(arr.shape()[I])};
     format = mkldnn::memory::format::oi;
   } else if (arr.shape().ndim() == 3) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    static_cast<int>(arr.shape()[O] /
-                                                     num_groups),
+             ? mkldnn::memory::dims{num_groups, static_cast<int>(arr.shape()[O] / num_groups),
                                     static_cast<int>(arr.shape()[I]),
                                     static_cast<int>(arr.shape()[H])}
             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
                                    static_cast<int>(arr.shape()[I]),
                                    static_cast<int>(arr.shape()[H])};
-    format = num_groups > 1 ? mkldnn::memory::format::goiw
-                            : mkldnn::memory::format::oiw;
+    format = num_groups > 1 ? mkldnn::memory::format::goiw : mkldnn::memory::format::oiw;
   } else if (arr.shape().ndim() == 4) {
     tz = num_groups > 1
-             ? mkldnn::memory::dims{num_groups,
-                                    static_cast<int>(arr.shape()[O] /
-                                                     num_groups),
+             ? mkldnn::memory::dims{num_groups, static_cast<int>(arr.shape()[O] / num_groups),
                                     static_cast<int>(arr.shape()[I]),
                                     static_cast<int>(arr.shape()[H]),
                                     static_cast<int>(arr.shape()[W])}
-             : mkldnn::memory::dims{static_cast<int>(arr.shape()[O]),
-                                    static_cast<int>(arr.shape()[I]),
-                                    static_cast<int>(arr.shape()[H]),
-                                    static_cast<int>(arr.shape()[W])};
-    format = num_groups > 1 ?
mkldnn::memory::format::goihw - : mkldnn::memory::format::oihw; + : mkldnn::memory::dims{ + static_cast(arr.shape()[O]), static_cast(arr.shape()[I]), + static_cast(arr.shape()[H]), static_cast(arr.shape()[W])}; + format = num_groups > 1 ? mkldnn::memory::format::goihw : mkldnn::memory::format::oihw; } else { LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return nullptr; } - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, format}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - if (mem == nullptr) - mem = arr.GetMKLDNNDataReorder(target_pd); + const auto md = mkldnn::memory::desc{tz, type, format}; + const auto pd = mkldnn::memory::primitive_desc{md, engine}; + return arr.GetMKLDNNData(pd); +} + +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, int num_groups) { + const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd); + // If the weight array already uses the target layout, simply return it directly. + if (mem) return mem; + mem = GetWeights(arr, num_groups); + if (mem == nullptr) mem = arr.GetMKLDNNDataReorder(target_pd); if (mem->get_primitive_desc() == target_pd) return mem; auto ret = TmpMemMgr::Get()->Alloc(target_pd); @@ -350,6 +338,7 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { case mkldnn_oIhw8i: case mkldnn_oIhw16i: case mkldnn_OIhw8i8o: + case mkldnn_hwio_s8s8: case mkldnn_OIhw16i16o: case mkldnn_OIhw4i16o4i: case mkldnn_OIhw4i16o4i_s8s8: @@ -384,9 +373,11 @@ mkldnn_memory_format_t GetDefaultFormat(const mkldnn::memory::desc &desc) { switch (desc.data.format) { case mkldnn_goihw: case mkldnn_hwigo: + case mkldnn_hwigo_s8s8: case mkldnn_gOIhw8i8o: case mkldnn_gOIhw16i16o: case mkldnn_gOIhw4i16o4i: + case mkldnn_gOIhw4i16o4i_s8s8: case mkldnn_gOIhw8i16o2i: case mkldnn_gOIhw8o16i2o: case mkldnn_gOIhw8o8i: diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h index 971c66ad9..ab6650ead 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h +++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h @@ -42,7 +42,6 @@ struct MKLDNNConvParam : public dmlc::Parameter { bool with_sum; bool with_postsum_relu; bool quantized; - bool weight_channelwise_scale; dmlc::optional min_calib_range; // min float value calculated from calibration dataset dmlc::optional max_calib_range; // max float value calculated from calibration dataset @@ -58,8 +57,6 @@ struct MKLDNNConvParam : public dmlc::Parameter { .describe("Add post relu after sum"); DMLC_DECLARE_FIELD(quantized).set_default(false) .describe("enable quantization"); - DMLC_DECLARE_FIELD(weight_channelwise_scale).set_default(true) - .describe("Quantize weight with channel wise scales."); DMLC_DECLARE_FIELD(min_calib_range) .set_default(dmlc::optional()) .describe("The minimum scalar value in the form of float32 obtained " @@ -85,23 +82,28 @@ static inline bool IsOutputUInt8(const MKLDNNConvParam &mkldnn_param) { mkldnn_param.with_postsum_relu; } -mkldnn::convolution_forward::primitive_desc -GetConvFwdImpl(const MKLDNNConvFullParam ¶m, const bool is_train, - const NDArray &data, const NDArray &weights, const NDArray *bias, - const NDArray &output); +mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(const MKLDNNConvFullParam ¶m, + const bool is_train, + const NDArray &data, + const NDArray &weights, + const NDArray *bias, + const NDArray &output); class MKLDNNConvForward { public: 
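  // Typical call sequence for this class, per the callers in
  // mkldnn_convolution.cc (a sketch, not patch code): the primitive descriptor
  // fwd_pd is built once per op signature and cached by GetConvFwd; after that,
  // each invocation only swaps data handles and re-submits the cached primitive:
  //   MKLDNNConvForward &fwd = GetConvFwd(param, ctx.is_train, data, weights, bias, out);
  //   fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem);
  //   MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd());
  //   MKLDNNStream::Get()->Submit();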
mkldnn::convolution_forward::primitive_desc fwd_pd; - MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) - : fwd_pd(GetConvFwdImpl(param, is_train, data, weights, bias, output)) {} + MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output); void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight, const mkldnn::memory *bias, const mkldnn::memory &output); + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) { + this->data_->set_data_handle(data.get_data_handle()); + this->out_->set_data_handle(output.get_data_handle()); + } + const mkldnn::convolution_forward &GetFwd() const { return *fwd_; } diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc index 7f423ce45..a3aca98d9 100644 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -45,15 +45,21 @@ bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) { (input.shape().ndim() == 4)); } -mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( - const MKLDNNConvFullParam ¶m, const bool is_train, - const NDArray &data, const NDArray &weights, const NDArray *bias, - const NDArray &output) { +mkldnn::convolution_forward::primitive_desc GetConvFwdImpl(const MKLDNNConvFullParam ¶m, + const bool is_train, const NDArray &data, + const NDArray &weights, + const NDArray *bias, + const NDArray &output) { auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.conv_param.num_group); + auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized); auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); + auto bias_md = + bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias)) + : mkldnn::memory::desc{ + {}, mkldnn::memory::data_type::data_undef, mkldnn::memory::format::any}; + auto bias_md_ptr = bias ? &bias_md : nullptr; + mkldnn::memory::dims strides(param.conv_param.kernel.ndim()); mkldnn::memory::dims padding(param.conv_param.kernel.ndim()); if (param.conv_param.kernel.ndim() == 1) { @@ -77,55 +83,61 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( mkldnn::primitive_attr attr; mkldnn::post_ops ops; if (param.mkldnn_param.with_relu) { - float scale = 1.0f; // for fp32, scale is 1. - float alpha = 0.0f; // negative slope for mkldnn_eltwise_relu. - float beta = 1.0f; // ignored for mkldnn_eltwise_relu. + float scale = 1.0f; // for fp32, scale is 1. + float alpha = 0.0f; // negative slope for mkldnn_eltwise_relu. + float beta = 1.0f; // ignored for mkldnn_eltwise_relu. ops.append_eltwise(scale, eltwise_relu, alpha, beta); } if (param.mkldnn_param.with_sum) { ops.append_sum(param.sum_scale); } if (param.mkldnn_param.with_postsum_relu) { - float scale = 1.0f; // for fp32, scale is 1. - float alpha = 0.0f; // negative slope for mkldnn_eltwise_relu. - float beta = 1.0f; // ignored for mkldnn_eltwise_relu. + float scale = 1.0f; // for fp32, scale is 1. + float alpha = 0.0f; // negative slope for mkldnn_eltwise_relu. + float beta = 1.0f; // ignored for mkldnn_eltwise_relu. 
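      // What these flags assemble (sketch): a post-op chain attached to the
      // primitive attributes, so ReLU and the accumulating sum execute inside
      // the convolution kernel instead of as separate elementwise passes:
      //   mkldnn::post_ops ops;
      //   ops.append_eltwise(scale, eltwise_relu, alpha, beta);  // fused ReLU
      //   ops.append_sum(param.sum_scale);                       // out += previous out
      //   attr.set_post_ops(ops);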
ops.append_eltwise(scale, eltwise_relu, alpha, beta); } attr.set_post_ops(ops); if (param.mkldnn_param.quantized && param.requantize_scales.size()) { - int mask = param.mkldnn_param.weight_channelwise_scale ? 2 : 0; + int mask = (param.requantize_scales.size() > 1) ? 2 : 0; attr.set_output_scales(mask, param.requantize_scales); attr.set_int_output_round_mode(round_nearest); } - - // MKL-DNN introduced padded formats since 0.15 which require more memory - // for computation compared with the actual tensor size. Currently, MKL-DNN - // operators are still reusing those memory from memory planning and the - // memory size may smaller than what MKL-DNN kernels require. So here we need - // select suboptimal kernel for computation according to tensor sizes. - if (param.conv_param.dilate.ndim() == 0 && bias == nullptr) { - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); - auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, engine); - while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) || - conv_pd.src_primitive_desc().get_size() != GetArraySize(data) || - conv_pd.weights_primitive_desc().get_size() != GetArraySize(weights)) { - CHECK(conv_pd.next_impl()) << "No implementation"; + auto GetConvFwdPd = [¶m, &data, &weights, &output, + &attr](const mkldnn::convolution_forward::desc &desc) { + auto engine = CpuEngine::Get()->get_engine(); + try { + auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, engine); + while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) || + conv_pd.src_primitive_desc().get_size() != GetArraySize(data) || + (!param.mkldnn_param.quantized && + conv_pd.weights_primitive_desc().get_size() != GetArraySize(weights))) { + // next_impl() will visit desc and engine, please make sure they are still alive here. 
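        // Rationale carried over from the comment this patch deletes: MKL-DNN
        // 0.15+ introduced padded formats that can require more memory than the
        // tensors' nominal sizes, while MXNet's memory planning still hands the
        // kernels exactly GetArraySize() bytes. The loop below therefore steps
        // through candidate implementations until the src/dst (and, for
        // non-quantized convolutions, weight) sizes match the planned arrays.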
+ CHECK(conv_pd.next_impl()) << "No convolution implementation for this request."; + } + return conv_pd; + } catch (mkldnn::error &e) { + if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) { + LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is " + "required for int8 convolution"; + } else { + LOG(ERROR) << e.message; + } + throw; } - return conv_pd; + }; + + if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, + weight_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return GetConvFwdPd(desc); } else if (param.conv_param.dilate.ndim() == 0) { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, padding, padding, - mkldnn::padding_kind::zero); - auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, engine); - while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) || - conv_pd.src_primitive_desc().get_size() != GetArraySize(data) || - conv_pd.weights_primitive_desc().get_size() != GetArraySize(weights)) { - CHECK(conv_pd.next_impl()) << "No implementation"; - } - return conv_pd; + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, + weight_md, *bias_md_ptr, out_md, strides, padding, + padding, mkldnn::padding_kind::zero); + return GetConvFwdPd(desc); } else { mkldnn::memory::dims dilates(param.conv_param.kernel.ndim()); if (param.conv_param.dilate.ndim() == 1) { @@ -134,34 +146,19 @@ mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( dilates[0] = param.conv_param.dilate[0] - 1; dilates[1] = param.conv_param.dilate[1] - 1; } else { - LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " - << param.conv_param.dilate.ndim() + LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.conv_param.dilate.ndim() << ", supporting only 1 or 2."; } - if (bias == nullptr) { - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); - auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, engine); - while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) || - conv_pd.src_primitive_desc().get_size() != GetArraySize(data) || - conv_pd.weights_primitive_desc().get_size() != GetArraySize(weights)) { - CHECK(conv_pd.next_impl()) << "No implementation"; - } - return conv_pd; - } else { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, - dilates, padding, padding, + if (bias_md_ptr == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, + weight_md, out_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); - auto conv_pd = mkldnn::convolution_forward::primitive_desc(desc, attr, engine); - while (conv_pd.dst_primitive_desc().get_size() != GetArraySize(output) || - conv_pd.src_primitive_desc().get_size() != GetArraySize(data) || - conv_pd.weights_primitive_desc().get_size() != GetArraySize(weights)) { - CHECK(conv_pd.next_impl()) << "No implementation"; - } - return conv_pd; + return GetConvFwdPd(desc); + } else { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md, + 
weight_md, *bias_md_ptr, out_md, strides, dilates, + padding, padding, mkldnn::padding_kind::zero); + return GetConvFwdPd(desc); } } } @@ -328,48 +325,31 @@ static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( } } -void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data, - const mkldnn::memory &weight, - const mkldnn::memory *bias, - const mkldnn::memory &output) { - if (this->data_ == nullptr) - this->data_ = std::shared_ptr(new mkldnn::memory( - fwd_pd.src_primitive_desc(), data.get_data_handle())); - else - this->data_->set_data_handle(data.get_data_handle()); - - if (this->weight_ == nullptr) - this->weight_ = std::shared_ptr(new mkldnn::memory( - fwd_pd.weights_primitive_desc(), weight.get_data_handle())); - else - this->weight_->set_data_handle(weight.get_data_handle()); - - if (this->out_ == nullptr) - this->out_ = std::shared_ptr(new mkldnn::memory( - fwd_pd.dst_primitive_desc(), output.get_data_handle())); - else - this->out_->set_data_handle(output.get_data_handle()); - - if (bias != nullptr) { - if (this->bias_ == nullptr) - this->bias_ = std::shared_ptr(new mkldnn::memory( - fwd_pd.bias_primitive_desc(), bias->get_data_handle())); - else - this->bias_->set_data_handle(bias->get_data_handle()); - if (this->fwd_ == nullptr) - this->fwd_ = std::shared_ptr( - new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_), - mkldnn::primitive::at(*this->weight_), - mkldnn::primitive::at(*this->bias_), - *this->out_)); - } else if (this->fwd_ == nullptr) { - this->fwd_ = std::shared_ptr( - new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data_), - mkldnn::primitive::at(*this->weight_), - *this->out_)); +MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) + : fwd_pd(GetConvFwdImpl(param, is_train, data, weights, bias, output)) { + data_ = std::make_shared(fwd_pd.src_primitive_desc(), nullptr); + weight_ = std::make_shared(fwd_pd.weights_primitive_desc(), nullptr); + out_ = std::make_shared(fwd_pd.dst_primitive_desc(), nullptr); + if (bias) { + bias_ = std::make_shared(fwd_pd.bias_primitive_desc(), nullptr); + fwd_ = std::make_shared(fwd_pd, *this->data_, *this->weight_, + *this->bias_, *this->out_); + } else { + fwd_ = std::make_shared(fwd_pd, *this->data_, *this->weight_, + *this->out_); } } +void MKLDNNConvForward::SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight, + const mkldnn::memory *bias, const mkldnn::memory &output) { + data_->set_data_handle(data.get_data_handle()); + weight_->set_data_handle(weight.get_data_handle()); + out_->set_data_handle(output.get_data_handle()); + if (bias != nullptr) bias_->set_data_handle(bias->get_data_handle()); +} + MKLDNNConvForward &GetConvFwd(const ConvolutionParam ¶m, const bool is_train, const NDArray &data, const NDArray &weights, const NDArray *bias, diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h new file mode 100644 index 000000000..f41db01a9 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_slice-inl.h + * \brief + * \author Zhiyuan Huang +*/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ + +#if MXNET_USE_MKLDNN == 1 + +#include +#include +#include +#include +#include "../../operator_common.h" +#include "../../tensor/slice-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +class MKLDNNSliceFwd { + public: + MKLDNNSliceFwd(const SliceParam ¶m, + const NDArray &in, + const NDArray &out); + void SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output); + const mkldnn::reorder &GetPd() const; + + private: + std::shared_ptr data_; + std::shared_ptr out_; + std::shared_ptr fwd_; +}; + +typedef ParamOpSign MKLDNNSliceSignature; +MKLDNNSliceFwd &GetSliceForward(const SliceParam ¶m, const bool is_train, + const NDArray &in_data, const NDArray &out_data); + +void MKLDNNSlice(const SliceParam ¶m, const OpContext& ctx, + const NDArray &in, OpReqType req, const NDArray &out); + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_SLICE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_slice.cc b/src/operator/nn/mkldnn/mkldnn_slice.cc new file mode 100644 index 000000000..f3c8a14e0 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_slice.cc @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_slice.cc + * \brief + * \author Zhiyuan Huang +*/ + +#if MXNET_USE_MKLDNN == 1 + +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" +#include "./mkldnn_slice-inl.h" + +namespace mxnet { +namespace op { + +MKLDNNSliceFwd::MKLDNNSliceFwd(const SliceParam ¶m, + const NDArray &in, + const NDArray &out) { + const TShape ishape = in.shape(); + const TShape oshape = out.shape(); + uint32_t N = ishape.ndim(); + mkldnn::memory::dims dims(N); + mkldnn::memory::dims offsets(N); + for (uint32_t i = 0; i < N; ++i) { + int s = 0; + if (param.begin[i]) { + s = *param.begin[i]; + if (s < 0) s += ishape[i]; + } + dims[i] = oshape[i]; + offsets[i] = s; + } + auto in_mem_pd = in.GetMKLDNNData()->get_primitive_desc(); + auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc(); + auto view_pd = mkldnn::view::primitive_desc(in_mem_pd, dims, offsets); + auto reorder_pd = reorder::primitive_desc(view_pd.dst_primitive_desc(), out_mem_pd); + this->data_ = std::make_shared(view_pd.dst_primitive_desc(), nullptr); + this->out_ = std::make_shared(view_pd.dst_primitive_desc(), nullptr); + this->fwd_ = std::make_shared(reorder_pd, *this->data_, *this->out_); +} + +void MKLDNNSliceFwd::SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output) { + this->data_->set_data_handle(input.get_data_handle()); + this->out_->set_data_handle(output.get_data_handle()); +} + +const mkldnn::reorder &MKLDNNSliceFwd::GetPd() const { + return *fwd_; +} + +MKLDNNSliceFwd &GetSliceForward(const SliceParam ¶m, const bool is_train, + const NDArray &in_data, const NDArray &out_data) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local std::unordered_map fwds; +#else + static MX_THREAD_LOCAL std::unordered_map fwds; +#endif + MKLDNNSliceSignature key(param); + key.AddSign(is_train); + key.AddSign(in_data); + key.AddSign(out_data); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNSliceFwd fwd(param, in_data, out_data); + it = AddToCache(&fwds, key, fwd); + } + return it->second; +} + +void MKLDNNSlice(const SliceParam ¶m, const OpContext& ctx, + const NDArray &in, OpReqType req, const NDArray &out) { + MKLDNNSliceFwd &fwd = GetSliceForward(param, ctx.is_train, in, out); + auto in_mem = in.GetMKLDNNData(); + auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc(); + auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req); + fwd.SetNewMem(*in_mem, *out_mem.second); + MKLDNNStream::Get()->RegisterPrim(fwd.GetPd()); + CommitOutput(out, out_mem); + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc index 2018e80cb..56d35b23b 100644 --- a/src/operator/operator_tune.cc +++ b/src/operator/operator_tune.cc @@ -234,9 +234,11 @@ IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log2); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log2_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::log10); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::log10_grad); // NOLINT() -IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sin); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::erf); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::erf_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::erfinv); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::erfinv_grad); // NOLINT() +IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sin); // NOLINT() 
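Stepping back to the MKL-DNN slice kernel introduced above (mkldnn_slice.cc): the whole forward pass is a view plus a reorder. A condensed sketch of what MKLDNNSliceFwd's constructor sets up, with member bookkeeping omitted:

// dims/offsets describe the output window inside the input memory.
auto view_pd    = mkldnn::view::primitive_desc(in_mem_pd, dims, offsets);
auto reorder_pd = mkldnn::reorder::primitive_desc(view_pd.dst_primitive_desc(),
                                                  out_mem_pd);
// Executing mkldnn::reorder(reorder_pd, in_mem, out_mem) then copies exactly
// the sliced window into the output buffer, reordering the layout if needed.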
IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sin_grad); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_FWD(mxnet::op::mshadow_op::sinh); // NOLINT() IMPLEMENT_UNARY_WORKLOAD_BWD(mxnet::op::mshadow_op::sinh_grad); // NOLINT() diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 9251b8614..223a1aa6c 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -82,6 +82,301 @@ struct SGDParam : public dmlc::Parameter { } }; +struct MultiSGDParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +struct MultiSGDMomParam : public dmlc::Parameter { + nnvm::Tuple lrs; + nnvm::Tuple wds; + float momentum; + float rescale_grad; + float clip_gradient; + int num_weights; + DMLC_DECLARE_PARAMETER(MultiSGDMomParam) { + DMLC_DECLARE_FIELD(lrs) + .describe("Learning rates."); + DMLC_DECLARE_FIELD(wds) + .describe("Weight decay augments the objective function with a " + "regularization term that penalizes large weights. " + "The penalty scales with the square of the magnitude of each weight."); + DMLC_DECLARE_FIELD(momentum) + .set_default(0.0f) + .describe("The decay rate of momentum estimates at each epoch."); + DMLC_DECLARE_FIELD(rescale_grad) + .set_default(1.0f) + .describe("Rescale gradient to grad = rescale_grad*grad."); + DMLC_DECLARE_FIELD(clip_gradient) + .set_default(-1.0f) + .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] " + "If clip_gradient <= 0, gradient clipping is turned off. " + "grad = max(min(grad, clip_gradient), -clip_gradient)."); + DMLC_DECLARE_FIELD(num_weights) + .set_default(1) + .describe("Number of updated weights."); + } +}; + +template +inline bool MultiSGDShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_shapes = *in_attrs; + auto& output_shapes = *out_attrs; + // Learning rates + CHECK_EQ(param.lrs.ndim(), param.num_weights) + << "Number of learning rates is inconsistent with num_weights " + << "parameter passed. Expected number of learning rates: " + << param.num_weights << ", and got " << param.lrs.ndim(); + // Weight decays + CHECK_EQ(param.wds.ndim(), param.num_weights) + << "Number of weight decays is inconsistent with num_weights " + << "parameter passed. 
Expected number of weight decays: " + << param.num_weights << ", and got " << param.wds.ndim(); + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_shapes[i]}); + for (int j = 0; j < input_stride; ++j) { + input_vec.push_back(input_shapes[i * input_stride + j]); + } + all_inferred = all_inferred && ElemwiseShape(attrs, &input_vec, &output_vec); + } + return all_inferred; +} + +template +inline bool MP_MultiSGD_InferType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + const ParamType& param = dmlc::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), input_stride * param.num_weights); + CHECK_EQ(out_attrs->size(), param.num_weights); + + bool all_inferred = true; + auto& input_types = *in_attrs; + auto& output_types = *out_attrs; + // Weights and gradients + for (int i = 0; i < param.num_weights; ++i) { + std::vector input_vec; + std::vector output_vec({output_types[i]}); + for (int j = 0; j < input_stride - num_fp32_inputs; ++j) { + input_vec.push_back(input_types[i * input_stride + j]); + } + all_inferred = all_inferred && + ElemwiseType(attrs, &input_vec, &output_vec); + } + // master copies of weights + for (int i = 0; i < param.num_weights; ++i) { + for (int j = 0; j < num_fp32_inputs; ++j) { + TYPE_ASSIGN_CHECK(input_types, input_stride * i + input_stride - 1 - j, mshadow::kFloat32); + } + } + return all_inferred; +} + +template +struct MultiSGDKernelParam { + static const int N = 60; + int count; + size_t max_size; + size_t sizes[N]; + DType * weights[N]; + DType * grads[N]; + MPDType * mom[N]; + MPDType * weights32[N]; + DType * out_data[N]; + MPDType lrs[N]; + MPDType wds[N]; + MPDType clip_gradient; + MPDType rescale_grad; + MPDType momentum; +}; + +template +struct MultiSGDKernel { + template + MSHADOW_XINLINE static void Map(int i, const MultiSGDKernelParam& param, + const OpReqType req) { + for (int index = 0; index < param.count; ++index) { + if ((size_t)i < param.sizes[index]) { + MPDType w = has_mixed_precision ? param.weights32[index][i] : + MPDType(param.weights[index][i]); + MPDType mom = has_momentum ? 
param.mom[index][i] : MPDType(0); + if (param.clip_gradient >= 0.0f) { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index] + *mshadow_op::clip::Map(param.rescale_grad * + static_cast(param.grads[index][i]), + param.clip_gradient); + } else { + mom = param.momentum*mom + - param.lrs[index]*param.wds[index]*w + - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + } + if (has_momentum) { + param.mom[index][i] = mom; + } + w = w + mom; + if (has_mixed_precision) { + param.weights32[index][i] = w; + } + KERNEL_ASSIGN(param.out_data[index][i], req, w); + } + } + } +}; + +template +MultiSGDKernelParam FillMultiSGDKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const ParamType& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param; + param.clip_gradient = p.clip_gradient; + param.rescale_grad = p.rescale_grad; + param.momentum = 0; + param.count = p.num_weights; + param.max_size = 0; + for (int i = 0; i < param.count; ++i) { + param.sizes[i] = inputs[i * input_stride].shape_.Size(); + if (param.max_size < param.sizes[i]) { + param.max_size = param.sizes[i]; + } + param.weights[i] = inputs[i * input_stride].FlatTo2D(s).dptr_; + param.grads[i] = inputs[i * input_stride + 1].FlatTo2D(s).dptr_; + // if mixed precision, then the last input in a set + // is 32-bit master copy of the weights + if (!std::is_same::value) { + param.weights32[i] = inputs[i * input_stride + input_stride - 1] + .FlatTo2D(s).dptr_; + } + param.out_data[i] = outputs[i].FlatTo2D(s).dptr_; + param.lrs[i] = p.lrs[i]; + param.wds[i] = p.wds[i]; + } + + return param; +} + + +template +MultiSGDKernelParam FillMultiSGDMomKernelParam(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { + using namespace mxnet_op; + const MultiSGDMomParam& p = nnvm::get(attrs.parsed); + Stream* s = ctx.get_stream(); + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + param.momentum = p.momentum; + for (int i = 0; i < param.count; ++i) { + param.mom[i] = inputs[i * input_stride + 2].FlatTo2D(s).dptr_; + } + + return param; +} + +template +class type_identity { + public: + using type = T; +}; + +template +class single_precision { + public: + using type = float; +}; + +template class MPTypeChooser, int input_stride> +inline void MultiSGDUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} + +template class MPTypeChooser, int input_stride> +inline void MultiSGDMomUpdate(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + using namespace mxnet_op; + Stream* s = ctx.get_stream(); + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + using MPDType = typename MPTypeChooser::type; + MultiSGDKernelParam param = + FillMultiSGDMomKernelParam(attrs, ctx, inputs, outputs); + Kernel::value>, + xpu>::Launch(s, param.max_size, param, req[0]); + }); +} struct 
SGDKernel {
   template<typename DType>
diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
index 6c44f99c1..982995ad2 100644
--- a/src/operator/optimizer_op.cc
+++ b/src/operator/optimizer_op.cc
@@ -31,6 +31,8 @@ namespace op {

 DMLC_REGISTER_PARAMETER(SGDParam);
 DMLC_REGISTER_PARAMETER(SGDMomParam);
+DMLC_REGISTER_PARAMETER(MultiSGDParam);
+DMLC_REGISTER_PARAMETER(MultiSGDMomParam);
 DMLC_REGISTER_PARAMETER(FTMLParam);
 DMLC_REGISTER_PARAMETER(AdamParam);
 DMLC_REGISTER_PARAMETER(RMSPropParam);
@@ -52,7 +54,7 @@ It updates the weights using::

   weight = weight - learning_rate * sign(gradient)

-.. note:: 
+.. note::
    - sparse ndarray not supported for this optimizer yet.
 )code" ADD_FILELINE)
 .set_num_inputs(2)
@@ -81,7 +83,7 @@ It updates the weights using::

 Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch.

-.. note:: 
+.. note::
    - sparse ndarray not supported for this optimizer yet.
 )code" ADD_FILELINE)
 .set_num_inputs(3)
@@ -313,10 +315,197 @@ inline bool SGDStorageType(const nnvm::NodeAttrs& attrs,
   return dispatched;
 }

+NNVM_REGISTER_OP(multi_sgd_update)
+.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer.
+
+It updates the weights using::
+
+  weight = weight - learning_rate * (gradient + wd * weight)
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDParam& param = dmlc::get<MultiSGDParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights * 2);
+  })
+.set_num_outputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDParam& param = dmlc::get<MultiSGDParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights);
+  })
+.set_attr_parser(ParamParser<MultiSGDParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", MultiSGDShape<MultiSGDParam, 2>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<-1, -1>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    uint32_t num_args = dmlc::get<MultiSGDParam>(attrs.parsed).num_weights;
+    std::vector<std::string> ret;
+    for (uint32_t i = 0; i < num_args; ++i) {
+      ret.push_back(std::string("weight_") + std::to_string(i));
+      ret.push_back(std::string("grad_") + std::to_string(i));
+    }
+    return ret;
+  })
+.set_attr<FCompute>("FCompute<cpu>", MultiSGDUpdate<cpu, type_identity, 2>)
+.add_argument("data", "NDArray-or-Symbol[]", "Weights")
+.add_arguments(MultiSGDParam::__FIELDS__());
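All four multi_* registrations funnel into the fused MultiSGDKernel defined in optimizer_op-inl.h earlier in this diff: one kernel launch walks every packed weight tensor instead of launching one update per weight, with the packed MultiSGDKernelParam holding at most N = 60 tensors per call. As a condensed view of the per-element arithmetic when gradient clipping is enabled (a sketch of the kernel, not verbatim patch code):

// element i of weight tensor `index`; MPDType is float in the mp variants
MPDType g = mshadow_op::clip::Map(
    param.rescale_grad * static_cast<MPDType>(param.grads[index][i]),
    param.clip_gradient);
MPDType v = param.momentum * mom                      // mom is 0 without momentum
          - param.lrs[index] * param.wds[index] * w   // weight-decay term
          - param.lrs[index] * g;                     // gradient step
if (has_momentum) param.mom[index][i] = v;
w = w + v;
KERNEL_ASSIGN(param.out_data[index][i], req, w);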
+
+NNVM_REGISTER_OP(multi_sgd_mom_update)
+.describe(R"code(Momentum update function for Stochastic Gradient Descent (SGD) optimizer.
+
+Momentum update has better convergence rates on neural networks. Mathematically it looks
+like below:
+
+.. math::
+
+  v_1 = \alpha * \nabla J(W_0)\\
+  v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\
+  W_t = W_{t-1} + v_t
+
+It updates the weights using::
+
+  v = momentum * v - learning_rate * gradient
+  weight += v
+
+Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch.
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights * 3);
+  })
+.set_num_outputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights);
+  })
+.set_attr_parser(ParamParser<MultiSGDMomParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", MultiSGDShape<MultiSGDMomParam, 3>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<-1, -1>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    uint32_t num_args = dmlc::get<MultiSGDMomParam>(attrs.parsed).num_weights;
+    std::vector<std::string> ret;
+    for (uint32_t i = 0; i < num_args; ++i) {
+      ret.push_back(std::string("weight_") + std::to_string(i));
+      ret.push_back(std::string("grad_") + std::to_string(i));
+      ret.push_back(std::string("mom_") + std::to_string(i));
+    }
+    return ret;
+  })
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    std::vector<uint32_t> ret;
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    for (int i = 0; i < param.num_weights; ++i) {
+      ret.push_back(i * 3 + 2);
+    }
+    return ret;
+  })
+.set_attr<FCompute>("FCompute<cpu>", MultiSGDMomUpdate<cpu, type_identity, 3>)
+.add_argument("data", "NDArray-or-Symbol[]", "Weights, gradients and momentum")
+.add_arguments(MultiSGDMomParam::__FIELDS__());
+
+NNVM_REGISTER_OP(multi_mp_sgd_update)
+.describe(R"code(Update function for multi-precision Stochastic Gradient Descent (SGD) optimizer.
+
+It updates the weights using::
+
+  weight = weight - learning_rate * (gradient + wd * weight)
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDParam& param = dmlc::get<MultiSGDParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights * 3);
+  })
+.set_num_outputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDParam& param = dmlc::get<MultiSGDParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights);
+  })
+.set_attr_parser(ParamParser<MultiSGDParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", MultiSGDShape<MultiSGDParam, 3>)
+.set_attr<nnvm::FInferType>("FInferType", MP_MultiSGD_InferType<MultiSGDParam, 3, 1>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    uint32_t num_args = dmlc::get<MultiSGDParam>(attrs.parsed).num_weights;
+    std::vector<std::string> ret;
+    for (uint32_t i = 0; i < num_args; ++i) {
+      ret.push_back(std::string("weight_") + std::to_string(i));
+      ret.push_back(std::string("grad_") + std::to_string(i));
+      ret.push_back(std::string("weight32_") + std::to_string(i));
+    }
+    return ret;
+  })
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    std::vector<uint32_t> ret;
+    const MultiSGDParam& param = dmlc::get<MultiSGDParam>(attrs.parsed);
+    for (int i = 0; i < param.num_weights; ++i) {
+      ret.push_back(i * 3 + 2);
+    }
+    return ret;
+  })
+.set_attr<FCompute>("FCompute<cpu>", MultiSGDUpdate<cpu, single_precision, 3>)
+.add_argument("data", "NDArray-or-Symbol[]", "Weights")
+.add_arguments(MultiSGDParam::__FIELDS__());
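The mp (multi-precision) variants differ from the plain ones only in where the arithmetic happens: the trailing weight32_ inputs are fp32 master copies (MP_MultiSGD_InferType pins them to kFloat32), the update runs in fp32, and only the final value is cast back into the low-precision weight. Roughly, per element (an editorial sketch of the has_mixed_precision path):

float w32 = param.weights32[index][i];               // fp32 master copy
float g   = param.rescale_grad * static_cast<float>(param.grads[index][i]);
w32      += -param.lrs[index] * (g + param.wds[index] * w32);
param.weights32[index][i] = w32;                     // master stays fp32
KERNEL_ASSIGN(param.out_data[index][i], req, w32);   // cast on store (e.g. fp16)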
+
+NNVM_REGISTER_OP(multi_mp_sgd_mom_update)
+.describe(R"code(Momentum update function for multi-precision Stochastic Gradient Descent (SGD) optimizer.
+
+Momentum update has better convergence rates on neural networks. Mathematically it looks
+like below:
+
+.. math::
+
+  v_1 = \alpha * \nabla J(W_0)\\
+  v_t = \gamma v_{t-1} - \alpha * \nabla J(W_{t-1})\\
+  W_t = W_{t-1} + v_t
+
+It updates the weights using::
+
+  v = momentum * v - learning_rate * gradient
+  weight += v
+
+Where the parameter ``momentum`` is the decay rate of momentum estimates at each epoch.
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights * 4);
+  })
+.set_num_outputs([](const nnvm::NodeAttrs& attrs) {
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    return static_cast<uint32_t>(param.num_weights);
+  })
+.set_attr_parser(ParamParser<MultiSGDMomParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", MultiSGDShape<MultiSGDMomParam, 4>)
+.set_attr<nnvm::FInferType>("FInferType", MP_MultiSGD_InferType<MultiSGDMomParam, 4, 2>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    uint32_t num_args = dmlc::get<MultiSGDMomParam>(attrs.parsed).num_weights;
+    std::vector<std::string> ret;
+    for (uint32_t i = 0; i < num_args; ++i) {
+      ret.push_back(std::string("weight_") + std::to_string(i));
+      ret.push_back(std::string("grad_") + std::to_string(i));
+      ret.push_back(std::string("mom_") + std::to_string(i));
+      ret.push_back(std::string("weight32_") + std::to_string(i));
+    }
+    return ret;
+  })
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    std::vector<uint32_t> ret;
+    const MultiSGDMomParam& param = dmlc::get<MultiSGDMomParam>(attrs.parsed);
+    for (int i = 0; i < param.num_weights; ++i) {
+      ret.push_back(i * 4 + 2);
+      ret.push_back(i * 4 + 3);
+    }
+    return ret;
+  })
+.set_attr<FCompute>("FCompute<cpu>", MultiSGDMomUpdate<cpu, single_precision, 4>)
+.add_argument("data", "NDArray-or-Symbol[]", "Weights")
+.add_arguments(MultiSGDMomParam::__FIELDS__());

 NNVM_REGISTER_OP(sgd_update)
 MXNET_ADD_SPARSE_OP_ALIAS(sgd_update)
-.describe(R"code(Update function for Stochastic Gradient Descent (SDG) optimizer.
+.describe(R"code(Update function for Stochastic Gradient Descent (SGD) optimizer.

 It updates the weights using::
diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
index 0fd2ca83f..c42cf1831 100644
--- a/src/operator/optimizer_op.cu
+++ b/src/operator/optimizer_op.cu
@@ -242,6 +242,15 @@ NNVM_REGISTER_OP(mp_sgd_update)
 NNVM_REGISTER_OP(mp_sgd_mom_update)
 .set_attr<FCompute>("FCompute<gpu>", MP_SGDMomUpdate<gpu>);

+NNVM_REGISTER_OP(multi_sgd_update)
+.set_attr<FCompute>("FCompute<gpu>", MultiSGDUpdate<gpu, type_identity, 2>);
+NNVM_REGISTER_OP(multi_sgd_mom_update)
+.set_attr<FCompute>("FCompute<gpu>", MultiSGDMomUpdate<gpu, type_identity, 3>);
+NNVM_REGISTER_OP(multi_mp_sgd_update)
+.set_attr<FCompute>("FCompute<gpu>", MultiSGDUpdate<gpu, single_precision, 3>);
+NNVM_REGISTER_OP(multi_mp_sgd_mom_update)
+.set_attr<FCompute>("FCompute<gpu>", MultiSGDMomUpdate<gpu, single_precision, 4>);
+
 NNVM_REGISTER_OP(ftml_update)
 .set_attr<FCompute>("FCompute<gpu>", FTMLUpdate<gpu>);

diff --git a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
index 89c3c1994..b66adf787 100644
--- a/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_dequantize-inl.h
@@ -74,6 +74,10 @@ static void MKLDNNDequantizeComputeKer(const std::vector<NDArray> &inputs,
     i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
   }
   mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
+  if (i_fmt == mkldnn::memory::format::nhwc) {
+    // For 4d tensor, nchw is the default format
+    i_fmt = mkldnn::memory::format::nchw;
+  }
   auto o_desc = mkldnn::memory::desc(i_dims,
                                      (mkldnn::memory::data_type)data_type_enum<DstType>::type,
                                      i_fmt);
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
new file mode 100644
index 000000000..e201d290e
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize_v2-inl.h
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_quantize_v2-inl.h + * \brief + */ + +#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ +#if MXNET_USE_MKLDNN == 1 +#include +#include +#include +#include "../../nn/mkldnn/mkldnn_base-inl.h" +#include "../quantize_v2-inl.h" + +namespace mxnet { +namespace op { + +template +static void MKLDNNQuantizeComputeKer(const std::vector& inputs, + const std::vector& outputs, + const QuantizeV2Param& param, + const std::vector& req) { + using namespace mshadow; + using namespace mxnet_op; + using red::limits::MaxValue; + using red::limits::MinValue; + SrcType real_range = 0.f; + DstType quantized_range = 0; + NDArray in_buffer = inputs[0]; + SrcType data_min = red::limits::MaxValue(); + SrcType data_max = red::limits::MinValue(); + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + data_min = param.min_calib_range.value(); + data_max = param.max_calib_range.value(); + } else { + // no calib info + in_buffer = inputs[0].Reorder2Default(); + auto in_ptr = in_buffer.data().dptr(); + auto nthreads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); + std::vector data_maxs(nthreads, data_max); + std::vector data_mins(nthreads, data_min); +#pragma omp parallel for num_threads(nthreads) + for (index_t i = 0; i < static_cast(in_buffer.shape().Size()); i++) { + int tid = omp_get_thread_num(); + if (in_ptr[i] > data_maxs[tid]) data_maxs[tid] = in_ptr[i]; + if (in_ptr[i] < data_mins[tid]) data_mins[tid] = in_ptr[i]; + } + for (index_t i = 0; i < nthreads; i++) { + if (data_maxs[i] > data_max) data_max = data_maxs[i]; + if (data_mins[i] < data_min) data_min = data_mins[i]; + } + } + + auto out_type = GetOutputType(param); + if (out_type == mshadow::kUint8) { + real_range = std::max(0.f, data_max); + quantized_range = MaxValue(); + *outputs[1].data().dptr() = 0.f; + *outputs[2].data().dptr() = real_range; + } else if (out_type == mshadow::kInt8) { + real_range = MaxAbs(data_min, data_max); + quantized_range = MinAbs(MaxValue(), MinValue()); + *outputs[1].data().dptr() = -real_range; + *outputs[2].data().dptr() = real_range; + } else { + LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; + } + float scale = static_cast(quantized_range) / real_range; + + primitive_attr attr; + const int mask = 0; + std::vector scales = {scale}; + attr.set_output_scales(mask, scales); + attr.set_int_output_round_mode(round_nearest); + mkldnn::engine cpu_engine = mxnet::CpuEngine::Get()->get_engine(); + + if (in_buffer.IsView() && in_buffer.IsMKLDNNData()) in_buffer = inputs[0].Reorder2Default(); + auto i_mem = in_buffer.GetMKLDNNData(); + auto i_mpd = i_mem->get_primitive_desc(); + auto i_desc = i_mpd.desc(); + mkldnn::memory::format i_fmt = static_cast(i_desc.data.format); + if (i_fmt == mkldnn::memory::format::nchw || + i_fmt == mkldnn::memory::format::nChw8c || + i_fmt == 
mkldnn_nChw16c) { + i_fmt = mkldnn::memory::format::nhwc; + } + size_t i_ndim = in_buffer.shape().ndim(); + mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim); + for (size_t i = 0; i < i_ndim; i++) { + i_dims[i] = static_cast(in_buffer.shape()[i]); + } + auto o_desc = + mkldnn::memory::desc(i_dims, (mkldnn::memory::data_type)data_type_enum::type, i_fmt); + auto o_mpd = memory::primitive_desc(o_desc, cpu_engine); + auto reorder_pd = reorder::primitive_desc(i_mpd, o_mpd, attr); + auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second)); + CommitOutput(outputs[0], o_mem); + MKLDNNStream::Get()->Submit(); +} + +static void MKLDNNQuantizeV2Compute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const QuantizeV2Param& param = nnvm::get(attrs.parsed); + auto out_type = GetOutputType(param); + if (out_type == mshadow::kUint8) { + MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + } else if (out_type == mshadow::kInt8) { + MKLDNNQuantizeComputeKer(inputs, outputs, param, req); + } else { + LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type"; + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_V2_INL_H_ diff --git a/src/operator/quantization/quantization_utils.h b/src/operator/quantization/quantization_utils.h index ee7112205..efc841009 100644 --- a/src/operator/quantization/quantization_utils.h +++ b/src/operator/quantization/quantization_utils.h @@ -27,6 +27,7 @@ #include #include #include "../mxnet_op.h" +#include "../tensor/broadcast_reduce_op.h" namespace mxnet { namespace op { @@ -171,6 +172,20 @@ struct QuantizationRangeForMultiplicationStruct { } }; +template +inline size_t ConfigReduce(mshadow::Stream* s, + const TShape& data_shape, + const TShape& out_shape, + TShape* src_shape, + TShape* dst_shape) { + BroadcastReduceShapeCompact(data_shape, out_shape, src_shape, dst_shape); + constexpr int NDim = 2; + CHECK_EQ(src_shape->ndim(), NDim); + CHECK_EQ(dst_shape->ndim(), NDim); + + return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape); +} + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZATION_UTILS_H_ diff --git a/src/operator/quantization/quantize.cc b/src/operator/quantization/quantize.cc index 5227751bc..e486f058b 100644 --- a/src/operator/quantization/quantize.cc +++ b/src/operator/quantization/quantize.cc @@ -71,7 +71,7 @@ where `scale = quantized_range / MaxAbs(min_range, max_range).` .. Note:: - This operator only supports forward propogation. DO NOT use it in training.)code" ADD_FILELINE) + This operator only supports forward propagation. 
DO NOT use it in training.)code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_num_inputs(3) .set_num_outputs(3) diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc index fcd0fb421..af533978a 100644 --- a/src/operator/quantization/quantize_graph_pass.cc +++ b/src/operator/quantization/quantize_graph_pass.cc @@ -26,6 +26,7 @@ #include #include #include +#include "quantize_v2-inl.h" namespace mxnet { namespace op { @@ -63,12 +64,12 @@ NodePtr InsertNode(std::string op_name, } std::vector OfflineParams(std::vector&& outputs, - std::unordered_set&& offline_params) { + const std::unordered_set& offline_params) { std::string node_suffixs[3] = {"", "_min", "_max"}; std::unordered_map mirror_map; nnvm::NodeEntryMap entry_var; auto need_offline = [&](NodePtr n) { - return (n->op() == Op::Get("_contrib_quantize")) && + return (n->op() == Op::Get("_contrib_quantize_v2")) && n->inputs[0].node->is_variable() && offline_params.count(n->inputs[0].node->attrs.name); }; @@ -88,7 +89,8 @@ std::vector OfflineParams(std::vector&& outputs, return outputs; } -inline bool NeedQuantize(NodePtr node, const std::unordered_set& excluded_nodes) { +inline bool NeedQuantize(const NodePtr node, + const std::unordered_set& excluded_nodes) { static auto& quantized_op_map = Op::GetAttr("FQuantizedOp"); static auto& fexec_type = nnvm::Op::GetAttr("FExecType"); const auto& op = node->op(); @@ -121,10 +123,9 @@ Graph QuantizeGraph(Graph &&src) { static const auto& need_requantize_map = Op::GetAttr("FNeedRequantize"); static const auto& avoid_quantize_input_map = Op::GetAttr("FAvoidQuantizeInput"); - auto offline_params = src.GetAttr>("offline_params"); - auto excluded_nodes = src.GetAttr>("excluded_nodes"); - auto quantized_dtype = src.GetAttr("quantized_dtype"); - auto calib_quantize = src.GetAttr("calib_quantize"); + const auto offline_params = src.GetAttr>("offline_params"); + const auto excluded_nodes = src.GetAttr>("excluded_nodes"); + const auto quantized_dtype = src.GetAttr("quantized_dtype"); // mirror_map stores the mapping from the currently visited graph to the newly created quantized // graph. Key is the currently visited graph's node pointer, and value is a copied node of the key @@ -174,24 +175,10 @@ Graph QuantizeGraph(Graph &&src) { } } - NodePtr quantize_node = InsertNode("_contrib_quantize", + NodePtr quantize_node = InsertNode("_contrib_quantize_v2", e.node->attrs.name + suffix + "_quantize", new_node, mirror_entry); quantize_node->attrs.dict["out_type"] = quantized_dtype; quantize_node->op()->attr_parser(&(quantize_node->attrs)); - if (calib_quantize) { - NodePtr min_var = CreateNode("nullptr", e.node->attrs.name + suffix + "_min"); - quantize_node->inputs.emplace_back(NodeEntry{min_var, 0, 0}); - NodePtr max_var = CreateNode("nullptr", e.node->attrs.name + suffix + "_max"); - quantize_node->inputs.emplace_back(NodeEntry{max_var, 0, 0}); - } else { - NodePtr min_node = InsertNode("min", - e.node->attrs.name + suffix + "_min", quantize_node, mirror_entry); - min_node->op()->attr_parser(&(min_node->attrs)); - - NodePtr max_node = InsertNode("max", - e.node->attrs.name + suffix + "_max", quantize_node, mirror_entry); - max_node->op()->attr_parser(&(max_node->attrs)); - } mirror_entry_map[e] = NodeEntry{quantize_node, 0, e.version}; } } else if (mirror_node->op() == Op::Get("_contrib_dequantize")) { @@ -269,43 +256,35 @@ Graph QuantizeGraph(Graph &&src) { // the new_node. 
*new_node = *node; new_node->inputs.clear(); - if (node->is_variable() && node->attrs.name == "data") { - // Insert identity for data to collect calib for it. - NodePtr identity_node = - CreateNode("identity", new_node->attrs.name + "_id"); - identity_node->inputs.emplace_back(NodeEntry{new_node, 0, 0}); - new_node = identity_node; - } else { - for (const auto& e : node->inputs) { - NodePtr mirror_node = mirror_map.at(e.node.get()); - NodeEntry mirror_entry = NodeEntry{ - mirror_node, e.index, e.version}; - // if input node is quantized operator, add dequantize node - if (NeedQuantize(e.node, excluded_nodes) && - (mirror_node->op() != Op::Get("_contrib_dequantize"))) { - // here we calculate the output number (exclude min/max, in order to - // calculate min/max index from mirror node) based on assumption that - // there is only 1min and 1max output from mirror node (which is - // currently true) - size_t num_outputs = mirror_node->num_outputs() - 2; - uint32_t min_index = num_outputs + 2 * e.index; - uint32_t max_index = num_outputs + 2 * e.index + 1; - NodePtr dequantize_node = CreateNode("_contrib_dequantize", - e.node->attrs.name + "_dequantize"); - dequantize_node->inputs.emplace_back(mirror_entry); - dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, min_index, 0}); - dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, max_index, 0}); - dequantize_node->op()->attr_parser(&(dequantize_node->attrs)); + for (const auto& e : node->inputs) { + NodePtr mirror_node = mirror_map.at(e.node.get()); + NodeEntry mirror_entry = NodeEntry{ + mirror_node, e.index, e.version}; + // if input node is quantized operator, add dequantize node + if (NeedQuantize(e.node, excluded_nodes) && + (mirror_node->op() != Op::Get("_contrib_dequantize"))) { + // here we calculate the output number (exclude min/max, in order to + // calculate min/max index from mirror node) based on assumption that + // there is only 1min and 1max output from mirror node (which is + // currently true) + size_t num_outputs = mirror_node->num_outputs() - 2; + uint32_t min_index = num_outputs + 2 * e.index; + uint32_t max_index = num_outputs + 2 * e.index + 1; + NodePtr dequantize_node = CreateNode("_contrib_dequantize", + e.node->attrs.name + "_dequantize"); + dequantize_node->inputs.emplace_back(mirror_entry); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, min_index, 0}); + dequantize_node->inputs.emplace_back(NodeEntry{mirror_node, max_index, 0}); + dequantize_node->op()->attr_parser(&(dequantize_node->attrs)); - new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0}); - mirror_map[e.node.get()] = std::move(dequantize_node); - } else if (mirror_entry_map.count(e)) { - new_node->inputs.emplace_back( - NodeEntry{mirror_entry_map[e].node->inputs[0].node, e.index, e.version}); - } else { - new_node->inputs.emplace_back( - NodeEntry{mirror_node, e.index, e.version}); - } + new_node->inputs.emplace_back(NodeEntry{dequantize_node, 0, 0}); + mirror_map[e.node.get()] = std::move(dequantize_node); + } else if (mirror_entry_map.count(e)) { + new_node->inputs.emplace_back( + NodeEntry{mirror_entry_map[e].node->inputs[0].node, e.index, e.version}); + } else { + new_node->inputs.emplace_back( + NodeEntry{mirror_node, e.index, e.version}); } } } @@ -334,8 +313,7 @@ Graph QuantizeGraph(Graph &&src) { } } - if (!offline_params.empty()) outputs = - OfflineParams(std::move(outputs), std::move(offline_params)); + if (!offline_params.empty()) outputs = OfflineParams(std::move(outputs), offline_params); Graph ret; 
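  // Invariant at this point in the pass (summary, not patch code): every node
  // selected by NeedQuantize has been mirrored behind a _contrib_quantize_v2
  // producer, float consumers are bridged through _contrib_dequantize, and
  // offline parameters have been folded by OfflineParams above.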
ret.outputs = std::move(outputs); @@ -361,7 +339,11 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) { && need_requantize_map[quantized_op_node->op()](quantized_op_node->attrs)) << quantized_op_node->attrs.name << " op must register FNeedRequantize attr" " and the attr func should return true"; - std::string out_data_name = quantized_op_node->attrs.name + "_"; + const std::string prefix = "quantized_"; + CHECK(std::equal(prefix.begin(), prefix.end(), quantized_op_node->attrs.name.begin())) + << "an quantized op should start with `quantized_`"; + + std::string out_data_name = quantized_op_node->attrs.name.substr(prefix.size()) + "_"; auto list_output_names_func = flist_outputs.get(quantized_op_node->op(), nullptr); // Here it's assumed that the quantized_op node only produces three outputs: // out_data, min_range, and max_range. So we want to get the pre-calculated min_calib_range @@ -381,6 +363,34 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) { node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); node->op()->attr_parser(&(node->attrs)); } + } else if (node->op() == Op::Get("_contrib_quantize_v2")) { + NodePtr float_op_node = node->inputs[0].node; + auto float_op_idx = node->inputs[0].index; + std::string out_data_name = float_op_node->attrs.name; + if (float_op_node->op()) { + auto list_output_names_func = flist_outputs.get(float_op_node->op(), nullptr); + // We want to get the pre-calculated min_range and max_range from the calibration table for + // out_data. Here we create the output data name same as its constructed in + // GraphExecutor::ExecuteMonCallback. + if (list_output_names_func != nullptr) { + std::vector names = list_output_names_func(float_op_node->attrs); + out_data_name += "_" + names[float_op_idx]; + } else { + out_data_name += "_" + std::to_string(float_op_idx); + } + } + const auto calib_table_iter = calib_table.find(out_data_name); + if (calib_table_iter != calib_table.end()) { + node->attrs.dict["min_calib_range"] = std::to_string(calib_table_iter->second.first); + node->attrs.dict["max_calib_range"] = std::to_string(calib_table_iter->second.second); + node->op()->attr_parser(&(node->attrs)); + const QuantizeV2Param& param = nnvm::get(node->attrs.parsed); + if (param.out_type == QuantizeV2Param::OutType::kUint8 && + param.min_calib_range.value() < 0.0f) { + LOG(WARNING) << "Calibration statistics indicates that node `" << node->attrs.name + << "` has negative input, consider use `auto` or `int8` as out_type"; + } + } } }); return g; diff --git a/src/operator/quantization/quantize_v2-inl.h b/src/operator/quantization/quantize_v2-inl.h new file mode 100644 index 000000000..5ae10a7e4 --- /dev/null +++ b/src/operator/quantization/quantize_v2-inl.h @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
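The calibration lookup added in the `_contrib_quantize_v2` branch above reconstructs the same key that GraphExecutor::ExecuteMonCallback records for an op output. A standalone sketch of that naming rule (`CalibKey` is a hypothetical helper, not part of the pass):

#include <string>
#include <vector>

// Plain variables keep their own name; op outputs get "_<output name>"
// appended when the op lists output names, else "_<output index>".
static std::string CalibKey(const std::string& node_name, bool is_variable,
                            const std::vector<std::string>& output_names,
                            size_t out_index) {
  if (is_variable) return node_name;
  if (out_index < output_names.size()) return node_name + "_" + output_names[out_index];
  return node_name + "_" + std::to_string(out_index);
}
// e.g. CalibKey("conv0", false, {"output"}, 0) == "conv0_output"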
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantize_v2-inl.h + * \brief implementation of quantize operation + */ +#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_ +#define MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_ + +#include +#include +#include +#include "../elemwise_op_common.h" +#include "../mshadow_op.h" +#include "../mxnet_op.h" +#include "./quantization_utils.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +struct QuantizeV2Param : public dmlc::Parameter { + enum OutType { kAuto = 0, kInt8, kUint8 }; + int out_type; + dmlc::optional min_calib_range; + dmlc::optional max_calib_range; + DMLC_DECLARE_PARAMETER(QuantizeV2Param) { + DMLC_DECLARE_FIELD(out_type) + .add_enum("auto", kAuto) + .add_enum("int8", kInt8) + .add_enum("uint8", kUint8) + .set_default(kInt8) + .describe("Output data type. `auto` can be specified to automatically determine output type " + "according to min_calib_range."); + DMLC_DECLARE_FIELD(min_calib_range) + .set_default(dmlc::optional()) + .describe("The minimum scalar value in the form of float32. If present, it will be used to " + "quantize the fp32 data into int8 or uint8."); + DMLC_DECLARE_FIELD(max_calib_range) + .set_default(dmlc::optional()) + .describe("The maximum scalar value in the form of float32. If present, it will be used to " + "quantize the fp32 data into int8 or uint8."); + } +}; + +static mshadow::TypeFlag GetOutputType(const QuantizeV2Param ¶m) { + auto out_type = mshadow::kInt8; + if (param.out_type == QuantizeV2Param::OutType::kAuto) { + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + if (param.min_calib_range.value() >= 0.0) { + out_type = mshadow::kUint8; + } else { + out_type = mshadow::kInt8; + } + } + } else if (param.out_type == QuantizeV2Param::OutType::kInt8) { + out_type = mshadow::kInt8; + } else if (param.out_type == QuantizeV2Param::OutType::kUint8) { + out_type = mshadow::kUint8; + } else { + LOG(FATAL) << "Unsupported out_type in params: " < + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, float *omax_range, + const SrcDType *in, const float imin_range, + const float imax_range, const double min_limit, + const double max_limit) { + const float scale = (max_limit - min_limit) / (imax_range - imin_range); + out[i] = static_cast((in[i] - imin_range) * scale + 0.5); + *omin_range = imin_range; + *omax_range = imax_range; + } + + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, float *omax_range, + const SrcDType *in, const float *imin_range, + const float *imax_range, const double min_limit, + const double max_limit) { + Map(i, out, omin_range, omax_range, in, *imin_range, *imax_range, min_limit, max_limit); + } +}; + +// keep zero-center +struct quantize_v2_zero_centered { + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, float *omax_range, + const SrcDType *in, const float imin_range, + const float imax_range, const float quantized_range) { + float real_range = MaxAbs(imin_range, imax_range); + float scale = quantized_range / real_range; + SrcDType x = in[i]; + out[i] = static_cast(Sign(x) * Min(Abs(x) * scale + 0.5f, quantized_range)); + *omin_range = -real_range; + *omax_range = real_range; + } + + template + MSHADOW_XINLINE static void Map(int i, DstDType *out, float *omin_range, float *omax_range, + const SrcDType *in, const 
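For reference, the affine mapping in `quantize_v2_unsigned` above works out as follows; this is a standalone re-derivation with made-up calibration values, not the mshadow kernel itself:

#include <cstdint>
#include <cstdio>

int main() {
  // Calibrated input range [0, 10], uint8 limits [0, 255]: scale = 255 / 10.
  const float min_range = 0.0f, max_range = 10.0f;
  const double min_limit = 0.0, max_limit = 255.0;
  const float scale = (max_limit - min_limit) / (max_range - min_range);  // 25.5
  const float in = 4.0f;
  const uint8_t out = static_cast<uint8_t>((in - min_range) * scale + 0.5f);
  std::printf("%u\n", static_cast<unsigned>(out));  // 4.0 * 25.5 + 0.5 -> 102
  return 0;
}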
float *imin_range, + const float *imax_range, const float quantized_range) { + Map(i, out, omin_range, omax_range, in, *imin_range, *imax_range, quantized_range); + } +}; + +template +void QuantizeV2Compute(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, const std::vector &req, + const std::vector &outputs) { + using namespace mshadow; + using namespace mxnet_op; + typedef float SrcDType; + using mshadow::red::limits::MaxValue; + using mshadow::red::limits::MinValue; + Stream *s = ctx.get_stream(); + const QuantizeV2Param ¶m = nnvm::get(attrs.parsed); + auto out_type = GetOutputType(param); + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), + param.max_calib_range.value(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), param.min_calib_range.value(), + param.max_calib_range.value(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } else { // model is not calibrated + TShape src_shape, dst_shape; + const size_t actual_float_size = sizeof(float); + const size_t temp_reduce_size = + ConfigReduce(s, inputs[0].shape_, TShape({1}), &src_shape, &dst_shape); + Tensor temp_space = ctx.requested[0].get_space_typed( + Shape1(2 * actual_float_size + temp_reduce_size), s); + const int dev_id = ctx.run_ctx.ctx.dev_id; + TBlob in_min_t(reinterpret_cast(temp_space.dptr_), Shape1(1), xpu::kDevMask, + dev_id); + TBlob in_max_t(reinterpret_cast(temp_space.dptr_) + 1, Shape1(1), xpu::kDevMask, + dev_id); + Tensor workspace(temp_space.dptr_ + 2 * actual_float_size, + Shape1(temp_reduce_size), s); + broadcast::Reduce( + s, in_min_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); + broadcast::Reduce( + s, in_max_t.reshape(dst_shape), kWriteTo, workspace, inputs[0].reshape(src_shape)); + if (out_type == mshadow::kUint8) { + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinValue(), MaxValue()); + } else if (out_type == mshadow::kInt8) { // zero-centered quantization + Kernel::Launch( + s, outputs[0].Size(), outputs[0].dptr(), outputs[1].dptr(), + outputs[2].dptr(), inputs[0].dptr(), in_min_t.dptr(), + in_max_t.dptr(), MinAbs(MaxValue(), MinValue())); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + } +} + +static inline bool QuantizeV2Shape(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 3U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape{1}); + SHAPE_ASSIGN_CHECK(*out_attrs, 2, TShape{1}); + return !shape_is_none(out_attrs->at(0)); +} + +static inline bool QuantizeV2Type(const nnvm::NodeAttrs &attrs, std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 3U); + const QuantizeV2Param ¶m = nnvm::get(attrs.parsed); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + auto out_type = GetOutputType(param); + if (out_type == mshadow::kUint8) { + 
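And the zero-centered int8 path of `QuantizeV2Compute` above, again as a standalone sketch with assumed calibration values; `quantized_range` is 127, i.e. `MinAbs(MaxValue<int8>, MinValue<int8>)`, so that 0 always maps to 0:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float min_range = -2.0f, max_range = 1.0f;
  const float real_range = std::max(std::abs(min_range), std::abs(max_range));  // 2.0
  const float quantized_range = 127.0f;
  const float scale = quantized_range / real_range;  // 63.5
  const float x = -1.0f;
  const float sign = (x < 0.0f) ? -1.0f : 1.0f;
  const int8_t out = static_cast<int8_t>(
      sign * std::min(std::abs(x) * scale + 0.5f, quantized_range));
  std::printf("%d\n", out);  // -(63.5 + 0.5) -> -64
  return 0;
}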
TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8); + } else if (out_type == mshadow::kInt8) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + } else { + LOG(FATAL) << "quantize op only supports int8 and uint8 as output type"; + } + TYPE_ASSIGN_CHECK(*out_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 2, mshadow::kFloat32); + return (*in_attrs)[0] != -1; +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_V2_INL_H_ diff --git a/src/operator/quantization/quantize_v2.cc b/src/operator/quantization/quantize_v2.cc new file mode 100644 index 000000000..21410933d --- /dev/null +++ b/src/operator/quantization/quantize_v2.cc @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2017 by Contributors + * \file quantize_v2.cc + * \brief + */ + +#include "./quantize_v2-inl.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_quantize_v2-inl.h" +#endif + +namespace mxnet { +namespace op { +DMLC_REGISTER_PARAMETER(QuantizeV2Param); + +static bool QuantizeV2StorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, + DispatchMode* dispatch_mode, std::vector<int>* in_attrs, + std::vector<int>* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + (*out_attrs)[0] = kDefaultStorage; + (*out_attrs)[1] = kDefaultStorage; + (*out_attrs)[2] = kDefaultStorage; + return true; +} + +NNVM_REGISTER_OP(_contrib_quantize_v2) .describe(R"code(Quantize an input tensor from float to `out_type`, +with user-specified `min_calib_range` and `max_calib_range` or the input range collected at runtime. + +Output `min_range` and `max_range` are scalar floats that specify the range for the input data. + +When out_type is `uint8`, the output is calculated using the following equation: + +`out[i] = (in[i] - min_range) * range(OUTPUT_TYPE) / (max_range - min_range) + 0.5`, + +where `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`. + +When out_type is `int8`, the output is calculated using the following equation, +keeping the quantized value zero-centered: + +`out[i] = sign(in[i]) * min(abs(in[i]) * scale + 0.5f, quantized_range)`, + +where +`quantized_range = MinAbs(max(int8), min(int8))` and +`scale = quantized_range / MaxAbs(min_range, max_range)`. + +When out_type is `auto`, the output type is automatically determined by min_calib_range if present. +If min_calib_range < 0.0f, the output type will be int8; otherwise it will be uint8. +If min_calib_range isn't present, the output type will be int8. + +.. Note:: + This operator only supports forward propagation.
DO NOT use it in training.)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs(1) +.set_num_outputs(3) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FInferShape", QuantizeV2Shape) +.set_attr("FInferType", QuantizeV2Type) +.set_attr("FInferStorageType", QuantizeV2StorageType) +#if MXNET_USE_MKLDNN == 1 +.set_attr("TIsMKLDNN", true) +.set_attr("FComputeEx", MKLDNNQuantizeV2Compute) +#endif +.set_attr("FCompute", QuantizeV2Compute) +.set_attr("FResourceRequest", [](const NodeAttrs& attrs) { + const QuantizeV2Param ¶m = nnvm::get(attrs.parsed); + if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) { + return std::vector(); + } else { + return std::vector(1, ResourceRequest::kTempSpace); + } +}) +.add_argument("data", "NDArray-or-Symbol", "A ndarray/symbol of type `float32`") +.add_arguments(QuantizeV2Param::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/quantize_v2.cu b/src/operator/quantization/quantize_v2.cu new file mode 100644 index 000000000..ab0cf9c5a --- /dev/null +++ b/src/operator/quantization/quantize_v2.cu @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
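The `FResourceRequest` hook registered above only asks for temp space on the uncalibrated path, where the reduce kernels need workspace to find min/max at runtime; calibrated runs request nothing. A toy mirror of that decision (`Res` is a stand-in for mxnet's `ResourceRequest`):

#include <vector>

enum class Res { kTempSpace };

static std::vector<Res> QuantizeV2Resources(bool has_min_calib, bool has_max_calib) {
  if (has_min_calib && has_max_calib) return {};   // ranges known: no workspace
  return {Res::kTempSpace};                        // must reduce min/max at runtime
}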
+ * Copyright (c) 2018 by Contributors + * \file quantize_v2.cu + * \brief + */ +#include "./quantize_v2-inl.h" + +namespace mxnet { +namespace op { + +NNVM_REGISTER_OP(_contrib_quantize_v2) +.set_attr("FCompute", QuantizeV2Compute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/quantization/requantize-inl.h b/src/operator/quantization/requantize-inl.h index e07a149f8..148453e63 100644 --- a/src/operator/quantization/requantize-inl.h +++ b/src/operator/quantization/requantize-inl.h @@ -87,20 +87,6 @@ struct RequantizeKernel { } }; -template -inline size_t ConfigReduce(mshadow::Stream* s, - const TShape& data_shape, - const TShape& out_shape, - TShape* src_shape, - TShape* dst_shape) { - BroadcastReduceShapeCompact(data_shape, out_shape, src_shape, dst_shape); - constexpr int NDim = 2; - CHECK_EQ(src_shape->ndim(), NDim); - CHECK_EQ(dst_shape->ndim(), NDim); - - return broadcast::ReduceWorkspaceSize(s, *dst_shape, kWriteTo, *src_shape); -} - template void RequantizeForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h index 1a59473cf..61506c2af 100644 --- a/src/operator/sequence_last-inl.h +++ b/src/operator/sequence_last-inl.h @@ -65,9 +65,9 @@ struct SequenceLastParam : public dmlc::Parameter { template struct SequenceLastKernel { - template + template MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in, - const DType *idx, int offset1, int offset2, + const IType *idx, int offset1, int offset2, mshadow::Shape<2> oshape) { const auto opos = mxnet_op::unravel(i, oshape); const int seqpos = static_cast(idx[opos[0]]) - 1; @@ -77,9 +77,9 @@ struct SequenceLastKernel { }; struct SequenceLastGradKernel { - template + template MSHADOW_XINLINE static void Map(int i, DType *in_grad, const DType *out_grad, - const DType *idx, int offset1, int offset2, + const IType *idx, int offset1, int offset2, mshadow::Shape<2> oshape) { const auto opos = mxnet_op::unravel(i, oshape); const int seqpos = static_cast(idx[opos[0]]) - 1; @@ -88,14 +88,14 @@ struct SequenceLastGradKernel { } }; -template +template class SequenceLastOp : public Operator { public: explicit SequenceLastOp(SequenceLastParam p) { this->param_ = p; } void sequence_last(const mshadow::Tensor &data, const mshadow::Tensor &out, - const mshadow::Tensor &indices, + const mshadow::Tensor &indices, const OpReqType req, mshadow::Stream *const s) { using namespace mshadow; using namespace mshadow::expr; @@ -115,7 +115,7 @@ class SequenceLastOp : public Operator { void sequence_last_grad(const mshadow::Tensor &in_grad, const mshadow::Tensor &out_grad, - const mshadow::Tensor &indices, + const mshadow::Tensor &indices, mshadow::Stream *const s) { using namespace mshadow; using namespace mshadow::expr; @@ -163,11 +163,11 @@ class SequenceLastOp : public Operator { Tensor out = out_data[seq_last::kOut].get_with_shape( Shape2(batch, rest_size), s); - Tensor indices = + Tensor indices = param_.use_sequence_length - ? in_data[seq_last::kSequenceLength].get(s) + ? 
in_data[seq_last::kSequenceLength].get(s) : ctx.requested[seq_last::kTempSpace] - .get_space_typed(Shape1(batch), s); + .get_space_typed(Shape1(batch), s); if (!param_.use_sequence_length) indices = max_seq_len; sequence_last(data, out, indices, req[seq_last::kOut], s); @@ -206,11 +206,11 @@ class SequenceLastOp : public Operator { Tensor output_grad = out_grad[seq_last::kOut].get_with_shape( Shape2(batch, rest_size), s); - Tensor indices = + Tensor indices = param_.use_sequence_length - ? in_data[seq_last::kSequenceLength].get(s) + ? in_data[seq_last::kSequenceLength].get(s) : ctx.requested[seq_last::kTempSpace] - .get_space_typed(Shape1(batch), s); + .get_space_typed(Shape1(batch), s); if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f; sequence_last_grad(data_grad, output_grad, indices, s); @@ -221,7 +221,7 @@ class SequenceLastOp : public Operator { }; // class SequenceLastOp template -Operator *CreateOp(SequenceLastParam param, int dtype); +Operator *CreateOp(SequenceLastParam param, int dtype, int itype); #if DMLC_USE_CXX11 class SequenceLastProp : public OperatorProperty { @@ -281,8 +281,6 @@ class SequenceLastProp : public OperatorProperty { for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); } } out_type->clear(); diff --git a/src/operator/sequence_last.cc b/src/operator/sequence_last.cc index 345524b38..f2388a8ef 100644 --- a/src/operator/sequence_last.cc +++ b/src/operator/sequence_last.cc @@ -28,10 +28,13 @@ namespace mxnet { namespace op { template <> -Operator *CreateOp(SequenceLastParam param, int dtype) { +Operator *CreateOp(SequenceLastParam param, int dtype, int itype) { Operator *op = nullptr; - MSHADOW_TYPE_SWITCH(dtype, DType, - { op = new SequenceLastOp(param); }) + MSHADOW_TYPE_SWITCH(dtype, DType, { + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceLastOp(param); + }); + }); return op; } @@ -39,7 +42,12 @@ Operator *CreateOp(SequenceLastParam param, int dtype) { Operator *SequenceLastProp::CreateOperatorEx(Context ctx, std::vector *in_shape, std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + if (in_type->size() >= 2 && (*in_type)[1] != -1) { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]); + } + + // sequence_length not passed in, so fall back to using input array dtype for second argument + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]); } DMLC_REGISTER_PARAMETER(SequenceLastParam); diff --git a/src/operator/sequence_last.cu b/src/operator/sequence_last.cu index dfc4e59d1..fb5ae8471 100644 --- a/src/operator/sequence_last.cu +++ b/src/operator/sequence_last.cu @@ -28,10 +28,13 @@ namespace mxnet { namespace op { -template <> Operator *CreateOp(SequenceLastParam param, int dtype) { +template <> Operator *CreateOp(SequenceLastParam param, int dtype, int itype) { Operator *op = NULL; - MSHADOW_TYPE_SWITCH(dtype, DType, - { op = new SequenceLastOp(param); }) + MSHADOW_TYPE_SWITCH(dtype, DType, { + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceLastOp(param); + }); + }); return op; } diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h index c93ffb5f1..c2584abd4 100644 --- a/src/operator/sequence_mask-inl.h +++ b/src/operator/sequence_mask-inl.h @@ -68,8 +68,8 @@ struct SequenceMaskParam : public dmlc::Parameter { // (seqlen, batch, rest) case template struct SequenceMask0Kernel { - template - MSHADOW_XINLINE static void Map(int b, 
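The `CreateOperatorEx` fallback above picks the index type from the second input slot when it is known, else reuses the data type so the temp-space indices tensor matches the data. A toy, macro-free version of that selection (type flags follow mshadow's numbering, e.g. 0 = float32, 4 = int32):

#include <cstdio>
#include <vector>

static void Dispatch(const std::vector<int>& in_type) {
  const int dtype = in_type[0];
  const int itype = (in_type.size() >= 2 && in_type[1] != -1) ? in_type[1] : dtype;
  std::printf("DType flag %d, IType flag %d\n", dtype, itype);
}

int main() {
  Dispatch({0});     // data only               -> DType 0, IType 0
  Dispatch({0, 4});  // float data, int32 lens  -> DType 0, IType 4
  return 0;
}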
DType *in, const DType *idx, + template + MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx, index_t max_s_len, index_t batch_size, index_t restsize, DType value) { const index_t seqpos = static_cast(idx[b]); @@ -86,8 +86,8 @@ struct SequenceMask0Kernel { // (batch, seqlen, rest) case template struct SequenceMask1Kernel { - template - MSHADOW_XINLINE static void Map(int b, DType *in, const DType *idx, + template + MSHADOW_XINLINE static void Map(int b, DType *in, const IType *idx, index_t max_s_len, index_t batch_size, index_t restsize, DType value) { const index_t seqpos = static_cast(idx[b]); @@ -101,13 +101,13 @@ struct SequenceMask1Kernel { } }; -template +template class SequenceMaskOp : public Operator { public: explicit SequenceMaskOp(SequenceMaskParam p) { this->param_ = p; } void sequence_mask(const mshadow::Tensor &data, - const mshadow::Tensor &indices, + const mshadow::Tensor &indices, const OpReqType req, mshadow::Stream *const s, DType val) { using namespace mshadow; @@ -153,8 +153,8 @@ class SequenceMaskOp : public Operator { // Actual implementation of masking Assign(out, req[seq_mask::kOut], F(data)); if (param_.use_sequence_length) { - Tensor indices = - in_data[seq_mask::kSequenceLength].get(s); + Tensor indices = + in_data[seq_mask::kSequenceLength].get(s); sequence_mask(out, indices, req[seq_mask::kOut], s, static_cast(param_.value)); } @@ -190,8 +190,8 @@ class SequenceMaskOp : public Operator { if (!param_.use_sequence_length) { Assign(data_g, req[seq_mask::kData], F(out_g)); } else { - Tensor indices = - in_data[seq_mask::kSequenceLength].get(s); + Tensor indices = + in_data[seq_mask::kSequenceLength].get(s); if (req[seq_mask::kData] == kAddTo) { Tensor out_g_temp = ctx.requested[seq_mask::kTempSpace].get_space_typed( @@ -212,7 +212,7 @@ class SequenceMaskOp : public Operator { }; // class SequenceMaskOp template -Operator *CreateOp(SequenceMaskParam param, int dtype); +Operator *CreateOp(SequenceMaskParam param, int dtype, int itype); #if DMLC_USE_CXX11 class SequenceMaskProp : public OperatorProperty { @@ -270,8 +270,6 @@ class SequenceMaskProp : public OperatorProperty { for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); } } out_type->clear(); diff --git a/src/operator/sequence_mask.cc b/src/operator/sequence_mask.cc index e02c57bfd..76e58386b 100644 --- a/src/operator/sequence_mask.cc +++ b/src/operator/sequence_mask.cc @@ -28,10 +28,13 @@ namespace mxnet { namespace op { template <> -Operator *CreateOp(SequenceMaskParam param, int dtype) { +Operator *CreateOp(SequenceMaskParam param, int dtype, int itype) { Operator *op = nullptr; - MSHADOW_TYPE_SWITCH(dtype, DType, - { op = new SequenceMaskOp(param); }) + MSHADOW_TYPE_SWITCH(dtype, DType, { + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceMaskOp(param); + }); + }); return op; } @@ -39,7 +42,12 @@ Operator *CreateOp(SequenceMaskParam param, int dtype) { Operator *SequenceMaskProp::CreateOperatorEx(Context ctx, std::vector *in_shape, std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + if (in_type->size() >= 2 && (*in_type)[1] != -1) { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]); + } + + // sequence_length not passed in, so fall back to using input array dtype for second argument + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]); } DMLC_REGISTER_PARAMETER(SequenceMaskParam); diff --git 
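What `SequenceMask0Kernel` above does for the (seqlen, batch, rest) layout, as a self-contained sketch with rest size 1 and assumed per-batch lengths playing the role of the IType indices:

#include <cstdio>

int main() {
  const int max_len = 4, batch = 2;
  float data[4][2] = {{1, 5}, {2, 6}, {3, 7}, {4, 8}};
  const int lengths[2] = {2, 3};
  const float value = 0.0f;
  // Positions at or beyond each batch's length are overwritten with `value`.
  for (int b = 0; b < batch; ++b)
    for (int s = lengths[b]; s < max_len; ++s)
      data[s][b] = value;
  for (int s = 0; s < max_len; ++s)
    std::printf("%g %g\n", data[s][0], data[s][1]);
  // column 0 -> 1 2 0 0, column 1 -> 5 6 7 0
  return 0;
}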
a/src/operator/sequence_mask.cu b/src/operator/sequence_mask.cu index 2ca883224..cec627c4c 100644 --- a/src/operator/sequence_mask.cu +++ b/src/operator/sequence_mask.cu @@ -29,10 +29,13 @@ namespace mxnet { namespace op { -template <> Operator *CreateOp(SequenceMaskParam param, int dtype) { +template <> Operator *CreateOp(SequenceMaskParam param, int dtype, int itype) { Operator *op = NULL; - MSHADOW_TYPE_SWITCH(dtype, DType, - { op = new SequenceMaskOp(param); }) + MSHADOW_TYPE_SWITCH(dtype, DType, { + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceMaskOp(param); + }); + }); return op; } diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h index 5c48729e1..eb9f71ccc 100644 --- a/src/operator/sequence_reverse-inl.h +++ b/src/operator/sequence_reverse-inl.h @@ -65,14 +65,14 @@ struct SequenceReverseParam : public dmlc::Parameter { }; struct ReverseKernel { - template + template MSHADOW_XINLINE static void Map(const int i, DType *const out_data, const DType *const in_data, const OpReqType req, const index_t max_seq_len, const index_t batch_size, const index_t other_dim, const index_t numel, - const DType *const indices) { + const IType *const indices) { for (index_t batch = 0; batch < batch_size; ++batch) { const index_t num_seq = indices ? static_cast(indices[batch]) : max_seq_len; @@ -102,13 +102,13 @@ struct ReverseKernel { } }; -template +template class SequenceReverseOp : public Operator { public: explicit SequenceReverseOp(SequenceReverseParam p) { this->param_ = p; } void sequence_reverse(const mshadow::Tensor &data, const mshadow::Tensor &out, - const OpReqType req, const DType *const indices, + const OpReqType req, const IType *const indices, mshadow::Stream *const s) { using namespace mshadow; using namespace mshadow::expr; @@ -145,9 +145,9 @@ class SequenceReverseOp : public Operator { Tensor out = out_data[seq_reverse::kOut].get_with_shape(s3, s); - const DType *const indices = + const IType *const indices = param_.use_sequence_length - ? in_data[seq_reverse::kSequenceLength].dptr() + ? in_data[seq_reverse::kSequenceLength].dptr() : nullptr; sequence_reverse(data, out, req[seq_reverse::kOut], indices, s); @@ -179,9 +179,9 @@ class SequenceReverseOp : public Operator { Tensor output_grad = out_grad[seq_reverse::kOut].get_with_shape(s3, s); - const DType *const indices = + const IType *const indices = param_.use_sequence_length - ? in_data[seq_reverse::kSequenceLength].dptr() + ? 
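Likewise for `ReverseKernel` below: only the first `length` steps of each batch are reversed, and the padded tail is copied through. A standalone sketch with a single batch:

#include <cstdio>

int main() {
  const int max_len = 5;
  const int in[5] = {1, 2, 3, 4, 0};  // one sequence of length 4, one pad step
  int out[5];
  const int length = 4;               // plays the role of the IType indices
  for (int s = 0; s < max_len; ++s)
    out[s] = (s < length) ? in[length - 1 - s] : in[s];
  for (int s = 0; s < max_len; ++s) std::printf("%d ", out[s]);  // 4 3 2 1 0
  return 0;
}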
in_data[seq_reverse::kSequenceLength].dptr() : nullptr; sequence_reverse(output_grad, data_grad, req[seq_reverse::kData], indices, @@ -193,7 +193,7 @@ class SequenceReverseOp : public Operator { }; // class SequenceReverseOp template -Operator *CreateOp(SequenceReverseParam param, int dtype); +Operator *CreateOp(SequenceReverseParam param, int dtype, int itype); #if DMLC_USE_CXX11 class SequenceReverseProp : public OperatorProperty { @@ -249,8 +249,6 @@ class SequenceReverseProp : public OperatorProperty { for (size_t i = 0; i < in_type->size(); ++i) { if ((*in_type)[i] == -1) { (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); } } out_type->clear(); diff --git a/src/operator/sequence_reverse.cc b/src/operator/sequence_reverse.cc index 21cab7891..9225b6b5d 100644 --- a/src/operator/sequence_reverse.cc +++ b/src/operator/sequence_reverse.cc @@ -28,10 +28,13 @@ namespace mxnet { namespace op { template <> -Operator *CreateOp(SequenceReverseParam param, int dtype) { +Operator *CreateOp(SequenceReverseParam param, int dtype, int itype) { Operator *op = nullptr; - MSHADOW_TYPE_SWITCH(dtype, DType, - { op = new SequenceReverseOp(param); }) + MSHADOW_TYPE_SWITCH(dtype, DType, { + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceReverseOp(param); + }); + }); return op; } @@ -39,7 +42,13 @@ Operator *CreateOp(SequenceReverseParam param, int dtype) { Operator *SequenceReverseProp::CreateOperatorEx( Context ctx, std::vector *in_shape, std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + + if (in_type->size() >= 2 && (*in_type)[1] != -1) { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[1]); + } + + // sequence_length not passed in, so fall back to using input array dtype for second argument + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_type)[0]); } DMLC_REGISTER_PARAMETER(SequenceReverseParam); diff --git a/src/operator/sequence_reverse.cu b/src/operator/sequence_reverse.cu index 1edc9c13d..db5b416b3 100644 --- a/src/operator/sequence_reverse.cu +++ b/src/operator/sequence_reverse.cu @@ -28,11 +28,13 @@ namespace mxnet { namespace op { -template <> Operator *CreateOp(SequenceReverseParam param, int dtype) { +template <> Operator *CreateOp(SequenceReverseParam param, int dtype, int itype) { Operator *op = nullptr; MSHADOW_TYPE_SWITCH(dtype, DType, { - op = new SequenceReverseOp(param); - }) + MSHADOW_TYPE_SWITCH(itype, IType, { + op = new SequenceReverseOp(param); + }); + }); return op; } diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv.cc b/src/operator/subgraph/mkldnn/mkldnn_conv.cc index 65e0e5c4b..499d7390e 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_conv.cc @@ -43,10 +43,10 @@ static void UpdateConvWeightBias(NDArray *weight, NDArray *bias, bool no_bias, true, beta.dtype()); const DType *weight_ptr = weight->data().dptr(); const DType *bias_ptr = no_bias ? 
nullptr : bias->data().dptr(); - const DType *gamma_ptr = gamma.Reorder2Default().data().dptr(); - const DType *beta_ptr = beta.Reorder2Default().data().dptr(); - const DType *mean_ptr = mean.Reorder2Default().data().dptr(); - const DType *var_ptr = variance.Reorder2Default().data().dptr(); + const DType *gamma_ptr = gamma.data().dptr(); + const DType *beta_ptr = beta.data().dptr(); + const DType *mean_ptr = mean.data().dptr(); + const DType *var_ptr = variance.data().dptr(); DType *update_weight_ptr = update_weight.data().dptr(); DType *update_bias_ptr = update_bias.data().dptr(); size_t channel = gamma.shape()[0]; @@ -77,23 +77,17 @@ static inline size_t GetInSumIndex(const MKLDNNConvFusionParam ¶m) { } template -static void QuantizeConvWeightBias(NDArray *weight, NDArray *bias, - bool has_bias, float data_scale, - bool weight_channelwise_scale, - std::vector *weight_scales) { +static std::vector GetWeightScales(const NDArray &weight, bool weight_channelwise_scale) { using red::limits::MaxValue; using red::limits::MinValue; - const DType *weight_ptr = weight->data().dptr(); - NDArray quantized_weight = NDArray(weight->storage_type(), weight->shape(), - weight->ctx(), true, mshadow::kInt8); - int8_t *quan_weight_ptr = quantized_weight.data().dptr(); - size_t channel = weight->shape()[0]; + std::vector weight_scales; + const DType *weight_ptr = weight.data().dptr(); + size_t channel = weight.shape()[0]; // TODO(Zhennan): Handle the case weight is not in dims 4. - size_t offset = weight->shape()[1] * weight->shape()[2] * weight->shape()[3]; + size_t offset = weight.shape()[1] * weight.shape()[2] * weight.shape()[3]; std::vector weight_c_min(channel, MaxValue()); std::vector weight_c_max(channel, MinValue()); -#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) for (int c = 0; c < static_cast(channel); ++c) { const DType *p1 = weight_ptr + c * offset; for (size_t k = 0; k < offset; ++k) { @@ -105,16 +99,10 @@ static void QuantizeConvWeightBias(NDArray *weight, NDArray *bias, } if (weight_channelwise_scale) { - weight_scales->resize(channel); -#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) + weight_scales.resize(channel); for (int c = 0; c < static_cast(channel); ++c) { DType weight_range = MaxAbs(weight_c_min[c], weight_c_max[c]); - weight_scales->at(c) = kInt8Range / weight_range; - const DType *fp_ptr = weight_ptr + c * offset; - int8_t *quan_ptr = quan_weight_ptr + c * offset; - for (size_t k = 0; k < offset; ++k) { - quan_ptr[k] = std::round(weight_scales->at(c) * fp_ptr[k]); - } + weight_scales[c] = kInt8Range / weight_range; } } else { DType total_min = weight_c_min[0]; @@ -123,74 +111,73 @@ static void QuantizeConvWeightBias(NDArray *weight, NDArray *bias, if (total_min > weight_c_min[c]) total_min = weight_c_min[c]; if (total_max < weight_c_max[c]) total_max = weight_c_max[c]; } - weight_scales->resize(1); + weight_scales.resize(1); DType weight_range = MaxAbs(total_min, total_max); - weight_scales->at(0) = kInt8Range / weight_range; -#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) - for (int c = 0; c < static_cast(channel); ++c) { - const DType *fp_ptr = weight_ptr + c * offset; - int8_t *quan_ptr = quan_weight_ptr + c * offset; - for (size_t k = 0; k < offset; ++k) { - quan_ptr[k] = std::round(weight_scales->at(0) * fp_ptr[k]); - } - } + weight_scales[0] = kInt8Range / weight_range; } - - *weight = quantized_weight; - if (has_bias) { - const DType 
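A standalone rendition of the channel-wise branch of `GetWeightScales` above, with made-up weights; each output channel gets `kInt8Range / max|w|` over that channel:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const float kInt8Range = 127.0f;
  // 2 output channels, 3 weights each.
  const std::vector<std::vector<float>> w = {{0.5f, -2.0f, 1.0f},
                                             {0.25f, 0.1f, -0.2f}};
  for (const auto& channel : w) {
    float max_abs = 0.0f;
    for (float v : channel) max_abs = std::max(max_abs, std::abs(v));
    std::printf("scale = %g\n", kInt8Range / max_abs);  // 63.5, then 508
  }
  return 0;
}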
*bias_ptr = bias->data().dptr(); - NDArray quantized_bias = NDArray(bias->storage_type(), bias->shape(), - bias->ctx(), true, mshadow::kInt32); - int32_t *quan_bias_ptr = quantized_bias.data().dptr(); - for (size_t c = 0; c < channel; ++c) { - auto weight_scale = - weight_channelwise_scale ? weight_scales->at(c) : weight_scales->at(0); - float bias_scale = weight_scale * data_scale; - quan_bias_ptr[c] = std::round(bias_scale * bias_ptr[c]); - } - *bias = quantized_bias; - } -} - -static void ConvFusionFallBackCompute() { - LOG(FATAL) << "Don't know how to do ConvFusionFallBackCompute!"; + return weight_scales; } -static void ConvolutionFusionComputeExCPU(const MKLDNNConvFullParam &full_param, - const OpContext &ctx, - MKLDNNConvForward *fwd, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (SupportMKLDNNConv(full_param.conv_param, inputs[0])) { - MKLDNNConvolutionForwardFullFeature(full_param, ctx, fwd, inputs, req, outputs); - return; +static void ConvertWeightBias2MKLDNN(const MKLDNNConvFullParam ¶m, + mkldnn::convolution_forward::primitive_desc fwd_pd, + NDArray *weight, NDArray *bias, bool has_bias, + float data_scale, const std::vector &weight_scales) { + MKLDNNStream *stream = MKLDNNStream::Get(); + const auto new_weight = NDArray(fwd_pd.weights_primitive_desc()); + const auto conv_weights_memory = new_weight.GetMKLDNNData(); + primitive_attr weight_attr; + if (weight_scales.size()) { + const int weight_mask = (weight_scales.size()) == 1 ? 0 : 1; + weight_attr.set_int_output_round_mode(round_mode::round_nearest); + weight_attr.set_output_scales(weight_mask, weight_scales); } - ConvFusionFallBackCompute(); + auto default_weights_memory = GetWeights(*weight, param.conv_param.num_group); + if (default_weights_memory == nullptr) default_weights_memory = weight->GetMKLDNNData(); + const auto weight_reorder_pd = + mkldnn::reorder::primitive_desc(default_weights_memory->get_primitive_desc(), + conv_weights_memory->get_primitive_desc(), weight_attr); + stream->RegisterPrim( + mkldnn::reorder(weight_reorder_pd, *default_weights_memory, *conv_weights_memory)); + + NDArray new_bias; + if (has_bias && data_scale) { + std::vector bias_scales(weight_scales.size()); + for (size_t c = 0; c < weight_scales.size(); ++c) { + bias_scales[c] = weight_scales[c] * data_scale; + } + new_bias = NDArray(fwd_pd.bias_primitive_desc()); + const auto conv_bias_memory = new_bias.GetMKLDNNData(); + const int bias_mask = (bias_scales.size()) == 1 ? 
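The bias scales built in `ConvertWeightBias2MKLDNN` above follow the usual rule that quantized bias lives at the int32 accumulator's scale, i.e. `bias_scale[c] = weight_scale[c] * data_scale`, one entry per channel (or a single entry for tensor-wise scaling). A sketch with the example scales from the previous snippets:

#include <cstdio>
#include <vector>

int main() {
  const float data_scale = 25.5f;  // e.g. uint8 range / calibrated data range
  const std::vector<float> weight_scales = {63.5f, 508.0f};
  std::vector<float> bias_scales(weight_scales.size());
  for (size_t c = 0; c < weight_scales.size(); ++c)
    bias_scales[c] = weight_scales[c] * data_scale;
  std::printf("%g %g\n", bias_scales[0], bias_scales[1]);  // 1619.25 12954
  return 0;
}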
0 : 1; + primitive_attr bias_attr; + bias_attr.set_int_output_round_mode(round_mode::round_nearest); + bias_attr.set_output_scales(bias_mask, bias_scales); + auto bias_weights_memory = bias->GetMKLDNNData(); + auto bias_reorder_pd = + mkldnn::reorder::primitive_desc(bias_weights_memory->get_primitive_desc(), + conv_bias_memory->get_primitive_desc(), bias_attr); + stream->RegisterPrim( + mkldnn::reorder(bias_reorder_pd, *bias_weights_memory, *conv_bias_memory)); + } + stream->Submit(); + *weight = new_weight; + if (has_bias && data_scale) *bias = new_bias; } class SgMKLDNNConvOperator { public: explicit SgMKLDNNConvOperator(const nnvm::NodeAttrs &attrs) - : initalized_(false), - subgraph_sym_(*attrs.subgraphs[0]), - param_(nnvm::get(attrs.parsed)), - inplace_(false) {} + : subgraph_sym_(*attrs.subgraphs[0]), + param_(nnvm::get(attrs.parsed)) {} void Forward(const OpContext &ctx, const std::vector &inputs, const std::vector &req, const std::vector &outputs); - void Backward(const OpContext &ctx, const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - LOG(FATAL) << "Not implemented: subgraph mkldnn Conv only supports " - "inference computation."; - } - private: - bool initalized_; + bool initalized_{false}; + bool inplace_{false}; + bool post_requantize_{false}; nnvm::Symbol subgraph_sym_; MKLDNNConvFusionParam param_; std::shared_ptr fwd_; @@ -200,10 +187,12 @@ class SgMKLDNNConvOperator { float cached_data_max_; float cached_sum_min_; float cached_sum_max_; + float cached_output_min_; + float cached_output_max_; size_t weight_ver_; size_t bias_ver_; + float data_scale_{0.0f}; std::vector weight_scales_; - bool inplace_; }; void SgMKLDNNConvOperator::Forward(const OpContext &ctx, @@ -239,10 +228,6 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, float sum_max = (mkldnn_param.with_sum && mkldnn_param.quantized) ? inputs[idx++].data().dptr()[0] : 0.0; - float *out_min_ptr = - mkldnn_param.quantized ? outputs[kMin].data().dptr() : nullptr; - float *out_max_ptr = - mkldnn_param.quantized ? outputs[kMax].data().dptr() : nullptr; CHECK_EQ(input_size, idx); bool has_bias = mkldnn_param.with_bn || !conv_param.no_bias; NDArray data = inputs[in_data]; @@ -251,18 +236,22 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, // Copy inputs[in_sum] into outputs[kOut] in case inplace optimization failed. if (mkldnn_param.with_sum) { if (!initalized_) { - auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); - auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); // TODO(zhennan): Currently, mkldnn fallback mechanism will break inplace option, // which make check (req[kOut] == kWriteInplace) useless. 
+ auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); + auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); if (in_mkl_mem->get_data_handle() == out_mkl_mem->get_data_handle()) { inplace_ = true; } } if (!inplace_) { auto in_mkl_mem = inputs[in_sum].GetMKLDNNData(); - const_cast(outputs[kOut]).CopyFrom(*in_mkl_mem); - output = NDArray(outputs[kOut].GetMKLDNNData()); + auto out_mkl_mem = outputs[kOut].GetMKLDNNData(); + mkldnn_mem_ptr tmp_mem( + new mkldnn::memory(in_mkl_mem->get_primitive_desc(), out_mkl_mem->get_data_handle())); + MKLDNNStream::Get()->RegisterMem(tmp_mem); + mxnet::MKLDNNCopy(*in_mkl_mem, tmp_mem.get()); + output = NDArray(tmp_mem); } } @@ -284,19 +273,6 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, } } } - bool post_requantize = false; - if (mkldnn_param.quantized) { - if (mkldnn_param.min_calib_range.has_value() && - mkldnn_param.max_calib_range.has_value()) { - post_requantize = true; - mkldnn_param.weight_channelwise_scale = true; - *out_min_ptr = mkldnn_param.min_calib_range.value(); - *out_max_ptr = mkldnn_param.max_calib_range.value(); - } else { - mkldnn_param.weight_channelwise_scale = false; - } - } - if (!initalized_) { cached_data_min_ = data_min; cached_data_max_ = data_max; @@ -306,7 +282,7 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, cached_weight_ = inputs[in_weight].Reorder2Default(); weight_ver_ = inputs[in_weight].version(); if (!conv_param.no_bias) { - cached_bias_ = inputs[in_bias].Reorder2Default(); + cached_bias_ = inputs[in_bias]; bias_ver_ = inputs[in_bias].version(); } else { cached_bias_ = NDArray(); @@ -327,13 +303,22 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, // Quantize weight and bias. if (mkldnn_param.quantized) { CHECK(data.dtype() == mshadow::kInt8 || data.dtype() == mshadow::kUint8); + if (cached_data_min_ < 0.0f) { + CHECK_EQ(data.dtype(), mshadow::kInt8) + << "Expect int8 when data_min < 0.0, consider quantize model with int8."; + } + auto weight_channelwise_scale = false; + if (mkldnn_param.min_calib_range.has_value() && mkldnn_param.max_calib_range.has_value()) { + cached_output_min_ = mkldnn_param.min_calib_range.value(); + cached_output_max_ = mkldnn_param.max_calib_range.value(); + post_requantize_ = true; + weight_channelwise_scale = true; + } auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range; - float data_scale = data_range / MaxAbs(cached_data_min_, cached_data_max_); + data_scale_ = data_range / MaxAbs(cached_data_min_, cached_data_max_); MSHADOW_REAL_TYPE_SWITCH(cached_weight_.dtype(), DType, { - QuantizeConvWeightBias(&cached_weight_, &cached_bias_, - has_bias, data_scale, - mkldnn_param.weight_channelwise_scale, - &weight_scales_); + weight_scales_ = + GetWeightScales(cached_weight_, weight_channelwise_scale); }); // Collect scale. size_t channel = cached_weight_.shape()[0]; @@ -341,29 +326,20 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, float out_range; float quantized_out_range; float output_scale; - if (cached_data_min_ < 0.0) { - // TODO(zhennan): Support int8 input when mkldnn supports. - LOG(FATAL) << "Can't handle negetive value for QuantizeData"; - } if (mkldnn_param.with_sum) { auto quantized_sum_range = cached_sum_min_ < 0 ? kInt8Range : kUint8Range; sum_in_scale = quantized_sum_range / MaxAbs(cached_sum_min_, cached_sum_max_); } - if (post_requantize) { - quantized_out_range = - IsOutputUInt8(mkldnn_param) ? 
kUint8Range : kInt8Range; - out_range = MaxAbs(*out_min_ptr, *out_max_ptr); + if (post_requantize_) { + quantized_out_range = IsOutputUInt8(mkldnn_param) ? kUint8Range : kInt8Range; + out_range = MaxAbs(cached_output_min_, cached_output_max_); output_scale = quantized_out_range / out_range; - full_conv_param.requantize_scales.resize(channel); - for (size_t c = 0; c < channel; c++) { - auto weight_scale = mkldnn_param.weight_channelwise_scale - ? weight_scales_[c] - : weight_scales_[0]; - full_conv_param.requantize_scales[c] = - output_scale / data_scale / weight_scale; + full_conv_param.requantize_scales.resize(weight_channelwise_scale ? channel : 1); + for (size_t c = 0; c < full_conv_param.requantize_scales.size(); c++) { + full_conv_param.requantize_scales[c] = output_scale / data_scale_ / weight_scales_[c]; } } else { - output_scale = data_scale * weight_scales_[0]; + output_scale = data_scale_ * weight_scales_[0]; full_conv_param.requantize_scales.resize(0); } if (mkldnn_param.with_sum) @@ -372,23 +348,44 @@ void SgMKLDNNConvOperator::Forward(const OpContext &ctx, fwd_.reset(new MKLDNNConvForward( full_conv_param, ctx.is_train, data, cached_weight_, has_bias ? &cached_bias_ : nullptr, output)); + ConvertWeightBias2MKLDNN(full_conv_param, fwd_->fwd_pd, &cached_weight_, &cached_bias_, + has_bias, data_scale_, weight_scales_); + fwd_->SetNewMem(*data.GetMKLDNNData(), *cached_weight_.GetMKLDNNData(), + has_bias ? cached_bias_.GetMKLDNNData() : nullptr, + *output.GetMKLDNNData()); + initalized_ = true; } - initalized_ = true; - std::vector new_inputs; - std::vector new_req; - if (has_bias) { - new_inputs = {data, cached_weight_, cached_bias_}; - new_req = {req[in_data], req[in_weight], req[in_bias]}; + + if (mkldnn_param.quantized) { + auto data_mem = data.GetMKLDNNDataReorder(fwd_->fwd_pd.src_primitive_desc()); + mkldnn::memory *mem = output.CreateMKLDNNData(fwd_->fwd_pd.dst_primitive_desc()); + fwd_->SetNewMem(*data_mem, *mem); + MKLDNNStream::Get()->RegisterPrim(fwd_->GetFwd()); + MKLDNNStream::Get()->Submit(); } else { - new_inputs = {data, cached_weight_}; - new_req = {req[in_data], req[in_weight]}; + std::vector new_inputs; + std::vector new_req; + if (has_bias) { + new_inputs = {data, cached_weight_, cached_bias_}; + new_req = {req[in_data], req[in_weight], req[in_bias]}; + } else { + new_inputs = {data, cached_weight_}; + new_req = {req[in_data], req[in_weight]}; + } + MKLDNNConvolutionForwardFullFeature(full_conv_param, ctx, fwd_.get(), new_inputs, new_req, + {output}); + } + if (post_requantize_) { + float *out_min_ptr = outputs[kMin].data().dptr(); + float *out_max_ptr = outputs[kMax].data().dptr(); + *out_min_ptr = cached_output_min_; + *out_max_ptr = cached_output_max_; } - ConvolutionFusionComputeExCPU(full_conv_param, ctx, fwd_.get(), new_inputs, - new_req, {output}); - if (mkldnn_param.with_sum) { auto out = const_cast(outputs[kOut]); - out.UpdateMKLDNNMemDesc(); + auto format = static_cast( + fwd_->fwd_pd.dst_primitive_desc().desc().data.format); + out.UpdateMKLDNNMemDesc(format); } } @@ -405,7 +402,7 @@ static uint32_t SgMKLDNNConvNumInputs(const NodeAttrs &attrs) { auto const ¶m = nnvm::get(attrs.parsed); auto num_input = DefaultSubgraphOpNumInputs(attrs); if (param.full_conv_param.mkldnn_param.quantized) - return num_input + 2 + param.full_conv_param.mkldnn_param.with_sum ? 2 : 0; + return num_input + 2 + (param.full_conv_param.mkldnn_param.with_sum ? 
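The per-channel requantize scale computed above combines three quantities; worked through with assumed calibration numbers (standalone arithmetic, not the MKLDNN code path):

#include <cstdio>

int main() {
  // The int32 accumulator carries data_scale * weight_scale, so mapping it into
  // the calibrated output range needs output_scale / (data_scale * weight_scale).
  const float data_scale = 25.5f;    // uint8 input calibrated to [0, 10]
  const float weight_scale = 63.5f;  // int8 weights calibrated to |w| <= 2
  const float out_range = 20.0f;     // MaxAbs(min_calib, max_calib)
  const float output_scale = 127.0f / out_range;
  const float requantize = output_scale / data_scale / weight_scale;
  std::printf("%g\n", requantize);   // ~0.00392, applied per output channel
  return 0;
}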
2 : 0); else return num_input; } @@ -425,6 +422,7 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs *attrs) { os << ")"; throw dmlc::ParamError(os.str()); } + CHECK_EQ(attrs->subgraphs.size(), 1); auto subgraph_sym = attrs->subgraphs[0]; DFSVisit(subgraph_sym->outputs, [&](const nnvm::NodePtr &node) { if (node->is_variable()) return; @@ -442,10 +440,23 @@ static void SgMKLDNNConvParamParser(nnvm::NodeAttrs *attrs) { attrs->parsed = std::move(param_); } -static std::vector SgMKLDNNConvListInputNames( - const NodeAttrs &attrs) { +static std::vector SgMKLDNNConvListInputNames(const NodeAttrs &attrs) { auto const ¶m = nnvm::get(attrs.parsed); - std::vector input_names = DefaultSubgraphOpListInputs(attrs); + std::vector input_names; + input_names.emplace_back("data"); + input_names.emplace_back("weight"); + if (!param.full_conv_param.conv_param.no_bias) { + input_names.emplace_back("bias"); + } + if (param.full_conv_param.mkldnn_param.with_bn) { + input_names.emplace_back("gamma"); + input_names.emplace_back("beta"); + input_names.emplace_back("mean"); + input_names.emplace_back("var"); + } + if (param.full_conv_param.mkldnn_param.with_sum) { + input_names.emplace_back("sum"); + } if (param.full_conv_param.mkldnn_param.quantized) { input_names.emplace_back("data_min"); input_names.emplace_back("data_max"); @@ -454,6 +465,7 @@ static std::vector SgMKLDNNConvListInputNames( input_names.emplace_back("sum_max"); } } + CHECK_EQ(input_names.size(), SgMKLDNNConvNumInputs(attrs)); return input_names; } diff --git a/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc b/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc index d55375d22..ded316090 100644 --- a/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc +++ b/src/operator/subgraph/mkldnn/mkldnn_conv_property.cc @@ -66,17 +66,21 @@ class SgMKLDNNConvSelector : public SubgraphSelector { } bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override { - if (status == kFail || status == kSuccess || new_node.is_variable()) - return false; // If n isn't the last matched node, then we encoutered a internal // branch, we should pop out the node behind n and stop fusion. if (matched_list.back() != &n) { - while (matched_list.back() != &n) { - matched_list.pop_back(); + if (std::find(matched_list.begin(), matched_list.end(), &n) != + matched_list.end()) { + while (matched_list.back() != &n) { + matched_list.pop_back(); + } } status = kSuccess; return false; } + if (status == kFail || status == kSuccess || new_node.is_variable()) + return false; + // Use status machine to do selection. The status change is // kStart -> kBN -> kSum -> kSuccess switch (status) { @@ -99,12 +103,11 @@ class SgMKLDNNConvSelector : public SubgraphSelector { nnvm::get(new_node.attrs.parsed); if (param.act_type == activation::kReLU) { matched_list.push_back(&new_node); - // If we find conv+relu, then we can't match bn anymore. - if (status == kStart) status = kBN; - return true; - } else { + // If we find conv+relu, then we can't match anymore. + // TODO(zhennan): mkldnn only supports convolution + relu + sum in + // int8, not in fp32. So we disable this pattern at moment. 
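The parenthesization fix completed above is worth spelling out: `?:` binds more loosely than `+`, so the old form used the entire sum as the condition and always yielded 2 (most compilers warn about exactly this with -Wparentheses):

#include <cassert>

int main() {
  const unsigned num_input = 3;
  const bool with_sum = false;
  const unsigned buggy = num_input + 2 + with_sum ? 2 : 0;    // (3+2+0) ? 2 : 0
  const unsigned fixed = num_input + 2 + (with_sum ? 2 : 0);  // 3 + 2 + 0
  assert(buggy == 2);
  assert(fixed == 5);
  return 0;
}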
status = kSuccess; - return false; + return true; } } status = kSuccess; @@ -117,7 +120,15 @@ class SgMKLDNNConvSelector : public SubgraphSelector { if (status == kFail) { return std::vector(0); } else { - return candidates; + std::vector ret; + for (auto i : matched_list) { + auto non_const_i = const_cast(i); + if (std::find(candidates.begin(), candidates.end(), non_const_i) != + candidates.end()) { + ret.push_back(non_const_i); + } + } + return ret; } } }; @@ -130,8 +141,7 @@ class SgMKLDNNConvProperty : public SubgraphProperty { disable_conv_relu = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_FUSE_CONV_RELU", 0); disable_conv_sum = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_FUSE_CONV_SUM", 0); - disable_all = - disable_all && disable_conv_bn && disable_conv_relu && disable_conv_sum; + disable_all = disable_all || (disable_conv_bn && disable_conv_relu && disable_conv_sum); if (disable_all) { LOG(INFO) << "MKLDNN Convolution optimization pass is disabled."; } else { diff --git a/src/operator/subgraph/partition_graph.cc b/src/operator/subgraph/partition_graph.cc index 2605f9354..a034ae8b2 100644 --- a/src/operator/subgraph/partition_graph.cc +++ b/src/operator/subgraph/partition_graph.cc @@ -515,16 +515,14 @@ void FindInputEntries( nnvm::NodeEntryMap>* input_entry_map) { const auto& indexed_graph = g.indexed_graph(); int label = -1; - for (size_t i = 0; i < subgraph_nodes.size(); ++i) { + for (auto subgraph_node : subgraph_nodes) { if (label == -1) { - label = subgraph_nodes[i]->label; + label = subgraph_node->label; } else { - CHECK_EQ(subgraph_nodes[i]->label, label); + CHECK_EQ(subgraph_node->label, label); } - - auto& inputs = subgraph_nodes[i]->node->inputs; - for (size_t j = 0; j < inputs.size(); ++j) { - auto& e = inputs[j]; + auto& inputs = subgraph_node->node->inputs; + for (auto &e : inputs) { if (input_entry_map->count(e) != 0) { input_entry_map->at(e).push_back(&e); } else { @@ -575,37 +573,34 @@ void FindOutputEntries(Graph* g, } output_entries->push_back(entry); }; - - for (size_t i = 0; i < subgraph_nodes.size(); ++i) { + for (auto subgraph_node : subgraph_nodes) { if (label == -1) { - label = subgraph_nodes[i]->label; + label = subgraph_node->label; } else { - CHECK_EQ(subgraph_nodes[i]->label, label); + CHECK_EQ(subgraph_node->label, label); } - for (auto it = subgraph_nodes[i]->outputs.begin(); - it != subgraph_nodes[i]->outputs.end(); ++it) { - if (indexed_graph.exist(it->first)) { + for (auto &output_node : subgraph_node->outputs) { + if (indexed_graph.exist(output_node.first)) { // if the output node is a normal graph node (not a subgraph node) - const auto nid = indexed_graph.node_id(it->first); + const auto nid = indexed_graph.node_id(output_node.first); // this is a node not belonging to the current subgraph if (simple_nodes[nid]->label != label) { - for (auto idx : it->second) { + for (auto idx : output_node.second) { add_output(&simple_nodes[nid]->node->inputs[idx]); } } } else { // if the output node is a subgraph node // two graphs are adjacent - for (auto idx : it->second) { - add_output(&it->first->inputs[idx]); + for (auto idx : output_node.second) { + add_output(&output_node.first->inputs[idx]); } } } } // Check if current subgraph contains a node which is the last node // of the whole graph. If so, save its corresponding entry as well. - for (size_t i = 0; i < g->outputs.size(); ++i) { - auto& entry = g->outputs[i]; + for (auto &entry : g->outputs) { // The entry might has been updated as an output of // a subgraph node. 
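Similarly, the `disable_all` fix above changes the combination rule: the pass should be off when either the global switch is set or every individual fusion is switched off. With the old `&&` chain, disabling all three sub-fusions alone had no effect (flag names below mirror the env switches, values assumed):

#include <cassert>

int main() {
  const bool disable_all_env = false;
  const bool no_bn = true, no_relu = true, no_sum = true;
  const bool old_behaviour = disable_all_env && no_bn && no_relu && no_sum;    // false
  const bool new_behaviour = disable_all_env || (no_bn && no_relu && no_sum);  // true
  assert(!old_behaviour && new_behaviour);
  return 0;
}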
In this case, no need // to check its source for the current subgraph. Otherwise, diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 7f69395d1..d0079b545 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -71,7 +71,7 @@ static bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs, // relu MXNET_OPERATOR_REGISTER_UNARY_WITH_RSP_CSR(relu, cpu, mshadow_op::relu) -.describe(R"code(Computes rectified linear. +.describe(R"code(Computes rectified linear activation. .. math:: max(features, 0) @@ -916,6 +916,22 @@ MXNET_OPERATOR_REGISTER_BINARY(_backward_erf) .set_attr("FCompute", ElemwiseBinaryOp::Compute>); +// erfinv +MXNET_OPERATOR_REGISTER_UNARY(erfinv) +.describe(R"code(Returns element-wise inverse gauss error function of the input. + +Example:: + + erfinv([0, 0.5., -1.]) = [0., 0.4769, -inf] + +)code" ADD_FILELINE) +.set_attr("FCompute", UnaryOp::Compute) +.set_attr("FGradient", ElemwiseGradUseIn{"_backward_erfinv"}); + +MXNET_OPERATOR_REGISTER_BINARY(_backward_erfinv) +.set_attr("FCompute", + ElemwiseBinaryOp::Compute>); + // rcbrt MXNET_OPERATOR_REGISTER_UNARY(rcbrt) .describe(R"code(Returns element-wise inverse cube-root value of the input. diff --git a/src/operator/tensor/elemwise_unary_op_basic.cu b/src/operator/tensor/elemwise_unary_op_basic.cu index 14f2be02a..642cb0e6e 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cu +++ b/src/operator/tensor/elemwise_unary_op_basic.cu @@ -62,6 +62,14 @@ NNVM_REGISTER_OP(_backward_erf) .set_attr("FCompute", ElemwiseBinaryOp::Compute>); +// erfinv +NNVM_REGISTER_OP(erfinv) +.set_attr("FCompute", UnaryOp::Compute); + +NNVM_REGISTER_OP(_backward_erfinv) +.set_attr("FCompute", + ElemwiseBinaryOp::Compute>); + // copy NNVM_REGISTER_OP(_copy) .set_attr("FCompute", UnaryOp::IdentityCompute) diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index c39418dbe..564171d2c 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -678,11 +678,14 @@ Examples:: // In this case we will get rows 0 and 1, then 1 and 2 (calculated by wrapping around). // Along axis 1 - take(x, [[0, 3], [-1, -2]], axis=1, mode='wrap') = [[[ 1., 2.], - [ 3., 4.]], + take(x, [[0, 3], [-1, -2]], axis=1, mode='wrap') = [[[ 1. 2.] + [ 2. 1.]] - [[ 3., 4.], - [ 5., 6.]]] + [[ 3. 4.] + [ 4. 3.]] + + [[ 5. 6.] + [ 6. 
5.]]] The storage type of ``take`` output depends upon the input storage type: diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 3b229cf38..97c4fa556 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -37,6 +37,7 @@ #include "broadcast_reduce_op.h" #include "./init_op.h" #include "../../common/static_array.h" +#include "./slice-inl.h" #if MXNET_USE_CUDA #include @@ -398,19 +399,15 @@ inline bool ExpandDimShape(const nnvm::NodeAttrs& attrs, return true; } -struct SliceParam : public dmlc::Parameter { - nnvm::Tuple> begin, end; - nnvm::Tuple> step; - DMLC_DECLARE_PARAMETER(SliceParam) { - DMLC_DECLARE_FIELD(begin) - .describe("starting indices for the slice operation, supports negative indices."); - DMLC_DECLARE_FIELD(end) - .describe("ending indices for the slice operation, supports negative indices."); - DMLC_DECLARE_FIELD(step) - .set_default(nnvm::Tuple>()) - .describe("step for the slice operation, supports negative values."); +// Currently MKLDNN only supports step = 1 or step has no value +inline bool SupportMKLDNNSlice(const SliceParam& param) { + if (param.step.ndim() == 0U) return true; + for (uint32_t i = 0; i < param.step.ndim(); ++i) { + if (param.step[i].has_value() && param.step[i].value() != 1) + return false; } -}; + return true; +} inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs, const int dev_mask, @@ -432,9 +429,19 @@ inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs, && (!param.step[0].has_value() || param.step[0].value() == 1)) { trivial_step = true; } - if (!dispatched && in_stype == kDefaultStorage) { - dispatched = storage_type_assign(&out_stype, kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); + + if (in_stype == kDefaultStorage) { +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == Context::kCPU && MKLDNNEnvSet() + && SupportMKLDNNSlice(param)) { + dispatched = storage_type_assign(&out_stype, kDefaultStorage, + dispatch_mode, dispatch_ex); + } +#endif + if (!dispatched) { + dispatched = storage_type_assign(&out_stype, kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); + } } if (!dispatched && in_stype == kCSRStorage && trivial_step) { @@ -2520,6 +2527,319 @@ void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs, }); } +namespace split_enum { +enum SplitOpInputs {kData}; +} // namespace split_enum + +struct SplitParam : public dmlc::Parameter { + TShape indices; + int axis; + bool squeeze_axis; + int sections; + DMLC_DECLARE_PARAMETER(SplitParam) { + DMLC_DECLARE_FIELD(indices) + .describe("Indices of splits. The elements should denote the boundaries of at which split" + " is performed along the `axis`."); + DMLC_DECLARE_FIELD(axis).set_default(1) + .describe("Axis along which to split."); + DMLC_DECLARE_FIELD(squeeze_axis).set_default(0) + .describe("If true, Removes the axis with length 1 from the shapes of the output arrays." + " **Note** that setting `squeeze_axis` to ``true`` removes axis with length 1" + " only along the `axis` which it is split." + " Also `squeeze_axis` can be set to ``true``" + " only if ``input.shape[axis] == num_outputs``."); + DMLC_DECLARE_FIELD(sections).set_default(0) + .describe("Number of sections if equally splitted. 
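The corrected `take` example above follows from `wrap` mode's index rule: indices are wrapped modulo the axis length, so for an axis of length 2, index 3 becomes 1 and index -1 becomes 1. A standalone check of the docstring's indices (`Wrap` is a hypothetical helper, not MXNet's kernel):

#include <cstdio>

static int Wrap(int idx, int len) {
  int r = idx % len;
  return r < 0 ? r + len : r;
}

int main() {
  std::printf("%d %d %d %d\n", Wrap(0, 2), Wrap(3, 2), Wrap(-1, 2), Wrap(-2, 2));
  // 0 1 1 0 -> the column picks behind rows like [1. 2.] / [2. 1.] above
  return 0;
}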
Default to 0 which means split by indices."); + } +}; // struct SplitParam + +inline TShape GetSplitIndices(const TShape& ishape, int axis, int sections) { + TShape indices(sections+1); + indices[0] = 0; + int64_t section_size = ishape[axis] / sections; + for (int i = 0; i < sections; ++i) { + indices[i+1] = section_size * (i + 1); + } + return indices; +} + +inline bool SplitOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + int dtype = (*in_attrs)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + const SplitParam& param = nnvm::get(attrs.parsed); + out_attrs->clear(); + int num_outputs = (param.sections > 0) ? param.sections : param.indices.ndim(); + for (int i = 0; i < num_outputs; ++i) { + out_attrs->push_back(dtype); + } + return true; +} + +inline bool SplitOpShape(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + using namespace mshadow; + const SplitParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), 1U); + TShape dshape = in_attrs->at(split_enum::kData); + TShape ishape = in_attrs->at(split_enum::kData); + if (dshape.ndim() == 0) return false; + if (param.axis >= 0) { + CHECK_LT(static_cast(param.axis), dshape.ndim()); + } else { + CHECK_LT(param.axis + dshape.ndim(), dshape.ndim()); + } + int real_axis = param.axis; + if (real_axis < 0) { + real_axis += dshape.ndim(); + } + const TShape indices = + (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices; + int num_outputs = (param.sections > 0) ? indices.ndim() - 1 : indices.ndim(); + // Pre-compute squeezed output shape for future usage + TShape squeezed_dshape = dshape; + for (int d = real_axis; d < static_cast(squeezed_dshape.ndim()) - 1; ++d) { + squeezed_dshape[d] = squeezed_dshape[d+1]; + } + squeezed_dshape = TShape(&squeezed_dshape[0], &squeezed_dshape[squeezed_dshape.ndim()-1]); + // Assign shape to every output + for (int i = 0; i < num_outputs; ++i) { + int start = indices[i]; + int end = (i < num_outputs - 1) ? indices[i + 1] : ishape[real_axis]; + CHECK(start < end) + << "start " << start << " is not less than end " << end << "for subarray " << i; + CHECK(end <= ishape[real_axis]) + << "end " << end << " is no less than the size of the axis " << ishape[real_axis]; + dshape[real_axis] = (end - start); + if (param.squeeze_axis) { + CHECK_EQ(end - start, 1U) << "expected axis size of 1 but got " << end - start; + SHAPE_ASSIGN_CHECK(*out_attrs, i, squeezed_dshape); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, i, dshape); + } + } + TShape back_calculate_dshape = ishape; + back_calculate_dshape[real_axis] = 0; + for (int d = 0; d < real_axis; ++d) { + back_calculate_dshape[d] = (*out_attrs)[0][d]; + } + if (param.squeeze_axis) { + back_calculate_dshape[real_axis] = num_outputs; + } else { + for (int i = 0; i < num_outputs; ++i) { + back_calculate_dshape[real_axis] += (*out_attrs)[i][real_axis]; + } + } + for (int d = real_axis + 1; d < static_cast(ishape.ndim()); ++d) { + if (param.squeeze_axis) { + back_calculate_dshape[d] = (*out_attrs)[0][d - 1]; + } else { + back_calculate_dshape[d] = (*out_attrs)[0][d]; + } + } + SHAPE_ASSIGN_CHECK(*in_attrs, split_enum::kData, back_calculate_dshape); + return true; +} + +struct SplitKernel { + /*! 
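Reviewer note (not part of the patch): the shape logic above first turns `sections` into a boundary table and then derives one output shape per `[indices[i], indices[i+1])` interval. A minimal NumPy mirror of that arithmetic, with names of our own choosing:

import numpy as np

def get_split_indices(axis_size, sections):
    # Boundary indices [0, s, 2s, ...], as computed by GetSplitIndices.
    section_size = axis_size // sections
    return [section_size * i for i in range(sections + 1)]

ishape = (4, 6, 2)
indices = get_split_indices(ishape[1], sections=3)  # [0, 2, 4, 6] along axis 1
out_shapes = [(4, indices[i + 1] - indices[i], 2) for i in range(3)]
assert out_shapes == [(4, 2, 2)] * 3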
+struct SplitKernel {
+  /*!
+   * \brief Map function for forward split_v2 operator
+   * \param i              global thread id
+   * \param in_data        ptr to input buffer
+   * \param out_data       ptr to ptr of outputs buffer
+   * \param indices        ptr to indices buffer
+   * \param num_sections   # of sections after split
+   * \param axis_size      size of the axis to be split on
+   * \param trailing_size  step size within the data buffer of the axis to be split on
+   */
+  template
+  static MSHADOW_XINLINE void Map(size_t i,
+                                  const DType *in_data, DType** out_data, const size_t* indices,
+                                  const size_t num_sections, const size_t axis_size,
+                                  const size_t trailing_size) {
+    size_t idx = i / trailing_size % axis_size;
+    size_t target = 0;
+    for (size_t section = 0;
+         section < num_sections && indices[section] <= idx;
+         target = section++) {}
+    DType* target_data = out_data[target];
+    const size_t mid_idx = idx - indices[target];
+    const size_t head_idx = i / (trailing_size * axis_size);
+    const size_t tail_idx = i % trailing_size;
+    const size_t section_size = indices[target + 1] - indices[target];
+    const size_t target_idx =
+      head_idx * trailing_size * section_size + mid_idx * trailing_size + tail_idx;
+    target_data[target_idx] = in_data[i];
+  }
+};
+
+struct ConcatenateKernel {
+  /*!
+   * \brief Map function for backward split_v2 operator
+   * \param i              global thread id
+   * \param out_grad       ptr to ptr of out grads buffer
+   * \param in_grad        ptr to input grad buffer
+   * \param indices        ptr to indices buffer
+   * \param num_sections   # of sections after split
+   * \param axis_size      size of the axis to be split on
+   * \param trailing_size  step size within the data buffer of the axis to be split on
+   */
+  template
+  static MSHADOW_XINLINE void Map(size_t i,
+                                  DType** out_grad, DType* in_grad, const size_t* indices,
+                                  const size_t num_sections, const size_t axis_size,
+                                  const size_t trailing_size) {
+    size_t idx = i / trailing_size % axis_size;
+    size_t src = 0;
+    for (size_t section = 0;
+         section < num_sections && indices[section] <= idx;
+         src = section++) {}
+    DType* src_grad = out_grad[src];
+    const size_t mid_idx = idx - indices[src];
+    const size_t head_idx = i / (trailing_size * axis_size);
+    const size_t tail_idx = i % trailing_size;
+    const size_t section_size = indices[src + 1] - indices[src];
+    const size_t src_idx =
+      head_idx * trailing_size * section_size + mid_idx * trailing_size + tail_idx;
+    in_grad[i] = src_grad[src_idx];
+  }
+};
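Reviewer note (not part of the patch): both kernels decompose the flat element index into (head, mid, tail) coordinates and locate the owning section by scanning the boundary table, exactly as in the loops above. An illustrative Python mirror, with a hypothetical helper name:

def split_target(i, indices, axis_size, trailing_size):
    idx = i // trailing_size % axis_size                   # position on the split axis
    # last section whose start boundary is <= idx
    target = max(s for s in range(len(indices) - 1) if indices[s] <= idx)
    mid = idx - indices[target]                            # offset inside that section
    head = i // (trailing_size * axis_size)                # index over leading dims
    tail = i % trailing_size                               # index over trailing dims
    section_size = indices[target + 1] - indices[target]
    # flat index inside output `target`, matching target_idx in SplitKernel::Map
    return target, head * trailing_size * section_size + mid * trailing_size + tail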
+
+template
+inline void SplitOpForward(const nnvm::NodeAttrs& attrs,
+                           const OpContext& ctx,
+                           const std::vector& inputs,
+                           const std::vector& req,
+                           const std::vector& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  const SplitParam& param = nnvm::get(attrs.parsed);
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), (param.sections > 0) ? param.sections : param.indices.ndim());
+  Stream *s = ctx.get_stream();
+  const TBlob& input_data = inputs[split_enum::kData];
+  size_t leading = 1, trailing = 1;
+  int real_axis = param.axis;
+  if (real_axis < 0) {
+    real_axis += input_data.ndim();
+  }
+  CHECK_LT(real_axis, input_data.ndim());
+  size_t mid = input_data.shape_[real_axis];
+  for (int i = 0; i < real_axis; ++i) {
+    leading *= input_data.shape_[i];
+  }
+  for (int i = real_axis + 1; i < input_data.ndim(); ++i) {
+    trailing *= input_data.shape_[i];
+  }
+
+  size_t workspace_size = 0;
+  const TShape& ishape = input_data.shape_;
+  const TShape split_pts =
+    (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices;
+  std::vector indices;
+  for (const auto& section : split_pts) {
+    indices.push_back(section);
+  }
+  if (param.sections == 0) {
+    indices.push_back(ishape[real_axis]);
+  }
+  workspace_size += indices.size() * sizeof(size_t);
+  MSHADOW_TYPE_SWITCH(input_data.type_flag_, DType, {
+    std::vector output_data;
+    for (const TBlob& data : outputs) {
+      output_data.push_back(data.dptr());
+    }
+    workspace_size += output_data.size() * sizeof(DType*);
+    Tensor workspace =
+      ctx.requested[0].get_space_typed(Shape1(workspace_size), s);
+    Tensor indices_cpu_tensor(indices.data(), Shape1(indices.size()));
+    Tensor indices_xpu_tensor(
+      reinterpret_cast(workspace.dptr_), Shape1(indices.size()));
+    Tensor ptrs_cpu_tensor(output_data.data(), Shape1(output_data.size()));
+    Tensor ptrs_xpu_tensor(
+      reinterpret_cast(workspace.dptr_ + indices.size() * sizeof(size_t)),
+      Shape1(output_data.size()));
+    mshadow::Copy(indices_xpu_tensor, indices_cpu_tensor, s);
+    mshadow::Copy(ptrs_xpu_tensor, ptrs_cpu_tensor, s);
+    Kernel::Launch(
+      s, input_data.Size(), input_data.dptr(), ptrs_xpu_tensor.dptr_,
+      indices_xpu_tensor.dptr_, indices.size() - 1, mid, trailing);
+  });
+}
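Reviewer note (not part of the patch): a usage sketch from the frontend. It assumes a Python wrapper named `mx.nd.split_v2` is added for `_split_v2` (the wrapper itself is not in this diff); both the equal-sections and the boundary-indices forms are shown.

import mxnet as mx

x = mx.nd.arange(12).reshape((3, 4))
a, b = mx.nd.split_v2(x, 2, axis=1)     # two equal (3, 2) sections
c, d = mx.nd.split_v2(x, (1,), axis=1)  # ragged split: shapes (3, 1) and (3, 3)
print(a.shape, b.shape, c.shape, d.shape)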
+
+template
+inline void SplitOpBackward(const nnvm::NodeAttrs& attrs,
+                            const OpContext& ctx,
+                            const std::vector& inputs,
+                            const std::vector& req,
+                            const std::vector& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  const SplitParam& param = nnvm::get(attrs.parsed);
+  CHECK_EQ(inputs.size(), (param.sections > 0) ? param.sections : param.indices.ndim())
+    << "out grad vector size must match the output size";
+  CHECK_EQ(outputs.size(), 1U);
+  Stream *s = ctx.get_stream();
+  TBlob input_grad = outputs[split_enum::kData];
+  size_t leading = 1, trailing = 1;
+  int real_axis = param.axis;
+  if (real_axis < 0) {
+    real_axis += input_grad.ndim();
+  }
+  CHECK_LT(real_axis, input_grad.ndim());
+  size_t mid = input_grad.shape_[real_axis];
+  for (int i = 0; i < real_axis; ++i) {
+    leading *= input_grad.shape_[i];
+  }
+  for (int i = real_axis + 1; i < input_grad.ndim(); ++i) {
+    trailing *= input_grad.shape_[i];
+  }
+
+  size_t workspace_size = 0;
+  const TShape& ishape = input_grad.shape_;
+  const TShape split_pts =
+    (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices;
+  std::vector indices;
+  for (const auto& section : split_pts) {
+    indices.push_back(section);
+  }
+  if (param.sections == 0) {
+    indices.push_back(ishape[real_axis]);
+  }
+  workspace_size += indices.size() * sizeof(size_t);
+  MSHADOW_TYPE_SWITCH(input_grad.type_flag_, DType, {
+    std::vector out_grads;
+    for (const TBlob& output_grad : inputs) {
+      out_grads.push_back(output_grad.dptr());
+    }
+    workspace_size += out_grads.size() * sizeof(DType*);
+    Tensor workspace =
+      ctx.requested[0].get_space_typed(Shape1(workspace_size), s);
+    Tensor indices_cpu_tensor(indices.data(), Shape1(indices.size()));
+    Tensor indices_xpu_tensor(
+      reinterpret_cast(workspace.dptr_), Shape1(indices.size()));
+    Tensor ptrs_cpu_tensor(out_grads.data(), Shape1(inputs.size()));
+    Tensor ptrs_xpu_tensor(
+      reinterpret_cast(workspace.dptr_ + indices.size() * sizeof(size_t)),
+      Shape1(inputs.size()));
+    mshadow::Copy(indices_xpu_tensor, indices_cpu_tensor, s);
+    mshadow::Copy(ptrs_xpu_tensor, ptrs_cpu_tensor, s);
+    Kernel::Launch(
+      s, input_grad.Size(), ptrs_xpu_tensor.dptr_, input_grad.dptr(),
+      indices_xpu_tensor.dptr_, indices.size() - 1, mid, trailing);
+  });
+}
+
+inline uint32_t SplitNumOutputs(const NodeAttrs& attrs) {
+  const SplitParam& param = nnvm::get(attrs.parsed);
+  return (param.sections > 0) ? param.sections : param.indices.ndim();
+}
+
}  // namespace op
}  // namespace mxnet
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index db8efa454..e5d354be6 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -27,6 +27,7 @@
#include "./elemwise_unary_op.h"
#include "../nn/mkldnn/mkldnn_ops-inl.h"
#include "../nn/mkldnn/mkldnn_base-inl.h"
+#include "../nn/mkldnn/mkldnn_slice-inl.h"
namespace mxnet {
namespace op {
@@ -102,6 +103,7 @@ DMLC_REGISTER_PARAMETER(ReverseParam);
DMLC_REGISTER_PARAMETER(StackParam);
DMLC_REGISTER_PARAMETER(SqueezeParam);
DMLC_REGISTER_PARAMETER(DepthToSpaceParam);
+DMLC_REGISTER_PARAMETER(SplitParam);
#if MXNET_USE_MKLDNN == 1
void MKLDNNReshape(const NDArray &in_data, const NDArray &out_data) {
@@ -420,6 +422,30 @@ will return a new array with shape ``(2,1,3,4)``.
.add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(ExpandDimParam::__FIELDS__()); +void SliceExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1); + CHECK_EQ(outputs.size(), 1); + const SliceParam& param = nnvm::get(attrs.parsed); + auto in_stype = inputs[0].storage_type(); + if (in_stype == kCSRStorage) { + SliceCsrImpl(param, ctx, inputs[0], req[0], outputs[0]); +#if MXNET_USE_MKLDNN == 1 + } else if (in_stype == kDefaultStorage) { + if (SupportMKLDNN(inputs[0])) { + MKLDNNSlice(param, ctx, inputs[0], req[0], outputs[0]); + } else { + FallBackCompute(SliceOpForward, attrs, ctx, inputs, req, outputs); + } +#endif + } else { + LOG(FATAL) << "Slice not implemented for storage type" << in_stype; + } +} + NNVM_REGISTER_OP(slice) MXNET_ADD_SPARSE_OP_ALIAS(slice) .add_alias("crop") @@ -478,7 +504,10 @@ Example:: .set_attr("FInferStorageType", SliceForwardInferStorageType) .set_attr("FGradient", ElemwiseGradUseNone{"_backward_slice"}) .set_attr("FCompute", SliceOpForward) -.set_attr("FComputeEx", SliceEx) +.set_attr("FComputeEx", SliceExCPU) +#if MXNET_USE_MKLDNN == 1 +.set_attr("TIsMKLDNN", true) +#endif .add_argument("data", "NDArray-or-Symbol", "Source input") .add_arguments(SliceParam::__FIELDS__()); @@ -1043,8 +1072,8 @@ Example:: [12, 18, 13, 19, 14, 20], [3, 9, 4, 10, 5, 11], [15, 21, 16, 22, 17, 23]]]] - - + + space_to_depth(x, 2) = [[[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], @@ -1072,5 +1101,103 @@ Example:: .add_argument("data", "NDArray-or-Symbol", "Input ndarray") .add_arguments(DepthToSpaceParam::__FIELDS__()); +NNVM_REGISTER_OP(_split_v2) +.describe(R"code(Splits an array along a particular axis into multiple sub-arrays. + +Example:: + + x = [[[ 1.] + [ 2.]] + [[ 3.] + [ 4.]] + [[ 5.] + [ 6.]]] + x.shape = (3, 2, 1) + + y = split_v2(x, axis=1, indices_or_sections=2) // a list of 2 arrays with shape (3, 1, 1) + y = [[[ 1.]] + [[ 3.]] + [[ 5.]]] + + [[[ 2.]] + [[ 4.]] + [[ 6.]]] + + y[0].shape = (3, 1, 1) + + z = split_v2(x, axis=0, indices_or_sections=3) // a list of 3 arrays with shape (1, 2, 1) + z = [[[ 1.] + [ 2.]]] + + [[[ 3.] + [ 4.]]] + + [[[ 5.] + [ 6.]]] + + z[0].shape = (1, 2, 1) + + w = split_v2(x, axis=0, indices_or_sections=(1,)) // a list of 2 arrays with shape [(1, 2, 1), (2, 2, 1)] + w = [[[ 1.] + [ 2.]]] + + [[[3.] + [4.]] + + [[5.] + [6.]]] + + w[0].shape = (1, 2, 1) + w[1].shape = (2, 2, 1) + +`squeeze_axis=True` removes the axis with length 1 from the shapes of the output arrays. +**Note** that setting `squeeze_axis` to ``1`` removes axis with length 1 only +along the `axis` which it is split. +Also `squeeze_axis` can be set to true only if ``input.shape[axis] == indices_or_sections``. + +Example:: + + z = split_v2(x, axis=0, indices_or_sections=3, squeeze_axis=1) // a list of 3 arrays with shape (2, 1) + z = [[ 1.] + [ 2.]] + + [[ 3.] + [ 4.]] + + [[ 5.] 
+NNVM_REGISTER_OP(_split_v2)
+.describe(R"code(Splits an array along a particular axis into multiple sub-arrays.
+
+Example::
+
+   x  = [[[ 1.]
+          [ 2.]]
+         [[ 3.]
+          [ 4.]]
+         [[ 5.]
+          [ 6.]]]
+   x.shape = (3, 2, 1)
+
+   y = split_v2(x, axis=1, indices_or_sections=2) // a list of 2 arrays with shape (3, 1, 1)
+   y = [[[ 1.]]
+        [[ 3.]]
+        [[ 5.]]]
+
+       [[[ 2.]]
+        [[ 4.]]
+        [[ 6.]]]
+
+   y[0].shape = (3, 1, 1)
+
+   z = split_v2(x, axis=0, indices_or_sections=3) // a list of 3 arrays with shape (1, 2, 1)
+   z = [[[ 1.]
+         [ 2.]]]
+
+       [[[ 3.]
+         [ 4.]]]
+
+       [[[ 5.]
+         [ 6.]]]
+
+   z[0].shape = (1, 2, 1)
+
+   w = split_v2(x, axis=0, indices_or_sections=(1,)) // a list of 2 arrays with shape [(1, 2, 1), (2, 2, 1)]
+   w = [[[ 1.]
+         [ 2.]]]
+
+       [[[3.]
+         [4.]]
+
+        [[5.]
+         [6.]]]
+
+  w[0].shape = (1, 2, 1)
+  w[1].shape = (2, 2, 1)
+
+`squeeze_axis=True` removes the axis with length 1 from the shapes of the output arrays.
+**Note** that setting `squeeze_axis` to ``true`` removes axis with length 1 only
+along the `axis` on which it is split.
+Also `squeeze_axis` can be set to ``true`` only if ``input.shape[axis] == indices_or_sections``.
+
+Example::
+
+   z = split_v2(x, axis=0, indices_or_sections=3, squeeze_axis=1) // a list of 3 arrays with shape (2, 1)
+   z = [[ 1.]
+        [ 2.]]
+
+       [[ 3.]
+        [ 4.]]
+
+       [[ 5.]
+        [ 6.]]
+   z[0].shape = (2, 1)
+
+)code" ADD_FILELINE)
+.set_attr_parser(ParamParser)
+.set_num_inputs(1)
+.set_num_outputs(SplitNumOutputs)
+.set_attr("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector{"data"};
+  })
+.set_attr("FInferShape", SplitOpShape)
+.set_attr("FInferType", SplitOpType)
+.set_attr("FCompute", SplitOpForward)
+.set_attr("FResourceRequest",
+  [](const NodeAttrs& n) {
+    return std::vector{ResourceRequest::kTempSpace};
+})
+.set_attr("FGradient", ElemwiseGradUseNone{"_split_v2_backward"})
+.add_argument("data", "NDArray-or-Symbol", "The input")
+.add_arguments(SplitParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_split_v2_backward)
+.set_attr_parser(ParamParser)
+.set_num_inputs(SplitNumOutputs)
+.set_num_outputs(1)
+.set_attr("TIsBackward", true)
+.set_attr("FResourceRequest",
+  [](const NodeAttrs& n) {
+    return std::vector{ResourceRequest::kTempSpace};
+})
+.set_attr("FCompute", SplitOpBackward);
+
+
}  // namespace op
}  // namespace mxnet
diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu
index 4e31a4cf1..87311276d 100644
--- a/src/operator/tensor/matrix_op.cu
+++ b/src/operator/tensor/matrix_op.cu
@@ -217,5 +217,11 @@ NNVM_REGISTER_OP(depth_to_space)
NNVM_REGISTER_OP(space_to_depth)
.set_attr("FCompute", SpaceToDepthOpForward);
+NNVM_REGISTER_OP(_split_v2)
+.set_attr("FCompute", SplitOpForward);
+
+NNVM_REGISTER_OP(_split_v2_backward)
+.set_attr("FCompute", SplitOpBackward);
+
}  // namespace op
}  // namespace mxnet
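Reviewer note (not part of the patch): `_split_v2_backward` simply writes each incoming output gradient back into its section, so the gradient of a split is the concatenation of the head gradients along the split axis. A quick autograd check, again assuming the hypothetical `mx.nd.split_v2` wrapper:

import mxnet as mx
from mxnet import autograd

x = mx.nd.arange(12).reshape((3, 4))
x.attach_grad()
with autograd.record():
    a, b = mx.nd.split_v2(x, 2, axis=1)
# Distinct head gradients make the concatenation layout visible.
autograd.backward([a, b], head_grads=[mx.nd.ones_like(a), 2 * mx.nd.ones_like(b)])
expected = mx.nd.concat(mx.nd.ones_like(a), 2 * mx.nd.ones_like(b), dim=1)
assert (x.grad == expected).asnumpy().all()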
diff --git a/src/operator/tensor/slice-inl.h b/src/operator/tensor/slice-inl.h
new file mode 100644
index 000000000..4e94cbeda
--- /dev/null
+++ b/src/operator/tensor/slice-inl.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file slice-inl.h
+ * \brief Shared SliceParam definition for the slice operator
+ * \author Zhiyuan Huang
+*/
+
+#ifndef MXNET_OPERATOR_TENSOR_SLICE_INL_H_
+#define MXNET_OPERATOR_TENSOR_SLICE_INL_H_
+
+#include
+#include
+#include
+
+namespace mxnet {
+namespace op {
+
+struct SliceParam : public dmlc::Parameter {
+  nnvm::Tuple> begin, end;
+  nnvm::Tuple> step;
+  DMLC_DECLARE_PARAMETER(SliceParam) {
+    DMLC_DECLARE_FIELD(begin)
+    .describe("starting indices for the slice operation, supports negative indices.");
+    DMLC_DECLARE_FIELD(end)
+    .describe("ending indices for the slice operation, supports negative indices.");
+    DMLC_DECLARE_FIELD(step)
+    .set_default(nnvm::Tuple>())
+    .describe("step for the slice operation, supports negative values.");
+  }
+  bool operator==(const SliceParam& other) const {
+    return this->begin == other.begin &&
+           this->end == other.end &&
+           this->step == other.step;
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+
+namespace std {
+template<>
+struct hash {
+  size_t operator()(const mxnet::op::SliceParam& val) {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.begin);
+    ret = dmlc::HashCombine(ret, val.end);
+    ret = dmlc::HashCombine(ret, val.step);
+    return ret;
+  }
+};
+}  // namespace std
+
+#endif  // MXNET_OPERATOR_TENSOR_SLICE_INL_H_
diff --git a/src/resource.cc b/src/resource.cc
index ba4ab7270..80a5c0e44 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -34,6 +34,7 @@
#include
#include "./common/lazy_alloc_array.h"
#include "./common/utils.h"
+#include "./common/cuda_utils.h"
namespace mxnet {
namespace resource {
@@ -92,11 +93,14 @@ class ResourceManagerImpl : public ResourceManager {
    gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1);
    cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_PARALLEL_RAND_COPY", 1);
    gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_PARALLEL_RAND_COPY", 4);
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
+    gpu_cudnn_dropout_state_copy_ = dmlc::GetEnv("MXNET_GPU_CUDNN_DROPOUT_STATE_COPY", 4);
+#endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
    engine_ref_ = Engine::_GetSharedRef();
    storage_ref_ = Storage::_GetSharedRef();
    cpu_rand_.reset(new ResourceRandom(
        Context::CPU(), global_seed_));
-    cpu_space_.reset(new ResourceTempSpace(
+    cpu_space_.reset(new ResourceTempSpace(
        Context::CPU(), cpu_temp_space_copy_));
    cpu_parallel_rand_.reset(new ResourceParallelRandom(
        Context::CPU(), cpu_native_rand_copy_, global_seed_));
@@ -110,6 +114,9 @@ class ResourceManagerImpl : public ResourceManager {
    gpu_rand_.Clear();
    gpu_space_.Clear();
    gpu_parallel_rand_.Clear();
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
+    gpu_cudnn_dropout_state_.Clear();
+#endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
#endif
    if (engine_ref_ != nullptr) {
      engine_ref_ = nullptr;
@@ -139,7 +146,7 @@ class ResourceManagerImpl : public ResourceManager {
      }
      case ResourceRequest::kTempSpace: {
        return gpu_space_.Get(ctx.dev_id, [ctx, this]() {
-          return new ResourceTempSpace(ctx, gpu_temp_space_copy_);
+          return new ResourceTempSpace(ctx, gpu_temp_space_copy_);
        })->GetNext();
      }
      case ResourceRequest::kParallelRandom: {
        return gpu_parallel_rand_.Get(ctx.dev_id, [ctx, this]() {
          return new ResourceParallelRandom(ctx, gpu_native_rand_copy_, global_seed_);
        })->GetNext();
      }
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
+      case ResourceRequest::kCuDNNDropoutDesc: {
+        return gpu_cudnn_dropout_state_.Get(ctx.dev_id, [ctx, this]() {
+          return new ResourceTempSpace(
+              ctx, gpu_cudnn_dropout_state_copy_);
+        })->GetNext();
+      }
+#endif  // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7
      default: LOG(FATAL) <<
"Unknown supported type " << req.type; } #else @@ -231,7 +246,8 @@ class ResourceManagerImpl : public ResourceManager { } }; - // temporal space resource. + // temporary space resource. + template struct ResourceTempSpace { /*! \brief the context of the device */ Context ctx; @@ -248,7 +264,7 @@ class ResourceManagerImpl : public ResourceManager { resource[i].var = Engine::Get()->NewVariable(); resource[i].id = static_cast(i); resource[i].ptr_ = &space[i]; - resource[i].req = ResourceRequest(ResourceRequest::kTempSpace); + resource[i].req = ResourceRequest(req); space[i].ctx = ctx; CHECK_EQ(space[i].handle.size, 0U); } @@ -372,16 +388,23 @@ class ResourceManagerImpl : public ResourceManager { /*! \brief CPU random number resources */ std::unique_ptr > cpu_rand_; /*! \brief CPU temp space resources */ - std::unique_ptr cpu_space_; + std::unique_ptr> cpu_space_; /*! \brief CPU parallel random number resources */ std::unique_ptr > cpu_parallel_rand_; #if MXNET_USE_CUDA /*! \brief random number generator for GPU */ common::LazyAllocArray > gpu_rand_; /*! \brief temp space for GPU */ - common::LazyAllocArray gpu_space_; + common::LazyAllocArray> gpu_space_; /*! \brief GPU parallel (on device) random number resources */ common::LazyAllocArray > gpu_parallel_rand_; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + /*! \brief number of copies in GPU cudnn dropout descriptor resources */ + int gpu_cudnn_dropout_state_copy_; + /*! \brief GPU parallel (on device) random number resources */ + common::LazyAllocArray> + gpu_cudnn_dropout_state_; +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 #endif }; } // namespace resource @@ -394,6 +417,36 @@ void* Resource::get_host_space_internal(size_t size) const { return static_cast(ptr_)->GetHostSpace(size); } +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 +void Resource::get_cudnn_dropout_desc( + cudnnDropoutDescriptor_t* dropout_desc, + mshadow::Stream *stream, + const float dropout, + uint64_t seed) const { + + CHECK_EQ(req.type, ResourceRequest::kCuDNNDropoutDesc); + auto state_space = static_cast(ptr_); + CHECK_EQ(state_space->ctx.dev_id, stream->dev_id) + << "The device id of cudnn dropout state space doesn't match that from stream."; + if (!state_space->handle.size) { + // not initialized yet. 
+ size_t dropout_state_size; + CUDNN_CALL(cudnnDropoutGetStatesSize(stream->dnn_handle_, &dropout_state_size)); + CUDNN_CALL(cudnnSetDropoutDescriptor(*dropout_desc, stream->dnn_handle_, + dropout, + state_space->GetSpace(dropout_state_size), + dropout_state_size, + seed)); + } else { + CUDNN_CALL(cudnnRestoreDropoutDescriptor(*dropout_desc, stream->dnn_handle_, + dropout, + state_space->handle.dptr, + state_space->handle.size, + seed)); + } +} +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + ResourceManager* ResourceManager::Get() { typedef dmlc::ThreadLocalStore inst; return inst::Get(); diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index c39373b1b..f59a9e8d7 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -168,26 +168,50 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \param op Pointer to nnvm Operator object */ void AttachResources(OpContext *ctx, const nnvm::NodeAttrs& attrs, const nnvm::Op *op) { + std::vector reqs; + std::vector& requested = ctx->requested; static auto& fresource = nnvm::Op::GetAttr("FResourceRequest"); if (fresource.count(op) != 0) { - std::vector& requested = ctx->requested; - auto reqs = fresource[op](attrs); + reqs = fresource[op](attrs); + } else { + static auto& fresourceex = nnvm::Op::GetAttr("FResourceRequestEx"); + if (fresourceex.count(op) != 0) { + if (this->function_ || this->stateful_function_) { + reqs = fresourceex[op](attrs, ctx->run_ctx.ctx.dev_mask(), DispatchMode::kFCompute); + } else { + reqs = fresourceex[op](attrs, ctx->run_ctx.ctx.dev_mask(), DispatchMode::kFComputeEx); + } + } + } + if (!reqs.empty()) { // Get the resource of temporal space. for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - Resource r = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); - requested.emplace_back(r); - } else if (req.type == ResourceRequest::kRandom) { - requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); - } else if (req.type == ResourceRequest::kParallelRandom) { - Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); - if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) { - common::random::RandGenerator::AllocState( - rm.get_parallel_random()); + switch (req.type) { + case ResourceRequest::kTempSpace: { + requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); + break; } - requested.emplace_back(rm); - } else { - LOG(FATAL) << "resource type not yet supported"; + case ResourceRequest::kRandom: { + requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); + break; + } + case ResourceRequest::kParallelRandom: { + Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); + if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) { + common::random::RandGenerator::AllocState( + rm.get_parallel_random()); + } + requested.emplace_back(rm); + break; + } +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + case ResourceRequest::kCuDNNDropoutDesc: { + requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); + break; + } +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + default: + LOG(FATAL) << "resource type " << req.type << " is not yet supported"; } } } @@ -511,9 +535,25 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer function_ = common::GetFCompute(op_, "FCompute", ctx_.run_ctx.ctx); functionex_ = common::GetFCompute(op_, "FComputeEx", ctx_.run_ctx.ctx); + stateful_function_ = 
common::GetFCompute(op_, "FStatefulCompute", + ctx_.run_ctx.ctx); AttachResources(&ctx_, attrs_, op_); + auto& is_layer_backward = Op::GetAttr("TIsLayerOpBackward"); + auto& createop = nnvm::Op::GetAttr("FCreateOpState"); + if (createop.count(op_) || is_layer_backward.get(op_, false)) { + if (backward_for_op) { + state_ = backward_for_op->state_; + } + if (!state_) { + if (!create_state_) { + create_state_ = createop[op_]; + } + state_ = create_state_(attrs_, ctx_.run_ctx.ctx, input_shapes_, input_types); + } + } + if (!backward_for_op) { bool no_backward = false; // Set up backward @@ -561,8 +601,14 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inline void forward(const size_t count) { perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), kForward, "Forward", count); mxnet::profiler::vtune::VTuneResume profile; - for (size_t i = 0; i < count; ++i) { - Execute(); + if (stateful_function_) { + for (size_t i = 0; i < count; ++i) { + ExecuteStateful(); + } + } else { + for (size_t i = 0; i < count; ++i) { + Execute(); + } } } @@ -570,8 +616,14 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer CHECK(HasBackward()); perf::TimingItem timeF(&OperatorExecutorTiming::GetTiming(), kBackward, "Backward", count); mxnet::profiler::vtune::VTuneResume profile; - for (size_t i = 0; i < count; ++i) { - ExecuteBackward(); + if (stateful_function_) { + for (size_t i = 0; i < count; ++i) { + ExecuteBackwardStateful(); + } + } else { + for (size_t i = 0; i < count; ++i) { + ExecuteBackward(); + } } } @@ -595,6 +647,17 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer functionex_(attrs_, ctx_, inputs_, req_, outputs_); } + /*! + * \brief Execute the stateful operator + */ + void ExecuteStateful() { + CHECK_EQ(initialized_, true); + CHECK(state_); + CollectBlobs(inputs_, &blob_inputs_); + CollectBlobs(outputs_, &blob_outputs_); + stateful_function_(state_, ctx_, blob_inputs_, req_, blob_outputs_); + } + bool HasBackward() const { return !backward_.empty(); } @@ -631,6 +694,22 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer return false; } + /*! + * \brief Execute backward pass on stateful operator + */ + bool ExecuteBackwardStateful() { + CHECK_EQ(initialized_, true); + CHECK(HasBackward()); + if (!backward_.empty()) { + // Avoid locked ref count here + for (std::shared_ptr &p : backward_) { + p->ExecuteStateful(); + } + return true; + } + return false; + } + /*! * \brief Access input NDArray vector * \return reference to NDArray vector of forward inputs @@ -738,6 +817,18 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer * \brief Operator's FCompute function (for sparse tensors) */ FComputeEx functionex_; + /*! + * \brief Operator's FStatefulCompute function + */ + FStatefulCompute stateful_function_; + /*! + * \brief Operator's FCreateOpState function + */ + FCreateOpState create_state_; + /*! + * \brief Operator state + */ + OpStatePtr state_; /*! 
* \brief Backward executors (if any) diff --git a/tests/cpp/include/test_legacy_op.h b/tests/cpp/include/test_legacy_op.h index 3e395a4cd..7fd407e39 100644 --- a/tests/cpp/include/test_legacy_op.h +++ b/tests/cpp/include/test_legacy_op.h @@ -494,25 +494,38 @@ class LegacyOperatorExecutor : public OperatorDataInitializer ctx.dev_id = 0; for (const ResourceRequest& req : reqs) { - if (req.type == ResourceRequest::kTempSpace) { - if (cached_temp.count(ctx) != 0) { - opContext_.requested.emplace_back(cached_temp.at(ctx)); - } else { - Resource r = ResourceManager::Get()->Request(ctx, req); - opContext_.requested.emplace_back(r); - cached_temp[ctx] = r; + switch (req.type) { + case ResourceRequest::kTempSpace: { + if (cached_temp.count(ctx) != 0) { + opContext_.requested.emplace_back(cached_temp.at(ctx)); + } else { + Resource r = ResourceManager::Get()->Request(ctx, req); + opContext_.requested.emplace_back(r); + cached_temp[ctx] = r; + } + break; + } + case ResourceRequest::kRandom: { + opContext_.requested.emplace_back(ResourceManager::Get()->Request(ctx, req)); + break; + } + case ResourceRequest::kParallelRandom: { + Resource rm = ResourceManager::Get()->Request(ctx, req); + if (ctx.dev_mask() == Context::kCPU) { + common::random::RandGenerator::AllocState( + rm.get_parallel_random()); + } + opContext_.requested.emplace_back(rm); + break; } - } else if (req.type == ResourceRequest::kRandom) { - opContext_.requested.emplace_back(ResourceManager::Get()->Request(ctx, req)); - } else if (req.type == ResourceRequest::kParallelRandom) { - Resource rm = ResourceManager::Get()->Request(ctx, req); - if (ctx.dev_mask() == Context::kCPU) { - common::random::RandGenerator::AllocState( - rm.get_parallel_random()); +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + case ResourceRequest::kCuDNNDropoutDesc: { + opContext_.requested.push_back(ResourceManager::Get()->Request(ctx, req)); + break; } - opContext_.requested.emplace_back(rm); - } else { - LOG(FATAL) << "resource type not yet supported"; +#endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 7 + default: + LOG(FATAL) << "resource type " << req.type << " is not yet supported"; } } } diff --git a/tests/nightly/apache_rat_license_check/rat-excludes b/tests/nightly/apache_rat_license_check/rat-excludes index 5969f01a3..93ac16e42 100755 --- a/tests/nightly/apache_rat_license_check/rat-excludes +++ b/tests/nightly/apache_rat_license_check/rat-excludes @@ -35,6 +35,7 @@ _mask.pyx coco.py base.pyi special_functions-inl.h +erfinv-inl.h im2col.cuh im2col.h pool.h @@ -47,6 +48,7 @@ moderngpu/* deformable_im2col.cuh deformable_im2col.h REQUIRE +Project.toml include/* .*.iml -.*.json.ref \ No newline at end of file +.*.json.ref diff --git a/tests/python-pytest/onnx/test_cases.py b/tests/python-pytest/onnx/test_cases.py index 6ec37092d..b20db23aa 100644 --- a/tests/python-pytest/onnx/test_cases.py +++ b/tests/python-pytest/onnx/test_cases.py @@ -113,7 +113,8 @@ 'test_Softmax', 'test_softmax_functional', 'test_softmax_lastdim', - ] + ], + 'export': ['test_ConvTranspose2d'] } STANDARD_MODEL = { diff --git a/tests/python-pytest/onnx/test_node.py b/tests/python-pytest/onnx/test_node.py index 25fe9c9f9..96045516c 100644 --- a/tests/python-pytest/onnx/test_node.py +++ b/tests/python-pytest/onnx/test_node.py @@ -31,7 +31,7 @@ from collections import namedtuple import numpy as np import numpy.testing as npt -from onnx import numpy_helper, helper, load_model +from onnx import checker, numpy_helper, helper, load_model from onnx import TensorProto from mxnet.test_utils import 
download from mxnet.contrib import onnx as onnx_mxnet @@ -206,6 +206,18 @@ def test_imports(self): mxnet_out = bkd_rep.run(inputs) npt.assert_almost_equal(np_out, mxnet_out, decimal=4) + def test_exports(self): + input_shape = (2,1,3,1) + for test in export_test_cases: + test_name, onnx_name, mx_op, attrs = test + input_sym = mx.sym.var('data') + outsym = mx_op(input_sym, **attrs) + converted_model = onnx_mxnet.export_model(outsym, {}, [input_shape], np.float32, + onnx_file_path=outsym.name + ".onnx") + model = load_model(converted_model) + checker.check_model(model) + + # test_case = ("test_case_name", mxnet op, "ONNX_op_name", [input_list], attribute map, MXNet_specific=True/False, # fix_attributes = {'modify': {mxnet_attr_name: onnx_attr_name}, # 'remove': [attr_name], @@ -274,5 +286,11 @@ def test_imports(self): ("test_lpnormalization_ord2", "LpNormalization", [get_rnd([5, 3, 3, 2])], np.linalg.norm, {'ord':2, 'axis':1}) ] +# test_case = ("test_case_name", "ONNX_op_name", mxnet_op, attribute map) +export_test_cases = [ + ("test_expand", "Expand", mx.sym.broadcast_to, {'shape': (2,1,3,1)}), + ("test_tile", "Tile", mx.sym.tile, {'reps': (2,3)}) +] + if __name__ == '__main__': unittest.main() diff --git a/tests/python/gpu/test_gluon_transforms.py b/tests/python/gpu/test_gluon_transforms.py new file mode 100644 index 000000000..23b34d334 --- /dev/null +++ b/tests/python/gpu/test_gluon_transforms.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+from __future__ import print_function +import os +import sys +import mxnet as mx +import mxnet.ndarray as nd +import numpy as np +from mxnet import gluon +from mxnet.base import MXNetError +from mxnet.gluon.data.vision import transforms +from mxnet.test_utils import assert_almost_equal, set_default_context +from mxnet.test_utils import almost_equal +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(curr_path, '../unittest')) +from common import assertRaises, setup_module, with_seed, teardown + + +set_default_context(mx.gpu(0)) + +@with_seed() +def test_normalize(): + # 3D Input + data_in_3d = nd.random.uniform(0, 1, (3, 300, 300)) + out_nd_3d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_3d) + data_expected_3d = data_in_3d.asnumpy() + data_expected_3d[:][:][0] = data_expected_3d[:][:][0] / 3.0 + data_expected_3d[:][:][1] = (data_expected_3d[:][:][1] - 1.0) / 2.0 + data_expected_3d[:][:][2] = data_expected_3d[:][:][2] - 2.0 + assert_almost_equal(data_expected_3d, out_nd_3d.asnumpy()) + + # 4D Input + data_in_4d = nd.random.uniform(0, 1, (2, 3, 300, 300)) + out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d) + data_expected_4d = data_in_4d.asnumpy() + data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0 + data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0 + data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0 + data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0 + data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0 + data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0 + assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy()) + + # Default normalize values i.e., mean=0, std=1 + data_in_3d_def = nd.random.uniform(0, 1, (3, 300, 300)) + out_nd_3d_def = transforms.Normalize()(data_in_3d_def) + data_expected_3d_def = data_in_3d_def.asnumpy() + assert_almost_equal(data_expected_3d_def, out_nd_3d_def.asnumpy()) + + # Invalid Input - Neither 3D or 4D input + invalid_data_in = nd.random.uniform(0, 1, (5, 5, 3, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + + # Invalid Input - Channel neither 1 or 3 + invalid_data_in = nd.random.uniform(0, 1, (5, 4, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + +@with_seed() +def test_to_tensor(): + # 3D Input + data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) + assert_almost_equal(out_nd.asnumpy(), np.transpose( + data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))) + + # 4D Input + data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) + assert_almost_equal(out_nd.asnumpy(), np.transpose( + data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2))) + + # Invalid Input + invalid_data_in = nd.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8) + transformer = transforms.ToTensor() + assertRaises(MXNetError, transformer, invalid_data_in) + +@with_seed() +def test_resize(): + # Test with normal case 3D input float type + data_in_3d = nd.random.uniform(0, 255, (300, 300, 3)) + out_nd_3d = transforms.Resize((100, 100))(data_in_3d) + data_in_4d_nchw = 
nd.moveaxis(nd.expand_dims(data_in_3d, axis=0), 3, 1)
+    data_expected_3d = (nd.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw, 100, 100), 1, 3))[0]
+    assert_almost_equal(out_nd_3d.asnumpy(), data_expected_3d.asnumpy())
+
+    # Test with normal case 4D input float type
+    data_in_4d = nd.random.uniform(0, 255, (2, 300, 300, 3))
+    out_nd_4d = transforms.Resize((100, 100))(data_in_4d)
+    data_in_4d_nchw = nd.moveaxis(data_in_4d, 3, 1)
+    data_expected_4d = nd.moveaxis(nd.contrib.BilinearResize2D(data_in_4d_nchw, 100, 100), 1, 3)
+    assert_almost_equal(out_nd_4d.asnumpy(), data_expected_4d.asnumpy())
+
+    # Test invalid interp
+    data_in_3d = nd.random.uniform(0, 255, (300, 300, 3))
+    invalid_transform = transforms.Resize(-150, keep_ratio=False, interpolation=2)
+    assertRaises(MXNetError, invalid_transform, data_in_3d)
+
+    # Credited to Hang Zhang
+    def py_bilinear_resize_nhwc(x, outputHeight, outputWidth):
+        batch, inputHeight, inputWidth, channel = x.shape
+        if outputHeight == inputHeight and outputWidth == inputWidth:
+            return x
+        y = np.empty([batch, outputHeight, outputWidth, channel]).astype('uint8')
+        rheight = 1.0 * (inputHeight - 1) / (outputHeight - 1) if outputHeight > 1 else 0.0
+        rwidth = 1.0 * (inputWidth - 1) / (outputWidth - 1) if outputWidth > 1 else 0.0
+        for h2 in range(outputHeight):
+            h1r = 1.0 * h2 * rheight
+            h1 = int(np.floor(h1r))
+            h1lambda = h1r - h1
+            h1p = 1 if h1 < (inputHeight - 1) else 0
+            for w2 in range(outputWidth):
+                w1r = 1.0 * w2 * rwidth
+                w1 = int(np.floor(w1r))
+                w1lambda = w1r - w1
+                w1p = 1 if w1 < (inputWidth - 1) else 0
+                for b in range(batch):
+                    for c in range(channel):
+                        y[b][h2][w2][c] = (1-h1lambda)*((1-w1lambda)*x[b][h1][w1][c] + \
+                            w1lambda*x[b][h1][w1+w1p][c]) + \
+                            h1lambda*((1-w1lambda)*x[b][h1+h1p][w1][c] + \
+                            w1lambda*x[b][h1+h1p][w1+w1p][c])
+        return y
+
+    # Test with normal case 3D input uint8 type
+    data_in_4d = nd.random.uniform(0, 255, (1, 300, 300, 3)).astype('uint8')
+    out_nd_3d = transforms.Resize((100, 100))(data_in_4d[0])
+    assert_almost_equal(out_nd_3d.asnumpy(), py_bilinear_resize_nhwc(data_in_4d.asnumpy(), 100, 100)[0], atol=1.0)
+
+    # Test with normal case 4D input uint8 type
+    data_in_4d = nd.random.uniform(0, 255, (2, 300, 300, 3)).astype('uint8')
+    out_nd_4d = transforms.Resize((100, 100))(data_in_4d)
+    assert_almost_equal(out_nd_4d.asnumpy(), py_bilinear_resize_nhwc(data_in_4d.asnumpy(), 100, 100), atol=1.0)
diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py
index be6feaeb9..313668cb5 100644
--- a/tests/python/mkl/test_subgraph.py
+++ b/tests/python/mkl/test_subgraph.py
@@ -35,14 +35,14 @@
DATA_SHAPE=[(4, 4, 10, 10), (32, 3, 24, 24), (64, 8, 64, 64)]
-def check_qsym_calibrated(qsym):
+def check_qsym_calibrated(qsym, out_type):
  assert ''.join(qsym.attr_dict().keys()).find('quantized_sg_mkldnn_conv') != -1
  for k, v in qsym.attr_dict().items():
    if k.find('quantized_sg_mkldnn_conv') != -1:
      assert 'min_calib_range' in v
      assert 'max_calib_range' in v
    if k.find('_quantize') != -1:
-      assert v['out_type'] == 'uint8'
+      assert v['out_type'] == out_type
def check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape):
  mod = mx.mod.Module(symbol=qsym, context=mx.current_context())
@@ -66,7 +66,7 @@ def check_qsym_dummy_forward(qsym, batch, data_shape, label_shape):
    output.wait_to_read()
  return mod.get_outputs()
-def check_quantize(sym, data_shape, check_conv=True):
+def check_quantize(sym, data_shape, out_type, check_conv=True):
  fc = mx.sym.FullyConnected(data=sym, num_hidden=10,
flatten=True, name='fc') sym = mx.sym.SoftmaxOutput(data=fc, name='softmax') sym_sg = sym.get_backend_symbol("MKLDNN") @@ -99,15 +99,14 @@ def check_quantize(sym, data_shape, check_conv=True): aux_params=aux_params, ctx=mx.current_context(), excluded_sym_names=excluded_sym_names, - quantized_dtype='uint8', + quantized_dtype=out_type, calib_mode='naive', calib_data=calib_data, calib_layer=calib_layer, - calib_quantize_op=True, num_calib_examples=5) qsym = qsym.get_backend_symbol("MKLDNN_POST_QUANTIZE") if check_conv: - check_qsym_calibrated(qsym) + check_qsym_calibrated(qsym, out_type) quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape, label_shape) for i in range(len(ref_out)): assert_almost_equal(ref_out[i].asnumpy(), quantized_out[i].asnumpy(), atol = 1) @@ -135,8 +134,9 @@ def check_fusion(sym, data_shape, attrs_op): for i in range(len(exe.outputs)): assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(), rtol=1e-3, atol=1e-3) - # fp32 to uint8 - check_quantize(sym, data_shape) + # fp32 to int8 + for out_type in ('uint8', 'int8', 'auto'): + check_quantize(sym, data_shape, out_type) def check_neg_fusion(syms, attrs_name=None, excluded_attrs=None, date_shape=(4,4,10,10)): for sym, attrs, excluded_attr in zip(syms, attrs_name, excluded_attrs): @@ -475,12 +475,13 @@ def test_pos_conv_bn_sum_relu(): def test_pos_single_concat(): for data_shape in DATA_SHAPE: - net = single_concat(data_shape, 2, 1) - check_quantize(net, data_shape, False) - net = single_concat(data_shape, 4, 2) - check_quantize(net, data_shape, False) - net = single_concat(data_shape, 4, 3) - check_quantize(net, data_shape, False) + for out_type in ('uint8', 'int8', 'auto'): + net = single_concat(data_shape, 2, 1) + check_quantize(net, data_shape, out_type, False) + net = single_concat(data_shape, 4, 2) + check_quantize(net, data_shape, out_type, False) + net = single_concat(data_shape, 4, 3) + check_quantize(net, data_shape, out_type, False) @with_seed() def test_neg_conv_bn(): diff --git a/tests/python/tensorrt/test_resnet18.py b/tests/python/tensorrt/test_resnet18.py new file mode 100644 index 000000000..fff3ac5dd --- /dev/null +++ b/tests/python/tensorrt/test_resnet18.py @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from mxnet.gluon.model_zoo import vision +from mxnet.test_utils import assert_almost_equal +import mxnet as mx +import numpy as np +import os + +batch_shape = (1, 3, 224, 224) +url = '/~https://github.com/dmlc/web-data/blob/master/mxnet/doc/tutorials/python/predict_image/cat.jpg?raw=true' +model_file_name = 'resnet18_v2_trt_test' + + +def get_image(image_url): + fname = mx.test_utils.download(image_url, fname=image_url.split('/')[-1].split('?')[0]) + img = mx.image.imread(fname) + img = mx.image.imresize(img, 224, 224) # Resize + img = img.transpose((2, 0, 1)) # Channel first + img = img.expand_dims(axis=0) # Batchify + img = mx.nd.cast(img, dtype=np.float32) + return img/255.0 + + +def test_tensorrt_resnet18_feature_vect(): + print("downloading sample input") + input_data = get_image(url) + gluon_resnet18 = vision.resnet18_v2(pretrained=True) + gluon_resnet18.hybridize() + gluon_resnet18.forward(input_data) + gluon_resnet18.export(model_file_name) + sym, arg_params, aux_params = mx.model.load_checkpoint(model_file_name, 0) + + os.environ['MXNET_USE_TENSORRT'] = '0' + executor = sym.simple_bind(ctx=mx.gpu(), data=batch_shape, grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + y = executor.forward(is_train=False, data=input_data) + + os.environ['MXNET_USE_TENSORRT'] = '1' + all_params = arg_params + all_params.update(aux_params) + executor = mx.contrib.tensorrt.tensorrt_bind(sym, ctx=mx.gpu(), all_params=all_params, data=batch_shape, + grad_req='null', force_rebind=True) + y_trt = executor.forward(is_train=False, data=input_data) + + no_trt_output = y[0].asnumpy()[0] + trt_output = y_trt[0].asnumpy()[0] + assert_almost_equal(no_trt_output, trt_output, 1e-4, 1e-4) + + +if __name__ == '__main__': + import nose + + nose.runmodule() diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py index 43d3db648..aac807660 100644 --- a/tests/python/unittest/test_contrib_operator.py +++ b/tests/python/unittest/test_contrib_operator.py @@ -261,6 +261,42 @@ def test_multibox_target_op(): assert_array_equal(loc_mask.asnumpy(), expected_loc_mask) assert_array_equal(cls_target.asnumpy(), expected_cls_target) +def test_gradient_multiplier_op(): + # We use the quadratic function in combination with gradient multiplier + def f(x, a, b, c): + return a * x**2 + b * x + c + + a = np.random.random_sample() + b = np.random.random_sample() + c = np.random.random_sample() + m = np.random.random_sample() - 0.5 + + data = mx.symbol.Variable('data') + quad_sym = mx.sym.contrib.quadratic(data=data, a=a, b=b, c=c) + gr_q_sym = mx.sym.contrib.gradientmultiplier(quad_sym, scalar=m) + + for dtype in [np.float16, np.float32, np.float64]: + for ndim in range(1, 6): + shape = rand_shape_nd(ndim, 5) + data_np = np.random.randn(*shape).astype(dtype) + expected = f(data_np, a, b, c) + backward_expected = (2 * a * data_np + b) * m + + # check imperative forward + output = mx.nd.contrib.quadratic(mx.nd.array(data_np), a=a, b=b, c=c) + output = mx.nd.contrib.gradientmultiplier(output, scalar=m) + assert_almost_equal(output.asnumpy(), expected, + rtol=1e-2 if dtype is np.float16 else 1e-5, + atol=1e-2 if dtype is np.float16 else 1e-5) + # check forward + check_symbolic_forward(gr_q_sym, [data_np], [expected], + rtol=1e-2 if dtype is np.float16 else 1e-5, + atol=1e-2 if dtype is np.float16 else 1e-5) + # check backward + check_symbolic_backward(gr_q_sym, [data_np], [np.ones(expected.shape)], + [backward_expected], + rtol=1e-2 if dtype is 
np.float16 else 1e-5, + atol=1e-2 if dtype is np.float16 else 1e-5) if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_features.py b/tests/python/unittest/test_features.py new file mode 100644 index 000000000..ff9118100 --- /dev/null +++ b/tests/python/unittest/test_features.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import sys +from mxnet.mxfeatures import * +from mxnet.base import MXNetError +from nose.tools import * + +def test_runtime_features(): + for f in Feature: + res = has_feature(f.value) + ok_(type(res) is bool) + for f in features_enabled(): + ok_(type(f) is Feature) + ok_(type(features_enabled_str()) is str) + print("Features enabled: {}".format(features_enabled_str())) + +@raises(MXNetError) +def test_has_feature_2large(): + has_feature(sys.maxsize) + + +if __name__ == "__main__": + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py index 2ff9c5cb2..a855fc8cf 100644 --- a/tests/python/unittest/test_gluon_data_vision.py +++ b/tests/python/unittest/test_gluon_data_vision.py @@ -17,32 +17,105 @@ from __future__ import print_function import mxnet as mx import mxnet.ndarray as nd -import numpy as np +from mxnet.base import MXNetError from mxnet import gluon +from mxnet.base import MXNetError from mxnet.gluon.data.vision import transforms from mxnet.test_utils import assert_almost_equal from mxnet.test_utils import almost_equal -from common import setup_module, with_seed, teardown +from common import assertRaises, setup_module, with_seed, teardown +import numpy as np @with_seed() def test_to_tensor(): + # 3D Input data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) assert_almost_equal(out_nd.asnumpy(), np.transpose( - data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))) + data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))) + + # 4D Input + data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) + assert_almost_equal(out_nd.asnumpy(), np.transpose( + data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2))) + + # Invalid Input + invalid_data_in = nd.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8) + transformer = transforms.ToTensor() + assertRaises(MXNetError, transformer, invalid_data_in) @with_seed() def test_normalize(): - data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) - data_in = transforms.ToTensor()(nd.array(data_in, dtype='uint8')) - out_nd = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in) - data_expected = data_in.asnumpy() - 
data_expected[:][:][0] = data_expected[:][:][0] / 3.0 - data_expected[:][:][1] = (data_expected[:][:][1] - 1.0) / 2.0 - data_expected[:][:][2] = data_expected[:][:][2] - 2.0 - assert_almost_equal(data_expected, out_nd.asnumpy()) + # 3D Input + data_in_3d = nd.random.uniform(0, 1, (3, 300, 300)) + out_nd_3d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_3d) + data_expected_3d = data_in_3d.asnumpy() + data_expected_3d[:][:][0] = data_expected_3d[:][:][0] / 3.0 + data_expected_3d[:][:][1] = (data_expected_3d[:][:][1] - 1.0) / 2.0 + data_expected_3d[:][:][2] = data_expected_3d[:][:][2] - 2.0 + assert_almost_equal(data_expected_3d, out_nd_3d.asnumpy()) + + # 4D Input + data_in_4d = nd.random.uniform(0, 1, (2, 3, 300, 300)) + out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d) + data_expected_4d = data_in_4d.asnumpy() + data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0 + data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0 + data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0 + data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0 + data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0 + data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0 + assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy()) + + # Invalid Input - Neither 3D or 4D input + invalid_data_in = nd.random.uniform(0, 1, (5, 5, 3, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + + # Invalid Input - Channel neither 1 or 3 + invalid_data_in = nd.random.uniform(0, 1, (5, 4, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + + +@with_seed() +def test_resize(): + def _test_resize_with_diff_type(dtype): + # test normal case + data_in = nd.random.uniform(0, 255, (300, 200, 3)).astype(dtype) + out_nd = transforms.Resize(200)(data_in) + data_expected = mx.image.imresize(data_in, 200, 200, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test 4D input + data_bath_in = nd.random.uniform(0, 255, (3, 300, 200, 3)).astype(dtype) + out_batch_nd = transforms.Resize(200)(data_bath_in) + for i in range(len(out_batch_nd)): + assert_almost_equal(mx.image.imresize(data_bath_in[i], 200, 200, 1).asnumpy(), + out_batch_nd[i].asnumpy()) + # test interp = 2 + out_nd = transforms.Resize(200, interpolation=2)(data_in) + data_expected = mx.image.imresize(data_in, 200, 200, 2) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test height not equals to width + out_nd = transforms.Resize((200, 100))(data_in) + data_expected = mx.image.imresize(data_in, 200, 100, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test keep_ratio + out_nd = transforms.Resize(150, keep_ratio=True)(data_in) + data_expected = mx.image.imresize(data_in, 150, 225, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test size below zero + invalid_transform = transforms.Resize(-150, keep_ratio=True) + assertRaises(MXNetError, invalid_transform, data_in) + # test size more than 2: + invalid_transform = transforms.Resize((100, 100, 100), keep_ratio=True) + assertRaises(MXNetError, invalid_transform, data_in) + + for dtype in ['uint8', 'float32', 'float64']: + _test_resize_with_diff_type(dtype) @with_seed() diff --git a/tests/python/unittest/test_gluon_trainer.py 
b/tests/python/unittest/test_gluon_trainer.py index 39ee57e6b..bda806623 100644 --- a/tests/python/unittest/test_gluon_trainer.py +++ b/tests/python/unittest/test_gluon_trainer.py @@ -17,6 +17,7 @@ import mxnet as mx import unittest +import os import numpy as np from mxnet import gluon from mxnet.gluon import nn @@ -98,6 +99,9 @@ def dict_equ(a, b): @with_seed() def test_trainer_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') + x = gluon.Parameter('x', shape=(10,), lr_mult=1.0) x.initialize(ctx=[mx.cpu(0), mx.cpu(1)], init='zeros') trainer = gluon.Trainer([x], 'sgd', {'learning_rate': 0.1}) @@ -112,6 +116,7 @@ def test_trainer_save_load(): x.lr_mult = 2.0 # check if parameter dict is correctly associated with optimizer after load_state assert trainer._kvstore._updater.optimizer._get_lr(0) == 0.2 + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() def test_trainer_sparse_save_load(): @@ -254,10 +259,11 @@ def check_trainer_sparse_kv(kv, stype, grad_stype, update_on_kv, expected): assert isinstance(err, expected) kvs = ['local', 'device'] + global_update_on_kvstore = bool(int(os.getenv('MXNET_UPDATE_ON_KVSTORE', "1"))) for kv in kvs: check_trainer_sparse_kv(kv, 'default', 'default', True, True) check_trainer_sparse_kv(kv, 'default', 'default', False, False) - check_trainer_sparse_kv(kv, 'default', 'default', None, True) + check_trainer_sparse_kv(kv, 'default', 'default', None, global_update_on_kvstore) check_trainer_sparse_kv(kv, 'default', 'row_sparse', None, False) check_trainer_sparse_kv(kv, 'default', 'row_sparse', True, True) check_trainer_sparse_kv(kv, 'default', 'row_sparse', False, False) diff --git a/tests/python/unittest/test_image.py b/tests/python/unittest/test_image.py index c8ff53f35..dedb2d341 100644 --- a/tests/python/unittest/test_image.py +++ b/tests/python/unittest/test_image.py @@ -226,8 +226,7 @@ def test_color_normalize(self): mx_result = mx.image.color_normalize(mx.nd.array(src), mx.nd.array(mean), mx.nd.array(std)) assert_almost_equal(mx_result.asnumpy(), (src - mean) / std, atol=1e-3) - - @unittest.skip("The test fail with python errors. Temporarily disabled till it gets fixed") + def test_imageiter(self): im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] fname = './data/test_imageiter.lst' diff --git a/tests/python/unittest/test_init.py b/tests/python/unittest/test_init.py index efd6ef367..c8bf01f48 100644 --- a/tests/python/unittest/test_init.py +++ b/tests/python/unittest/test_init.py @@ -60,8 +60,17 @@ def check_rsp_const_init(init, val): check_rsp_const_init(mx.initializer.Zero(), 0.) check_rsp_const_init(mx.initializer.One(), 1.) 
+def test_bilinear_init(): + bili = mx.init.Bilinear() + bili_weight = mx.ndarray.empty((1,1,4,4)) + bili._init_weight(None, bili_weight) + bili_1d = np.array([[1/float(4), 3/float(4), 3/float(4), 1/float(4)]]) + bili_2d = bili_1d * np.transpose(bili_1d) + assert (bili_2d == bili_weight.asnumpy()).all() + if __name__ == '__main__': test_variable_init() test_default_init() test_aux_init() test_rsp_const_init() + test_bilinear_init() diff --git a/tests/python/unittest/test_io.py b/tests/python/unittest/test_io.py index 0641f235a..351231bb4 100644 --- a/tests/python/unittest/test_io.py +++ b/tests/python/unittest/test_io.py @@ -32,6 +32,10 @@ import sys from common import assertRaises import unittest +try: + from itertools import izip_longest as zip_longest +except ImportError: + from itertools import zip_longest def test_MNISTIter(): @@ -427,13 +431,56 @@ def check_CSVIter_synthetic(dtype='float32'): for dtype in ['int32', 'int64', 'float32']: check_CSVIter_synthetic(dtype=dtype) -@unittest.skip("Flaky test: /~https://github.com/apache/incubator-mxnet/issues/11359") def test_ImageRecordIter_seed_augmentation(): get_cifar10() seed_aug = 3 + def assert_dataiter_items_equals(dataiter1, dataiter2): + """ + Asserts that two data iterators have the same number of batches, + that the batches have the same number of items, and that the items + are equal. + """ + for batch1, batch2 in zip_longest(dataiter1, dataiter2): + + # ensure iterators contain the same number of batches + # zip_longest yields None once one of the iterators has run out of batches + assert batch1 and batch2, 'The iterators do not contain the same number of batches' + + # ensure batches are of same length + assert len(batch1.data) == len(batch2.data), 'The returned batches are not of the same length' + + # ensure batch data is the same + for i in range(0, len(batch1.data)): + data1 = batch1.data[i].asnumpy().astype(np.uint8) + data2 = batch2.data[i].asnumpy().astype(np.uint8) + assert(np.array_equal(data1, data2)) + + def assert_dataiter_items_not_equals(dataiter1, dataiter2): + """ + Asserts that two data iterators have the same number of batches, + that the batches have the same number of items, and that the items + are _not_ equal.
+ """ + for batch1, batch2 in zip_longest(dataiter1, dataiter2): + + # ensure iterators are of same length + # zip_longest will return None if on of the iterators have run out of batches + assert batch1 and batch2, 'The iterators do not contain the same number of batches' + + # ensure batches are of same length + assert len(batch1.data) == len(batch2.data), 'The returned batches are not of the same length' + + # ensure batch data is the same + for i in range(0, len(batch1.data)): + data1 = batch1.data[i].asnumpy().astype(np.uint8) + data2 = batch2.data[i].asnumpy().astype(np.uint8) + if not np.array_equal(data1, data2): + return + assert False, 'Expected data iterators to be different, but they are the same' + # check whether to get constant images after fixing seed_aug - dataiter = mx.io.ImageRecordIter( + dataiter1 = mx.io.ImageRecordIter( path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", shuffle=False, @@ -449,10 +496,8 @@ def test_ImageRecordIter_seed_augmentation(): random_h=10, max_shear_ratio=2, seed_aug=seed_aug) - batch = dataiter.next() - data = batch.data[0].asnumpy().astype(np.uint8) - dataiter = mx.io.ImageRecordIter( + dataiter2 = mx.io.ImageRecordIter( path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", shuffle=False, @@ -468,12 +513,12 @@ def test_ImageRecordIter_seed_augmentation(): random_h=10, max_shear_ratio=2, seed_aug=seed_aug) - batch = dataiter.next() - data2 = batch.data[0].asnumpy().astype(np.uint8) - assert(np.array_equal(data,data2)) + + assert_dataiter_items_equals(dataiter1, dataiter2) # check whether to get different images after change seed_aug - dataiter = mx.io.ImageRecordIter( + dataiter1.reset() + dataiter2 = mx.io.ImageRecordIter( path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", shuffle=False, @@ -489,31 +534,27 @@ def test_ImageRecordIter_seed_augmentation(): random_h=10, max_shear_ratio=2, seed_aug=seed_aug+1) - batch = dataiter.next() - data2 = batch.data[0].asnumpy().astype(np.uint8) - assert(not np.array_equal(data,data2)) + + assert_dataiter_items_not_equals(dataiter1, dataiter2) # check whether seed_aug changes the iterator behavior - dataiter = mx.io.ImageRecordIter( + dataiter1 = mx.io.ImageRecordIter( path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", shuffle=False, data_shape=(3, 28, 28), batch_size=3, seed_aug=seed_aug) - batch = dataiter.next() - data = batch.data[0].asnumpy().astype(np.uint8) - dataiter = mx.io.ImageRecordIter( + dataiter2 = mx.io.ImageRecordIter( path_imgrec="data/cifar/train.rec", mean_img="data/cifar/cifar10_mean.bin", shuffle=False, data_shape=(3, 28, 28), batch_size=3, seed_aug=seed_aug) - batch = dataiter.next() - data2 = batch.data[0].asnumpy().astype(np.uint8) - assert(np.array_equal(data,data2)) + + assert_dataiter_items_equals(dataiter1, dataiter2) if __name__ == "__main__": test_NDArrayIter() diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index adb31745a..7e561f20a 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -175,6 +175,8 @@ def test_module_layout(): @with_seed() def test_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') def dict_equ(a, b): assert set(a) == set(b) for k in a: @@ -212,6 +214,7 @@ def dict_equ(a, b): assert mod._symbol.tojson() == mod2._symbol.tojson() dict_equ(mod.get_params()[0], mod2.get_params()[0]) 
dict_equ(mod._kvstore._updater.states, mod2._updater.states) + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) @with_seed() @@ -872,6 +875,48 @@ def empty_fn(*args, **kwargs): train_data = MockTrainData(batches=2) mod.fit(train_data, num_epoch=1) +@with_seed() +def test_bucket_module_grad_req(): + batch_size = 2 + def sym_gen(_): + data = mx.symbol.Variable('data') + weight = mx.symbol.Variable('a', shape=(1,), init=mx.init.One()) + sym = mx.sym.make_loss(mx.sym.broadcast_mul(data, weight)) + return sym, ('data',), None + + mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10) + mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='write') + mod.init_params() + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')], + bucket_key=10)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')], + bucket_key=5)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10) + mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='add') + mod.init_params() + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')], + bucket_key=10)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')], + bucket_key=5)) + assert mod._curr_module._grad_req == 'add' + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == 2 * batch_size) + if __name__ == '__main__': import nose diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 0aa485539..7176b1888 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -1056,7 +1056,7 @@ def test_output(): @with_seed() def test_ndarray_fluent(): has_grad = set(['flatten', 'expand_dims', 'flip', 'tile', 'transpose', 'sum', 'nansum', 'prod', - 'nanprod', 'mean', 'max', 'min', 'reshape', 'broadcast_to', 'split', + 'nanprod', 'mean', 'max', 'min', 'reshape', 'broadcast_to', 'split', 'split_v2', 'broadcast_axes', 'pad', 'swapaxes', 'slice', 'slice_axis', 'slice_like', 'take', 'one_hot', 'pick', 'sort', 'topk', 'argsort', 'argmax', 'argmin', 'clip', 'abs', 'sign', 'sin', 'cos', 'tan', 'arcsin', 'arccos', 'arctan', @@ -1093,6 +1093,8 @@ def check_fluent_regular(func, kwargs, shape=(5, 17, 1), equal_nan=False): check_fluent_regular('repeat', {'repeats': 3}) check_fluent_regular('transpose', {'axes': (1,0,2)}) check_fluent_regular('split', {'axis': 2, 'num_outputs': 3}, shape=(5, 17, 6)) + check_fluent_regular('split_v2', {'axis': 2, 'indices_or_sections': 3}, shape=(5, 17, 6)) + check_fluent_regular('split_v2', {'axis': 2, 'indices_or_sections': (1, 3, 5)}, shape=(5, 17, 6)) check_fluent_regular('slice', {'begin': (2, 5, 1), 'end': (4, 7, 6)}, shape=(5, 17, 6)) check_fluent_regular('slice_axis', {'axis': 1, 'begin': 5, 'end': 7}) check_fluent_regular('slice_like', {'axes': (0, -2), 'shape_like': 
mx.nd.zeros((3, 3))}) diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 20d3eb8f4..dfd349873 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -1380,7 +1380,6 @@ def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, targ assert out_shapes[0] == (input_shape[0], 5) + target_shape -@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at /~https://github.com/apache/incubator-mxnet/issues/10973") @with_seed() def test_deconvolution(): # 2D @@ -1633,7 +1632,7 @@ def test_convolution_grouping(): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3, atol=1e-3) -@unittest.skip("Flaky test /~https://github.com/apache/incubator-mxnet/issues/12203") +@unittest.skip("Flaky test /~https://github.com/apache/incubator-mxnet/issues/14052") @with_seed() def test_depthwise_convolution(): for dim in [1,2]: @@ -1664,7 +1663,7 @@ def test_depthwise_convolution(): dev = default_context() exe1 = y1.simple_bind(dev, x=shape) - exe2 = y2.simple_bind(mx.cpu(), x=shape, w=(num_filter, shape[1]//num_group)+kernel, + exe2 = y2.simple_bind(dev, x=shape, w=(num_filter, shape[1]//num_group)+kernel, b=(num_filter,)) for arr1, arr2 in zip(exe1.arg_arrays, exe2.arg_arrays): arr1[:] = np.random.normal(size=arr1.shape) @@ -2531,7 +2530,8 @@ def test_flip(): @with_seed() def test_stn(): - np.set_printoptions(threshold=np.nan) + import sys + np.set_printoptions(threshold=sys.maxsize) num_filter = 2 # conv of loc net kernel = (3, 3) # conv of loc net num_hidden = 6 # fc of loc net @@ -3293,36 +3293,38 @@ def check_sequence_func(ftype, mask_value=0, axis=0): L = mx.symbol.Variable('L') # lengths shapes = [(3, 4), (1, 1), (3, 4, 3, 1, 1)] for seqlenQ in [True, False]: - for s in shapes: - x = mx.random.uniform(-1, 1, s, ctx=mx.cpu()).copyto(xpu) - batch = s[1] if (axis == 0) else s[0] - seqlen = s[axis] - l_np = np.random.randint(1, seqlen + 1, batch) - l = mx.nd.array(l_np, ctx=mx.cpu()).copyto(xpu) - if not seqlenQ: - l_np = None - args = {'data':X, 'use_sequence_length':seqlenQ, "axis":axis} - if seqlenQ: - args['sequence_length'] = L - if ftype == "last": - Y = mx.symbol.SequenceLast(**args) - np_out = sequence_last_numpy(x.asnumpy(), l_np, axis) - elif ftype == "mask": - args['value'] = mask_value - Y = mx.symbol.SequenceMask(**args) - np_out = sequence_mask_numpy(x.asnumpy(), l_np, axis, mask_value) - elif ftype == "reverse": - Y = mx.symbol.SequenceReverse(**args) - np_out = sequence_reverse_numpy(x.asnumpy(), l_np, axis) - fargs = [x, l] if seqlenQ else [x] - gargs = [x.asnumpy(), l_np] if seqlenQ else [x.asnumpy()] - check_symbolic_forward(Y, fargs, [np_out]) - check_numeric_gradient(Y, gargs, grad_nodes={'X':'write'}, - numeric_eps=1e-2, rtol=1e-2) - check_numeric_gradient(Y, gargs, grad_nodes={'X':'add'}, - numeric_eps=1e-3, rtol=1e-2, atol=1E-4) - check_numeric_gradient(Y, gargs, grad_nodes={'X':'null'}, - numeric_eps=1e-3, rtol=1e-2, atol=1E-4) + for ary_dtype in [np.float32]: + for idx_dtype in [np.int32, np.float32]: + for s in shapes: + x = mx.random.uniform(-1, 1, s, ctx=mx.cpu()).astype(ary_dtype).copyto(xpu) + batch = s[1] if (axis == 0) else s[0] + seqlen = s[axis] + l_np = np.random.randint(1, seqlen + 1, batch) + l = mx.nd.array(l_np, ctx=mx.cpu(), dtype=idx_dtype).copyto(xpu) + if not seqlenQ: + l_np = None + args = {'data':X, 'use_sequence_length':seqlenQ, "axis":axis} + if seqlenQ: + args['sequence_length'] = L + if ftype == "last": 
+ Y = mx.symbol.SequenceLast(**args) + np_out = sequence_last_numpy(x.asnumpy(), l_np, axis) + elif ftype == "mask": + args['value'] = mask_value + Y = mx.symbol.SequenceMask(**args) + np_out = sequence_mask_numpy(x.asnumpy(), l_np, axis, mask_value) + elif ftype == "reverse": + Y = mx.symbol.SequenceReverse(**args) + np_out = sequence_reverse_numpy(x.asnumpy(), l_np, axis) + fargs = [x, l] if seqlenQ else [x] + gargs = [x.asnumpy(), l_np] if seqlenQ else [x.asnumpy()] + check_symbolic_forward(Y, fargs, [np_out], dtype="asnumpy") + check_numeric_gradient(Y, gargs, grad_nodes={'X':'write'}, + numeric_eps=1e-2, rtol=1e-2) + check_numeric_gradient(Y, gargs, grad_nodes={'X':'add'}, + numeric_eps=1e-3, rtol=1e-2, atol=1E-4) + check_numeric_gradient(Y, gargs, grad_nodes={'X':'null'}, + numeric_eps=1e-3, rtol=1e-2, atol=1E-4) @with_seed() @@ -3501,7 +3503,11 @@ def test_special_functions_using_scipy(): # erf mathematical_core("erf", lambda x: mx.sym.erf(x), lambda x: scipy_special.erf(x), - lambda x: 2.0 / math.sqrt(math.pi) * math.exp(-(x ** 2)), 0.5, 0.5) + lambda x: 2.0 / math.sqrt(math.pi) * np.exp(-(x ** 2)), 0.5, 0.5) + + # erfinv + mathematical_core("erfinv", lambda x: mx.sym.erfinv(x), lambda x: scipy_special.erfinv(x), + lambda x: 0.5 * math.sqrt(math.pi) * np.exp(scipy_special.erfinv(x) ** 2), 0.5, 0.5) def rounding(name, forward_mxnet_call, forward_numpy_call, data_init=5., grad_init=2.): @@ -3995,6 +4001,48 @@ def test_cast(): assert_almost_equal(exe.grad_arrays[0].asnumpy(), X.astype(dsttype).astype(srctype), rtol=1e-3, atol=1e-5) +# Test requires all platforms to round float32->float16 with same round-to-nearest-even policy. +@with_seed() +def test_cast_float32_to_float16(): + FP16_FRACTION_BITS = 10 + FP32_FRACTION_BITS = 23 + FP32_EXP_MIN = -126 + FP32_EXP_MAX = 127 + # generate test cases in the vicinity of representable float16 mantissas + # and mid-way between them, but over the full range of float32 exponents. 
+ def get_data(): + for sign_bit in [0, 1]: + for exponent in range(FP32_EXP_MIN - FP32_FRACTION_BITS - 1, FP32_EXP_MAX + 2): + denominator = 2**(FP16_FRACTION_BITS + 1) + for numerator in range(0, denominator): + fraction = numerator / float(denominator) + for y in [-1.0, 0.0, 1.0]: + small_delta = y / 2**FP32_FRACTION_BITS + val = (-1.0)**sign_bit * 2.0**exponent * (1.0 + fraction + small_delta) + yield val + # Add np.nan as a final data value to process + yield np.nan + + input_np = np.array(list(get_data())).astype(np.float32) + # The intermediate cast to np.float64 below gets around a numpy rounding bug that is fixed + # as of numpy 1.17 by PR /~https://github.com/numpy/numpy/pull/12722 + expected_output = input_np.astype(np.float64).astype(np.float16) + + x = mx.sym.Variable('x', dtype=np.float32) + sym = mx.sym.Cast(x, dtype=np.float16) + ctx = default_context() + exe = sym.bind(ctx, {'x' : mx.nd.array(input_np, dtype=np.float32, ctx=ctx)}) + assert exe.arg_arrays[0].dtype == np.float32 + assert exe.outputs[0].dtype == np.float16 + exe.forward(is_train=False) + sym_output = exe.outputs[0].asnumpy() + for fp32_val, model_fp16_val, np_fp16_val in zip(input_np, sym_output, expected_output): + assert (model_fp16_val == np_fp16_val) or \ + (np.isnan(model_fp16_val) and np.isnan(np_fp16_val)), \ + 'fp32->fp16 cast mismatch: with fp32 value {}, model_fp16 = {}, numpy_fp16 = {}'.format( + fp32_val, model_fp16_val, np_fp16_val) + + @with_seed() def test_repeat(): def test_repeat_forward(): @@ -5904,10 +5952,10 @@ def check_correctness(executor, input, ratio): elif ratio == 0: assert output_zeroes == 0 - def check_dropout_ratio(ratio, shape): + def check_dropout_ratio(ratio, shape, cudnn_off=True): # test dropout x = mx.sym.var('data') - y = mx.sym.Dropout(x, p=ratio) + y = mx.sym.Dropout(x, p=ratio, cudnn_off=cudnn_off) exe = y.simple_bind(ctx=default_context(), data=shape) if ratio == 1: @@ -5945,7 +5993,7 @@ def check_dropout_ratio(ratio, shape): # test permanent dropout x = mx.sym.var('data') - y = mx.sym.Dropout(x, p=ratio, mode='always') + y = mx.sym.Dropout(x, p=ratio, mode='always', cudnn_off=cudnn_off) exe = y.simple_bind(ctx=default_context(), data=shape) exe.arg_arrays[0][:] = 1 @@ -5970,24 +6018,45 @@ def get_slice(x, axis, idx): ix += (slice(None, None, None),) return x[ix] - def check_dropout_axes(ratio, shape, axes): + def check_dropout_axes(ratio, shape, axes, cudnn_off=True): compactshape = list(shape) for axis in axes: compactshape[axis] = 1 compactx = mx.random.uniform(shape=tuple(compactshape)) broadcastx = compactx.broadcast_to(shape) - dropouty = mx.nd.Dropout(broadcastx, p=ratio, axes=axes) + dropouty = mx.nd.Dropout(broadcastx, p=ratio, axes=axes, cudnn_off=cudnn_off) for axis in axes: target = get_slice(dropouty, axis, 0).asnumpy() for i in range(1, shape[axis]): assert(get_slice(dropouty, axis, i).asnumpy() == target).all() + def check_passthrough(ratio, shape, cudnn_off=True): + # test inference_mode forward and then backward + a = mx.random.uniform(shape=shape) + a.attach_grad() + with mx.autograd.record(train_mode=False): + b = mx.nd.Dropout(a, ratio, cudnn_off=cudnn_off) # dropout acts as identity + b.backward() + assert_almost_equal(a.grad.asnumpy(), mx.nd.ones_like(b).asnumpy()) + shape = (100, 100) check_dropout_ratio(0.5, shape) check_dropout_ratio(0.0, shape) check_dropout_ratio(1.0, shape) check_dropout_ratio(0.75, shape) check_dropout_ratio(0.25, shape) + check_dropout_ratio(0.5, shape, cudnn_off=False) + check_dropout_ratio(0.0, shape, cudnn_off=False) + 
check_dropout_ratio(1.0, shape, cudnn_off=False) + check_dropout_ratio(0.75, shape, cudnn_off=False) + check_dropout_ratio(0.25, shape, cudnn_off=False) + + check_passthrough(0.5, shape) + check_passthrough(0.0, shape) + check_passthrough(1.0, shape) + check_passthrough(0.5, shape, cudnn_off=False) + check_passthrough(0.0, shape, cudnn_off=False) + check_passthrough(1.0, shape, cudnn_off=False) nshape = (10, 10, 10, 10) with mx.autograd.train_mode(): @@ -6004,6 +6073,20 @@ def check_dropout_axes(ratio, shape, axes): check_dropout_axes(0.25, nshape, axes = (0, 1, 2)) check_dropout_axes(0.25, nshape, axes = (0, 2, 3)) check_dropout_axes(0.25, nshape, axes = (1, 2, 3)) + check_dropout_axes(0.25, nshape, axes = (0,), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (1,), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (2,), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (3,), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (0, 1), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (0, 2), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (0, 3), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (1, 2), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (1, 3), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (2, 3), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (0, 1, 2), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (0, 2, 3), cudnn_off=False) + check_dropout_axes(0.25, nshape, axes = (1, 2, 3), cudnn_off=False) + @unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at /~https://github.com/apache/incubator-mxnet/issues/11290") @@ -6530,6 +6613,11 @@ def check_bilinear_resize_op(shape, height, width): x = mx.nd.random.uniform(shape=shape) y = mx.nd.contrib.BilinearResize2D(x, height=height, width=width) assert_almost_equal(y.asnumpy(), py_bilinear_resize(x.asnumpy(), height, width)) + + x_scale = width / shape[-1] + y_scale = height / shape[-2] + y = mx.nd.contrib.BilinearResize2D(x, scale_height=y_scale, scale_width=x_scale) + assert_almost_equal(y.asnumpy(), py_bilinear_resize(x.asnumpy(), height, width)) shape = (2, 2, 10, 10) check_bilinear_resize_op(shape, 5, 5) check_bilinear_resize_op(shape, 10, 10) @@ -6735,7 +6823,7 @@ def get_output_names_callback(name, arr): output_names.append(py_str(name)) op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null') - op_exe.set_monitor_callback(get_output_names_callback) + op_exe.set_monitor_callback(get_output_names_callback, monitor_all=False) op_exe.forward() for output_name, expected_name in zip(output_names, expected_names): assert output_name == expected_name @@ -6773,6 +6861,52 @@ def get_output_names_callback(name, arr): name='pooling') check_name(us_sym, ['pooling_output']) +@unittest.skip("skip for ngraph-mxnet as output names are handled internally") +def test_op_all_names_monitor(): + def check_name(op_sym, expected_names): + output_names = [] + + def get_output_names_callback(name, arr): + output_names.append(py_str(name)) + + op_exe = op_sym.simple_bind(ctx=mx.current_context(), grad_req='null') + op_exe.set_monitor_callback(get_output_names_callback, monitor_all=True) + op_exe.forward() + for output_name, expected_name in zip(output_names, expected_names): + assert output_name == expected_name + + data = mx.sym.Variable('data', shape=(10, 3, 10, 10)) + conv_sym = mx.sym.Convolution(data, kernel=(2, 2), num_filter=1, name='conv') + check_name(conv_sym, ['data', 
'conv_data', 'conv_weight', 'conv_weight', 'conv_bias', 'conv_bias', 'conv_output']) + + deconv_sym = mx.sym.Deconvolution(data, kernel=(2, 2), num_filter=1, name='deconv') + check_name(deconv_sym, ['data', 'deconv_data', 'deconv_weight', 'deconv_weight', 'deconv_output']) + + fc_sym = mx.sym.FullyConnected(data, num_hidden=10, name='fc') + check_name(fc_sym, ['data', 'fc_data', 'fc_weight', 'fc_weight', 'fc_bias', 'fc_bias', 'fc_output']) + + lrn_sym = mx.sym.LRN(data, nsize=1, name='lrn') + check_name(lrn_sym, ['data', 'lrn_data', 'lrn_output', 'lrn_tmp_norm']) + + act_sym = mx.sym.Activation(data, act_type='relu', name='act') + check_name(act_sym, ['data', 'act_input0', 'act_output']) + + cc_sym = mx.sym.concat(data, data, dim=0, name='concat') + check_name(cc_sym, ['data', 'concat_arg0', 'data', 'concat_arg1', 'concat_output']) + + sm_sym = mx.sym.softmax(data, name='softmax') + check_name(sm_sym, ['data', 'softmax_input0', 'softmax_output']) + + sa_sym = mx.sym.SoftmaxActivation(data, name='softmax') + check_name(sa_sym, ['data', 'softmax_input0', 'softmax_output']) + + us_sym = mx.sym.UpSampling(data, scale=2, sample_type='nearest', + name='upsampling') + check_name(us_sym, ['data', 'upsampling_arg0', 'upsampling_output']) + + us_sym = mx.sym.Pooling(data, kernel=(2, 2), pool_type='avg', + name='pooling') + check_name(us_sym, ['data', 'pooling_data', 'pooling_output']) @with_seed() def test_activation(): @@ -6851,139 +6985,150 @@ def test_context_num_gpus(): @with_seed() def test_op_roi_align(): - # Adapted from /~https://github.com/wkcn/MobulaOP/blob/master/tests/test_roi_align_op.py + T = np.float32 + + def assert_same_dtype(dtype_a, dtype_b): + ''' + Assert whether the two data type are the same + Parameters + ---------- + dtype_a, dtype_b: type + Input data types to compare + ''' + assert dtype_a == dtype_b,\ + TypeError('Unmatched data types: %s vs %s' % (dtype_a, dtype_b)) + def bilinear_interpolate(bottom, height, width, y, x): if y < -1.0 or y > height or x < -1.0 or x > width: - return 0.0, [] - x = max(0.0, x) - y = max(0.0, y) + return T(0.0), [] + x = T(max(0.0, x)) + y = T(max(0.0, y)) x_low = int(x) y_low = int(y) if x_low >= width - 1: x_low = x_high = width - 1 - x = x_low + x = T(x_low) else: x_high = x_low + 1 - if y_low >= height - 1: y_low = y_high = height - 1 - y = y_low + y = T(y_low) else: y_high = y_low + 1 - - ly = y - y_low - lx = x - x_low - hy = 1.0 - ly - hx = 1.0 - lx - + ly = y - T(y_low) + lx = x - T(x_low) + hy = T(1.0) - ly + hx = T(1.0) - lx v1 = bottom[y_low, x_low] v2 = bottom[y_low, x_high] v3 = bottom[y_high, x_low] v4 = bottom[y_high, x_high] - - ''' - ----------->x - |hx hy | lx hy - |------+------ - |hx ly | lx ly - V - y - v1|v2 - --+-- - v3|v4 - ''' w1 = hy * hx w2 = hy * lx w3 = ly * hx w4 = ly * lx - + assert_same_dtype(w1.dtype, T) + assert_same_dtype(w2.dtype, T) + assert_same_dtype(w3.dtype, T) + assert_same_dtype(w4.dtype, T) val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + assert_same_dtype(val.dtype, T) grad = [(y_low, x_low, w1), (y_low, x_high, w2), (y_high, x_low, w3), (y_high, x_high, w4) - ] + ] return val, grad def roialign_forward_backward(data, rois, pooled_size, spatial_scale, sampling_ratio, - position_sensitive, dy): + position_sensitive, dy): N, C, H, W = data.shape R = rois.shape[0] PH, PW = pooled_size - assert len(rois.shape) == 2 - assert rois.shape[1] == 5 + assert rois.ndim == 2,\ + ValueError( + 'The ndim of rois should be 2 rather than %d' % rois.ndim) + assert rois.shape[1] == 5,\ + ValueError( + 'The length of 
the axis 1 of rois should be 5 rather than %d' % rois.shape[1]) + assert_same_dtype(data.dtype, T) + assert_same_dtype(rois.dtype, T) C_out = C // PH // PW if position_sensitive else C - out = np.zeros((R, C_out, PH, PW)) + out = np.zeros((R, C_out, PH, PW), dtype=T) dx = np.zeros_like(data) drois = np.zeros_like(rois) for r in range(R): batch_ind = int(rois[r, 0]) - sw, sh, ew, eh = rois[r, 1:5] * spatial_scale - roi_w = max(ew - sw, 1.0) - roi_h = max(eh - sh, 1.0) - bin_h = roi_h * 1.0 / PH - bin_w = roi_w * 1.0 / PW + sw, sh, ew, eh = rois[r, 1:5] * T(spatial_scale) + roi_w = T(max(ew - sw, 1.0)) + roi_h = T(max(eh - sh, 1.0)) + bin_h = roi_h / T(PH) + bin_w = roi_w / T(PW) bdata = data[batch_ind] if sampling_ratio > 0: roi_bin_grid_h = roi_bin_grid_w = sampling_ratio else: - roi_bin_grid_h = int(np.ceil(roi_h * 1.0 / PH)) - roi_bin_grid_w = int(np.ceil(roi_w * 1.0 / PW)) - count = roi_bin_grid_h * roi_bin_grid_w + roi_bin_grid_h = int(np.ceil(roi_h / T(PH))) + roi_bin_grid_w = int(np.ceil(roi_w / T(PW))) + count = T(roi_bin_grid_h * roi_bin_grid_w) for c in range(C_out): for ph in range(PH): for pw in range(PW): - val = 0.0 + val = T(0.0) c_in = c * PH * PW + ph * PW + pw if position_sensitive else c for iy in range(roi_bin_grid_h): - y = sh + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h + y = sh + T(ph) * bin_h + (T(iy) + T(0.5)) * \ + bin_h / T(roi_bin_grid_h) for ix in range(roi_bin_grid_w): - x = sw + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w - v, g = bilinear_interpolate(bdata[c_in], H, W, y, x) + x = sw + T(pw) * bin_w + (T(ix) + T(0.5)) * \ + bin_w / T(roi_bin_grid_w) + v, g = bilinear_interpolate( + bdata[c_in], H, W, y, x) + assert_same_dtype(v.dtype, T) val += v # compute grad for qy, qx, qw in g: - dx[batch_ind, c_in, qy, qx] += dy[r, c, ph, pw] * qw * 1.0 / count - - out[r, c, ph, pw] = val * 1.0 / count + assert_same_dtype(qw.dtype, T) + dx[batch_ind, c_in, qy, qx] += dy[r, + c, ph, pw] * qw / count + out[r, c, ph, pw] = val / count + assert_same_dtype(out.dtype, T) return out, [dx, drois] def test_roi_align_value(sampling_ratio=0, position_sensitive=False): - ctx=default_context() + ctx = default_context() dtype = np.float32 - dlen = 224 N, C, H, W = 5, 3, 16, 16 - assert H == W R = 7 pooled_size = (3, 4) C = C * pooled_size[0] * pooled_size[1] if position_sensitive else C - spatial_scale = H * 1.0 / dlen - data = mx.nd.array(np.arange(N*C*W*H).reshape((N,C,H,W)), ctx=ctx, dtype = dtype) - # data = mx.nd.random.uniform(0, 1, (N, C, H, W), dtype = dtype) - center_xy = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype = dtype) - wh = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype = dtype) - batch_ind = mx.nd.array(np.random.randint(0, N, size = (R,1)), ctx=ctx) - pos = mx.nd.concat(center_xy - wh / 2, center_xy + wh / 2, dim = 1) - rois = mx.nd.concat(batch_ind, pos, dim = 1) + data = mx.nd.array( + np.arange(N * C * W * H).reshape((N, C, H, W)), ctx=ctx, dtype=dtype) + center_xy = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype=dtype) + wh = mx.nd.random.uniform(0, dlen, (R, 2), ctx=ctx, dtype=dtype) + batch_ind = mx.nd.array(np.random.randint(0, N, size=(R, 1)), ctx=ctx) + pos = mx.nd.concat(center_xy - wh / 2, center_xy + wh / 2, dim=1) + rois = mx.nd.concat(batch_ind, pos, dim=1) data.attach_grad() rois.attach_grad() with mx.autograd.record(): output = mx.nd.contrib.ROIAlign(data, rois, pooled_size=pooled_size, - spatial_scale=spatial_scale, sample_ratio=sampling_ratio, - position_sensitive=position_sensitive) + spatial_scale=spatial_scale, 
sample_ratio=sampling_ratio, + position_sensitive=position_sensitive) C_out = C // pooled_size[0] // pooled_size[1] if position_sensitive else C - dy = mx.nd.random.uniform(-1, 1, (R, C_out) + pooled_size, ctx=ctx, dtype = dtype) + dy = mx.nd.random.uniform(-1, 1, (R, C_out) + + pooled_size, ctx=ctx, dtype=dtype) output.backward(dy) real_output, [dx, drois] = roialign_forward_backward(data.asnumpy(), rois.asnumpy(), pooled_size, spatial_scale, sampling_ratio, position_sensitive, dy.asnumpy()) - assert np.allclose(output.asnumpy(), real_output) - # It seems that the precision between Cfloat and Pyfloat is different. - assert np.allclose(data.grad.asnumpy(), dx, atol = 1e-5), np.abs(data.grad.asnumpy() - dx).max() - assert np.allclose(rois.grad.asnumpy(), drois) + + assert_almost_equal(output.asnumpy(), real_output, atol=1e-3) + assert_almost_equal(data.grad.asnumpy(), dx, atol=1e-3) + assert_almost_equal(rois.grad.asnumpy(), drois, atol=1e-3) # modified from test_roipooling() def test_roi_align_autograd(sampling_ratio=0): @@ -6998,10 +7143,10 @@ def test_roi_align_autograd(sampling_ratio=0): [1, 3.1, 1.1, 5.2, 10.2]], dtype='float64') check_numeric_gradient(sym=test, location=[x1, x2], - grad_nodes={'data':'write', 'rois':'null'}, + grad_nodes={'data': 'write', 'rois': 'null'}, numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx) check_numeric_gradient(sym=test, location=[x1, x2], - grad_nodes={'data':'add', 'rois':'null'}, + grad_nodes={'data': 'add', 'rois': 'null'}, numeric_eps=1e-4, rtol=1e-1, atol=1e-4, ctx=ctx) test_roi_align_value() @@ -7009,6 +7154,7 @@ def test_roi_align_autograd(sampling_ratio=0): test_roi_align_value(position_sensitive=True) test_roi_align_autograd() + @with_seed() def test_diag(): @@ -7246,6 +7392,25 @@ def f_sm_ce(data, label): check_symbolic_forward(sym, {'data' : np_data, 'label' : np_label}, [np.array([f_sm_ce(np_sm, np_one_hot_label)])], rtol=1e-3, atol=1e-5) +@with_seed() +def test_split_v2(): + dim = random.randint(2, 6) + shape = rand_shape_nd(dim) + axis = random.randint(-dim, dim-1) + axis_size = shape[axis] + samples = random.randint(0, axis_size - 1) + indices = sorted(random.sample([i for i in range(1, axis_size)], samples)) + indices = tuple(indices) + mx_data = rand_ndarray(shape) + np_data = mx_data.asnumpy() + np_out = np.split(np_data, indices_or_sections=indices, axis=axis) + data = mx.sym.Variable("data") + sym = mx.sym.split_v2(data, indices_or_sections=indices, axis=axis) + check_symbolic_forward(sym, {"data": mx_data}, np_out, rtol=1e-3, atol=1e-5) + out_grad = [np.ones(arr.shape) for arr in np_out] + check_symbolic_backward(sym, {"data": mx_data}, out_grad, [np.concatenate(out_grad, axis=axis)]) + + @with_seed() def test_invalid_kernel_size(): invalid_kernel_size = 28 @@ -7298,6 +7463,73 @@ def test_invalid_max_pooling_pad_type_same(): name='pooling', pooling_convention="same") + +@with_seed() +def test_image_normalize(): + # Part 1 - Test 3D Input + shape_3d = (3, 28, 28) + mean = (0, 1, 2) + std = (3, 2, 1) + + data_in_3d = mx.nd.random.uniform(0, 1, shape_3d) + data_expected_3d = data_in_3d.asnumpy() + data_expected_3d[:][:][0] = data_expected_3d[:][:][0] / 3.0 + data_expected_3d[:][:][1] = (data_expected_3d[:][:][1] - 1.0) / 2.0 + data_expected_3d[:][:][2] = data_expected_3d[:][:][2] - 2.0 + + data = mx.symbol.Variable('data') + img_norm_sym = mx.sym.image.normalize(data=data, mean=mean, std=std) + + # check forward + check_symbolic_forward(img_norm_sym, [data_in_3d], [data_expected_3d], + rtol=1e-5, atol=1e-5) + + # Gradient is 1/std_dev 
+ grad_expected_3d = np.ones(shape_3d) + grad_expected_3d[:][:][0] = 1 / 3.0 + grad_expected_3d[:][:][1] = 1 / 2.0 + grad_expected_3d[:][:][2] = 1 / 1.0 + + # check backward + check_symbolic_backward(img_norm_sym, location=[data_in_3d], out_grads=[mx.nd.ones(shape_3d)], + expected=[grad_expected_3d], rtol=1e-5, atol=1e-5) + + # check backward using finite difference + check_numeric_gradient(img_norm_sym, [data_in_3d], atol=0.001) + + # Part 2 - Test 4D Input + shape_4d = (2, 3, 28, 28) + + data_in_4d = mx.nd.random.uniform(0, 1, shape_4d) + data_expected_4d = data_in_4d.asnumpy() + data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0 + data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0 + data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0 + data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0 + data_expected_4d[1][:][:][1] = (data_expected_4d[1][:][:][1] - 1.0) / 2.0 + data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0 + + # check forward + check_symbolic_forward(img_norm_sym, [data_in_4d], [data_expected_4d], + rtol=1e-5, atol=1e-5) + + # Gradient is 1/std_dev + grad_expected_4d = np.ones(shape_4d) + grad_expected_4d[0][:][:][0] = 1 / 3.0 + grad_expected_4d[0][:][:][1] = 1 / 2.0 + grad_expected_4d[0][:][:][2] = 1 / 1.0 + grad_expected_4d[1][:][:][0] = 1 / 3.0 + grad_expected_4d[1][:][:][1] = 1 / 2.0 + grad_expected_4d[1][:][:][2] = 1 / 1.0 + + # check backward + check_symbolic_backward(img_norm_sym, location=[data_in_4d], out_grads=[mx.nd.ones(shape_4d)], + expected=[grad_expected_4d], rtol=1e-5, atol=1e-5) + + # check backward using finite difference + check_numeric_gradient(img_norm_sym, [data_in_4d], atol=0.001) + + if __name__ == '__main__': import nose nose.runmodule() diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index eb33f9b52..3fdd1cd6b 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -385,10 +385,10 @@ def update(self, index, weight, grad, state): else: mom = state mom[:] *= self.momentum - grad += wd * weight mom[:] += grad + mom[:] += wd * weight grad[:] += self.momentum * mom - weight[:] += -lr * grad + weight[:] -= lr * grad else: grad32 = array(grad, ctx=grad.context, dtype=np.float32) grad32 = grad32 * self.rescale_grad @@ -400,10 +400,10 @@ def update(self, index, weight, grad, state): weight32[:] += -lr * (grad32 + wd * weight32) else: mom[:] *= self.momentum - grad32 += wd * weight32 mom[:] += grad32 + mom[:] += wd * weight32 grad32[:] += self.momentum * mom - weight32[:] += -lr * grad32 + weight32[:] -= lr * grad32 tmp = weight32.astype(weight.dtype) tmp.copyto(weight) @@ -435,6 +435,90 @@ def test_nag(): continue compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) +#SGLD +class PySGLD(mx.optimizer.Optimizer): + """python reference implementation of SGLD""" + + def __init__(self, **kwargs): + super(PySGLD, self).__init__(**kwargs) + + def create_state(self, index, weight): + return None + + def update(self, index, weight, grad, state): + assert(isinstance(weight, mx.nd.NDArray)) + assert(isinstance(grad, mx.nd.NDArray)) + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + + grad = grad * self.rescale_grad + if self.clip_gradient is not None: + grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + weight[:] += - lr/2 * (grad + wd * weight) + mx.random.normal(0, math.sqrt(lr), shape=weight.shape, + dtype=weight.dtype, ctx=weight.context) + + 
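The `PySGLD` reference above implements stochastic gradient Langevin dynamics: a half-step of gradient descent on the weight-decayed objective plus zero-mean Gaussian noise whose standard deviation is the square root of the learning rate. A self-contained numpy sketch of the same update (the hyperparameter values here are illustrative only):

```python
import numpy as np

def sgld_step(weight, grad, lr=0.01, wd=0.0, rng=np.random):
    """One SGLD update: a half-step of gradient descent on the
    weight-decayed objective plus N(0, lr) Gaussian noise."""
    noise = rng.normal(0.0, np.sqrt(lr), size=weight.shape)
    return weight - (lr / 2.0) * (grad + wd * weight) + noise

w = np.zeros(3)
g = np.array([1.0, -2.0, 0.5])
w = sgld_step(w, g, lr=0.01, wd=0.05)  # one noisy update
```

Fixing the RNG seed before each update, as the comparison helper in the test below does with `mx.random.seed`, is what makes the reference and native updates directly comparable despite the injected noise.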
+ +@with_seed() +def test_sgld(): + opt1 = PySGLD + opt2 = mx.optimizer.SGLD + shape = (3, 4, 5) + ns_options = [1234, 42] + + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + + + def compare_optimizer_noise_seeded(opt1, opt2, shape, dtype, noise_seed, + w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2 with the added functionality that the seed for generating random noise + in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. + + """ + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + + state1 = opt1.create_state_multi_precision(0, w1) + state2 = opt2.create_state_multi_precision(0, w2) + if compare_states: + compare_ndarray_tuple(state1, state2) + + # set seed for Gaussian noise replication + mx.random.seed(noise_seed) + opt1.update_multi_precision(0, w1, g1, state1) + mx.random.seed(noise_seed) + opt2.update_multi_precision(0, w2, g2, state2) + if compare_states: + compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) + assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) + + for seed in ns_options: + for dtype in [np.float16, np.float32, np.float64]: + for params in itertools.product(cg_options, wd_options, mp_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer_noise_seeded(opt1(**kwarg), opt2(**kwarg), shape, dtype, seed) + # FTML diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py index 405602f07..4e31a23a9 100644 --- a/tests/python/unittest/test_random.py +++ b/tests/python/unittest/test_random.py @@ -582,6 +582,7 @@ def test_poisson_generator(): for _ in range(10)]) verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs) +@unittest.skip("Flaky test. 
Tracked in /~https://github.com/apache/incubator-mxnet/issues/13506") @with_seed() def test_negative_binomial_generator(): ctx = mx.context.current_context() @@ -882,7 +883,7 @@ def test_randint_generator(): @with_seed() def test_randint_without_dtype(): a = mx.nd.random.randint(low=50000000, high=50000010, ctx=mx.context.current_context()) - assert(a.dtype, 'int32') + assert a.dtype == np.int32 if __name__ == '__main__': import nose diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py index 644a61183..429527db2 100644 --- a/tests/tutorials/test_sanity_tutorials.py +++ b/tests/tutorials/test_sanity_tutorials.py @@ -28,6 +28,7 @@ 'c++/basics.md', 'c++/index.md', 'c++/subgraphAPI.md', + 'c++/mxnet_cpp_inference_tutorial.md', 'control_flow/index.md', 'embedded/index.md', 'embedded/wine_detector.md', diff --git a/tests/tutorials/test_tutorials.py b/tests/tutorials/test_tutorials.py index 8d8ef398d..37ba9918f 100644 --- a/tests/tutorials/test_tutorials.py +++ b/tests/tutorials/test_tutorials.py @@ -151,6 +151,9 @@ def test_python_logistic_regression() : def test_python_numpy_gotchas() : assert _test_tutorial_nb('gluon/gotchas_numpy_in_mxnet') +def test_gluon_end_to_end(): + assert _test_tutorial_nb('gluon/gluon_from_experiment_to_deployment') + def test_python_mnist(): assert _test_tutorial_nb('python/mnist') diff --git a/tools/coreml/converter/_layers.py b/tools/coreml/converter/_layers.py index d0113a915..8f4bc1a8a 100644 --- a/tools/coreml/converter/_layers.py +++ b/tools/coreml/converter/_layers.py @@ -472,11 +472,14 @@ def convert_batchnorm(net, node, module, builder): inputs = node['inputs'] - eps = 1e-3 # Default value of eps for MXNet. - use_global_stats = False # Default value of use_global_stats for MXNet. + eps = 1e-3 # Default value of eps for MXNet. + use_global_stats = False # Default value of use_global_stats for MXNet. + fix_gamma = True # Default value of fix_gamma for MXNet. attrs = _get_attrs(node) if 'eps' in attrs: eps = literal_eval(attrs['eps']) + if 'fix_gamma' in attrs: + fix_gamma = literal_eval(attrs['fix_gamma']) args, aux = module.get_params() gamma = args[_get_node_name(net, inputs[1][0])].asnumpy() @@ -484,6 +487,8 @@ def convert_batchnorm(net, node, module, builder): mean = aux[_get_node_name(net, inputs[3][0])].asnumpy() variance = aux[_get_node_name(net, inputs[4][0])].asnumpy() nb_channels = gamma.shape[0] + if fix_gamma: + gamma.fill(1.) builder.add_batchnorm( name=name, channels=nb_channels, diff --git a/tools/coreml/test/test_mxnet_converter.py b/tools/coreml/test/test_mxnet_converter.py index 5d26c5faf..bc850690a 100644 --- a/tools/coreml/test/test_mxnet_converter.py +++ b/tools/coreml/test/test_mxnet_converter.py @@ -938,6 +938,40 @@ def test_batch_norm_no_global_stats(self): name='batch_norm_1') self._test_mxnet_model(net, input_shape=input_shape, mode='random', delta=1e-2) + def test_batch_norm_with_fix_gamma(self): + """ The gamma will always be an array of ones when fix_gamma=True. The values + of gamma may be changed accidentally if there have been fix_gamma=False before + the final trained model. 
+ """ + np.random.seed(1988) + input_shape = (1, 1, 2, 3) + + net = mx.sym.Variable('data') + gamma = mx.sym.Variable('gamma') + beta = mx.sym.Variable('beta') + moving_mean = mx.sym.Variable('moving_mean') + moving_var = mx.sym.Variable('moving_var') + net = mx.symbol.BatchNorm( + data=net, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + fix_gamma=True, + name='batch_norm_1') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', delta=1e-2) + + np.random.seed(1988) + net = mx.symbol.BatchNorm( + data=net, + gamma=gamma, + beta=beta, + moving_mean=moving_mean, + moving_var=moving_var, + fix_gamma=False, + name='batch_norm_2') + self._test_mxnet_model(net, input_shape=input_shape, mode='random', delta=1e-2) + def test_pre_processing_args(self): np.random.seed(1988) input_shape = (1, 10) diff --git a/tools/dependencies/README.md b/tools/dependencies/README.md index cfe3d6c75..c30e85d51 100644 --- a/tools/dependencies/README.md +++ b/tools/dependencies/README.md @@ -12,3 +12,14 @@ The scripts use the following environment variables for setting behavior: `PLATFORM`: name of the OS in lower case. Supported options are 'linux' and 'darwin'. It also expects the following build tools in path: make, cmake, tar, unzip, autoconf, nasm + +# FAQ + +## Build failure regarding to gcc, g++, gfortran +Currently, we only support gcc-4.8 build. It's your own choice to use a higher version of gcc. Please make sure your gcc, g++ and gfortran always have the same version in order to eliminate build failure. + +## idn2 not found +This issue appeared in the OSX build with XCode version 8.0 above (reproduced on 9.2). Please add the following build flag in `curl.sh` if your XCode version is more than 8.0: +``` +--without-libidn2 +``` \ No newline at end of file diff --git a/tools/dependencies/cityhash.sh b/tools/dependencies/cityhash.sh index 81cc9cbaa..6bc663e90 100755 --- a/tools/dependencies/cityhash.sh +++ b/tools/dependencies/cityhash.sh @@ -18,15 +18,17 @@ # under the License. # This script builds the static library of cityhash that can be used as dependency of mxnet. +set -ex CITYHASH_VERSION=1.1.1 if [[ ! -f $DEPS_PATH/lib/libcityhash.a ]]; then # Download and build cityhash >&2 echo "Building cityhash..." git clone /~https://github.com/google/cityhash $DEPS_PATH/cityhash-$CITYHASH_VERSION + pushd . cd $DEPS_PATH/cityhash-$CITYHASH_VERSION git reset --hard 8af9b8c2b889d80c22d6bc26ba0df1afb79a30db ./configure -prefix=$DEPS_PATH --enable-sse4.2 - make CXXFLAGS="-g -O3 -msse4.2" - make install - cd - + $MAKE CXXFLAGS="-g -O3 -msse4.2" + $MAKE install + popd fi diff --git a/tools/dependencies/curl.sh b/tools/dependencies/curl.sh index 9633edb78..bb947715f 100755 --- a/tools/dependencies/curl.sh +++ b/tools/dependencies/curl.sh @@ -18,12 +18,14 @@ # under the License. # This script builds the static library of libcurl that can be used as dependency of mxnet. +set -ex LIBCURL_VERSION=7.61.0 if [[ ! -f $DEPS_PATH/lib/libcurl.a ]]; then # download and build libcurl >&2 echo "Building libcurl..." curl -s -L https://curl.haxx.se/download/curl-$LIBCURL_VERSION.zip -o $DEPS_PATH/libcurl.zip unzip -q $DEPS_PATH/libcurl.zip -d $DEPS_PATH + pushd . cd $DEPS_PATH/curl-$LIBCURL_VERSION if [[ $PLATFORM == 'linux' ]]; then CONFIG_FLAG="" @@ -58,7 +60,7 @@ if [[ ! 
-f $DEPS_PATH/lib/libcurl.a ]]; then --disable-gopher \ --disable-manual \ --prefix=$DEPS_PATH - make - make install - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/dependencies/eigen.sh b/tools/dependencies/eigen.sh old mode 100755 new mode 100644 index ac2f75a03..a0cd8fcc9 --- a/tools/dependencies/eigen.sh +++ b/tools/dependencies/eigen.sh @@ -18,6 +18,7 @@ # under the License. # This script imports the headers from eigen3 that can be used in opencv. +set -ex EIGEN_VERSION=3.3.4 if [[ ! -d $DEPS_PATH/include/eigen3 ]]; then # download eigen @@ -25,10 +26,11 @@ if [[ ! -d $DEPS_PATH/include/eigen3 ]]; then curl -s -L /~https://github.com/eigenteam/eigen-git-mirror/archive/$EIGEN_VERSION.zip -o $DEPS_PATH/eigen.zip unzip -q $DEPS_PATH/eigen.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/eigen-git-mirror-$EIGEN_VERSION/build + pushd . cd $DEPS_PATH/eigen-git-mirror-$EIGEN_VERSION/build cmake \ -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=$DEPS_PATH .. - make install - cd - + $MAKE install + popd fi diff --git a/tools/dependencies/libpng.sh b/tools/dependencies/libpng.sh index d1523c654..3faa9f027 100755 --- a/tools/dependencies/libpng.sh +++ b/tools/dependencies/libpng.sh @@ -18,6 +18,7 @@ # under the License. # This script builds the static library of libpng that can be used as dependency of mxnet/opencv. +set -ex PNG_VERSION=1.6.34 if [[ ! -f $DEPS_PATH/lib/libpng.a ]]; then # download and build libpng @@ -25,6 +26,7 @@ if [[ ! -f $DEPS_PATH/lib/libpng.a ]]; then curl -s -L /~https://github.com/glennrp/libpng/archive/v$PNG_VERSION.zip -o $DEPS_PATH/libpng.zip unzip -q $DEPS_PATH/libpng.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/libpng-$PNG_VERSION/build + pushd . cd $DEPS_PATH/libpng-$PNG_VERSION/build cmake \ -D PNG_SHARED=OFF \ @@ -32,9 +34,9 @@ if [[ ! -f $DEPS_PATH/lib/libpng.a ]]; then -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \ -D CMAKE_C_FLAGS=-fPIC .. - make - make install + $MAKE + $MAKE install mkdir -p $DEPS_PATH/include/libpng ln -s $DEPS_PATH/include/png.h $DEPS_PATH/include/libpng/png.h - cd - + popd fi diff --git a/tools/dependencies/libtiff.sh b/tools/dependencies/libtiff.sh index 14dcb2d7b..2a402bbca 100755 --- a/tools/dependencies/libtiff.sh +++ b/tools/dependencies/libtiff.sh @@ -18,15 +18,17 @@ # under the License. # This script builds the static library of libtiff that can be used as dependency of mxnet/opencv. +set -ex TIFF_VERSION="4-0-9" if [[ ! -f $DEPS_PATH/lib/libtiff.a ]]; then # download and build libtiff >&2 echo "Building libtiff..." curl -s -L https://gitlab.com/libtiff/libtiff/-/archive/Release-v$TIFF_VERSION/libtiff-Release-v$TIFF_VERSION.zip -o $DEPS_PATH/libtiff.zip unzip -q $DEPS_PATH/libtiff.zip -d $DEPS_PATH + pushd . cd $DEPS_PATH/libtiff-Release-v$TIFF_VERSION ./configure --quiet --disable-shared --disable-jpeg --disable-zlib --disable-jbig --disable-lzma --prefix=$DEPS_PATH - make - make install - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/dependencies/libturbojpeg.sh b/tools/dependencies/libturbojpeg.sh index 4991906f8..ac813ebec 100755 --- a/tools/dependencies/libturbojpeg.sh +++ b/tools/dependencies/libturbojpeg.sh @@ -19,6 +19,7 @@ # This script builds the static library of libturbojpeg that can be used as dependency of # mxnet/opencv. +set -ex TURBO_JPEG_VERSION=1.5.90 if [[ $PLATFORM == 'darwin' ]]; then JPEG_NASM_OPTION="-D CMAKE_ASM_NASM_COMPILER=/usr/local/bin/nasm" @@ -30,6 +31,7 @@ if [[ ! -f $DEPS_PATH/lib/libjpeg.a ]] || [[ !
-f $DEPS_PATH/lib/libturbojpeg.a curl -s -L /~https://github.com/libjpeg-turbo/libjpeg-turbo/archive/$TURBO_JPEG_VERSION.zip -o $DEPS_PATH/libjpeg.zip unzip -q $DEPS_PATH/libjpeg.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/libjpeg-turbo-$TURBO_JPEG_VERSION/build + pushd . cd $DEPS_PATH/libjpeg-turbo-$TURBO_JPEG_VERSION/build cmake \ -G"Unix Makefiles" \ @@ -41,7 +43,7 @@ if [[ ! -f $DEPS_PATH/lib/libjpeg.a ]] || [[ ! -f $DEPS_PATH/lib/libturbojpeg.a -D WITH_JPEG8=TRUE \ $JPEG_NASM_OPTION \ -D ENABLE_SHARED=FALSE .. - make - make install - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/dependencies/libz.sh b/tools/dependencies/libz.sh old mode 100755 new mode 100644 index 927f1de82..c5f9953c8 --- a/tools/dependencies/libz.sh +++ b/tools/dependencies/libz.sh @@ -18,6 +18,7 @@ # under the License. # This script builds the static library of libz that can be used as dependency of mxnet. +set -ex ZLIB_VERSION=1.2.6 if [[ ! -f $DEPS_PATH/lib/libz.a ]]; then # Download and build zlib @@ -25,12 +26,13 @@ if [[ ! -f $DEPS_PATH/lib/libz.a ]]; then curl -s -L /~https://github.com/LuaDist/zlib/archive/$ZLIB_VERSION.zip -o $DEPS_PATH/zlib.zip unzip -q $DEPS_PATH/zlib.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/zlib-$ZLIB_VERSION/build + pushd . cd $DEPS_PATH/zlib-$ZLIB_VERSION/build cmake \ -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \ -D BUILD_SHARED_LIBS=OFF .. - make - make install - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/dependencies/lz4.sh b/tools/dependencies/lz4.sh index a4269bf29..478c9925e 100755 --- a/tools/dependencies/lz4.sh +++ b/tools/dependencies/lz4.sh @@ -18,14 +18,16 @@ # under the License. # This script builds the static library of lz4 that can be used as dependency of mxnet. +set -ex LZ4_VERSION=r130 if [[ ! -f $DEPS_PATH/lib/liblz4.a ]]; then # Download and build lz4 >&2 echo "Building lz4..." curl -s -L /~https://github.com/lz4/lz4/archive/$LZ4_VERSION.zip -o $DEPS_PATH/lz4.zip unzip -q $DEPS_PATH/lz4.zip -d $DEPS_PATH + pushd . cd $DEPS_PATH/lz4-$LZ4_VERSION - make - make PREFIX=$DEPS_PATH install - cd - + $MAKE + $MAKE PREFIX=$DEPS_PATH install + popd fi diff --git a/tools/dependencies/make_shared_dependencies.sh b/tools/dependencies/make_shared_dependencies.sh old mode 100755 new mode 100644 index d678fddcc..165085861 --- a/tools/dependencies/make_shared_dependencies.sh +++ b/tools/dependencies/make_shared_dependencies.sh @@ -38,3 +38,5 @@ source $DIR/protobuf.sh source $DIR/cityhash.sh source $DIR/zmq.sh source $DIR/lz4.sh + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib diff --git a/tools/dependencies/openblas.sh b/tools/dependencies/openblas.sh index 9463e3325..27f473919 100755 --- a/tools/dependencies/openblas.sh +++ b/tools/dependencies/openblas.sh @@ -18,6 +18,8 @@ # under the License. # This script builds the static library of openblas that can be used as dependency of mxnet. +set +e # This script throws an error but otherwise works +set -x OPENBLAS_VERSION=0.3.3 if [[ ! -e $DEPS_PATH/lib/libopenblas.a ]]; then # download and build openblas @@ -25,11 +27,12 @@ if [[ ! -e $DEPS_PATH/lib/libopenblas.a ]]; then curl -s -L /~https://github.com/xianyi/OpenBLAS/archive/v$OPENBLAS_VERSION.zip -o $DEPS_PATH/openblas.zip unzip -q $DEPS_PATH/openblas.zip -d $DEPS_PATH + pushd . 
cd $DEPS_PATH/OpenBLAS-$OPENBLAS_VERSION - make DYNAMIC_ARCH=1 NO_SHARED=1 USE_OPENMP=1 - make PREFIX=$DEPS_PATH install - cd - + $MAKE DYNAMIC_ARCH=1 NO_SHARED=1 USE_OPENMP=1 + $MAKE PREFIX=$DEPS_PATH install + popd ln -s libopenblas.a $DEPS_PATH/lib/libcblas.a ln -s libopenblas.a $DEPS_PATH/lib/liblapack.a fi diff --git a/tools/dependencies/opencv.sh b/tools/dependencies/opencv.sh index 99d0ecb71..11c9c2155 100755 --- a/tools/dependencies/opencv.sh +++ b/tools/dependencies/opencv.sh @@ -19,6 +19,7 @@ # This script builds the static library of opencv that can be used as dependency of mxnet. # It expects openblas, libjpeg, libpng, libtiff, eigen, etc., to be in $DEPS_PATH. +set -ex OPENCV_VERSION=3.4.2 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" if [[ $PLATFORM == 'linux' ]]; then @@ -41,6 +42,7 @@ if [[ ! -f $DEPS_PATH/lib/libopencv_core.a ]] || [[ ! -f $DEPS_PATH/lib/libopenc curl -s -L /~https://github.com/opencv/opencv/archive/$OPENCV_VERSION.zip -o $DEPS_PATH/opencv.zip unzip -q $DEPS_PATH/opencv.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/opencv-$OPENCV_VERSION/build + pushd . cd $DEPS_PATH/opencv-$OPENCV_VERSION/build cmake \ -D OPENCV_ENABLE_NONFREE=OFF \ @@ -184,9 +186,9 @@ if [[ ! -f $DEPS_PATH/lib/libopencv_core.a ]] || [[ ! -f $DEPS_PATH/lib/libopenc if [[ $PLATFORM == 'linux' ]]; then cp $DIR/patch/opencv_lapack.h ./ fi - make - make install - cd - + $MAKE + $MAKE install + popd # @szha: compatibility header cat $DEPS_PATH/include/opencv2/imgcodecs/imgcodecs_c.h >> $DEPS_PATH/include/opencv2/imgcodecs.hpp fi diff --git a/tools/dependencies/openssl.sh b/tools/dependencies/openssl.sh index b7e4317d4..93284db3e 100755 --- a/tools/dependencies/openssl.sh +++ b/tools/dependencies/openssl.sh @@ -18,6 +18,7 @@ # under the License. # This script builds the static library of openssl that can be used as dependency of mxnet. +set -ex OPENSSL_VERSION=1.0.2l if [[ ! -f $DEPS_PATH/lib/libssl.a ]] || [[ ! -f $DEPS_PATH/lib/libcrypto.a ]]; then # download and build openssl @@ -25,6 +26,7 @@ if [[ ! -f $DEPS_PATH/lib/libssl.a ]] || [[ ! -f $DEPS_PATH/lib/libcrypto.a ]]; OPENSSL_VERSION=$(echo $OPENSSL_VERSION | sed 's/\./_/g') curl -s -L /~https://github.com/openssl/openssl/archive/OpenSSL_$OPENSSL_VERSION.zip -o $DEPS_PATH/openssl.zip unzip -q $DEPS_PATH/openssl.zip -d $DEPS_PATH + pushd . cd $DEPS_PATH/openssl-OpenSSL_$OPENSSL_VERSION if [[ $PLATFORM == 'linux' ]]; then TARGET=linux-x86_64 @@ -32,7 +34,7 @@ if [[ ! -f $DEPS_PATH/lib/libssl.a ]] || [[ ! -f $DEPS_PATH/lib/libcrypto.a ]]; TARGET=darwin64-x86_64-cc fi ./Configure no-shared no-zlib --prefix=$DEPS_PATH --openssldir=$DEPS_PATH/ssl $TARGET - make - make install - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/dependencies/protobuf.sh b/tools/dependencies/protobuf.sh index 156470104..76ce1de8e 100755 --- a/tools/dependencies/protobuf.sh +++ b/tools/dependencies/protobuf.sh @@ -18,6 +18,7 @@ # under the License. # This script builds the static library of protobuf along with protoc, that can be used as dependency of mxnet. +set -ex PROTOBUF_VERSION=3.5.1 if [[ $PLATFORM == 'darwin' ]]; then DY_EXT="dylib" @@ -32,12 +33,12 @@ if [[ ! -e $LIBPROTOBUF ]] || [[ ! -e $LIBPROTOC ]]; then >&2 echo "Building protobuf..." curl -s -L /~https://github.com/google/protobuf/archive/v$PROTOBUF_VERSION.zip -o $DEPS_PATH/protobuf.zip unzip -q $DEPS_PATH/protobuf.zip -d $DEPS_PATH + pushd . 
cd $DEPS_PATH/protobuf-$PROTOBUF_VERSION ./autogen.sh ./configure -prefix=$DEPS_PATH - make - make install - cd - + $MAKE + $MAKE install + popd fi -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$(dirname $(find $DEPS_PATH -type f -name 'libprotoc*' | grep protobuf | head -n 1)):$DEPS_PATH/lib diff --git a/tools/dependencies/zmq.sh b/tools/dependencies/zmq.sh index 55e17798c..0042d6bcd 100755 --- a/tools/dependencies/zmq.sh +++ b/tools/dependencies/zmq.sh @@ -18,6 +18,7 @@ # under the License. # This script builds the static library of zeroMQ that can be used as dependency of mxnet. +set -ex ZEROMQ_VERSION=4.2.2 if [[ ! -f $DEPS_PATH/lib/libzmq.a ]]; then # Download and build zmq >&2 echo "Building zmq..." curl -s -L /~https://github.com/zeromq/libzmq/archive/v$ZEROMQ_VERSION.zip -o $DEPS_PATH/zeromq.zip unzip -q $DEPS_PATH/zeromq.zip -d $DEPS_PATH mkdir -p $DEPS_PATH/libzmq-$ZEROMQ_VERSION/build + pushd . cd $DEPS_PATH/libzmq-$ZEROMQ_VERSION/build cmake \ -D CMAKE_BUILD_TYPE=RELEASE \ -D CMAKE_INSTALL_PREFIX=$DEPS_PATH \ -D WITH_LIBSODIUM=OFF \ -D BUILD_SHARED_LIBS=OFF .. - make - make install - cp $DEPS_PATH/lib/x86_64-linux-gnu/libzmq.a $DEPS_PATH/lib/libzmq.a - cd - + $MAKE + $MAKE install + popd fi diff --git a/tools/license_header.py b/tools/license_header.py index 199d56c7e..b9acbf167 100755 --- a/tools/license_header.py +++ b/tools/license_header.py @@ -84,12 +84,16 @@ 'src/operator/nn/im2col.cuh', # Licenses in headers + 'src/operator/contrib/erfinv-inl.h', 'docs/_static/searchtools_custom.js', 'docs/_static/js/clipboard.js', 'docs/_static/js/clipboard.min.js', # Licensed under 2-Clause BSD in header 'example/ssd/dataset/pycocotools/coco.py', + + # Julia package metadata, generated by Pkg3.jl + 'julia/Project.toml', ] # language extensions and the corresponding comment mark diff --git a/tools/setup_gpu_build_tools.sh b/tools/setup_gpu_build_tools.sh index 44b44c574..167d4c6a6 100755 --- a/tools/setup_gpu_build_tools.sh +++ b/tools/setup_gpu_build_tools.sh @@ -246,6 +246,8 @@ if [[ ! -d $DEPS_PATH/usr/local/cuda-${CUDA_MAJOR_VERSION} ]]; then rm package.deb done + mkdir -p ${prefix}/include + mkdir -p ${prefix}/lib cp ${prefix}/usr/include/x86_64-linux-gnu/cudnn_v${LIBCUDNN_MAJOR}.h ${prefix}/include/cudnn.h ln -s libcudnn_static_v${LIBCUDNN_MAJOR}.a ${prefix}/usr/lib/x86_64-linux-gnu/libcudnn.a cp ${prefix}/usr/local/cuda-${CUDA_MAJOR_VERSION}/lib64/*.a ${prefix}/lib/ diff --git a/tools/staticbuild/README.md b/tools/staticbuild/README.md new file mode 100644 index 000000000..2def768a1 --- /dev/null +++ b/tools/staticbuild/README.md @@ -0,0 +1,32 @@ +# MXNet Static Build + +This folder contains the core scripts used to build the static library. This README explains how to use them. Please be aware that all of the scripts are designed to be run from the repository root. + +## `build.sh` +This script is a wrapper around `build_lib.sh`. It simplifies things by automatically identifying the system version, the number of cores, and all environment variable settings. Here are examples you can run with this script: + +``` +tools/staticbuild/build.sh cu92 maven +``` +This builds the mxnet package with CUDA 9.2 and Maven (Scala) build settings. +``` +tools/staticbuild/build.sh mkl pip +``` +This builds the mxnet package with MKLDNN and pip configuration settings.
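Once either command finishes and the resulting wheel is installed, a quick smoke test from Python can confirm that the statically linked binary is the one actually loaded; a minimal sketch (the specific checks are illustrative, not part of the build scripts):

```python
# Illustrative post-build smoke test; assumes the wheel produced by the
# pip target has been installed into the current Python environment.
import mxnet as mx
from mxnet import libinfo

print(mx.__version__)
print(libinfo.find_lib_path())  # path of the libmxnet.so actually loaded
print((mx.nd.ones((2, 2)) * 2).asnumpy())  # trivial compute to exercise the backend
```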
+
+As a result, you will have the complete static dependencies in `staticdeps` in the root folder, as well as a statically linked `libmxnet.so` in `lib`. You can then build your language binding against this `libmxnet.so`.
+
+## `build_lib.sh`
+This script clones the most up-to-date master and builds the MXNet backend as a static library. In order to run this script, you must set the following environment variables:
+
+- `DEPS_PATH`: path to your static dependencies
+- `STATIC_BUILD_TARGET`: either `pip` or `maven`, depending on your publish platform
+- `PLATFORM`: `linux` or `darwin`
+- `VARIANT`: `cpu`, `cu*`, `cu*mkl`, or `mkl`
+
+Running this script on its own is not recommended, since it depends on a number of environment variables being set beforehand.
+
+After running this script, you will have everything you need ready in the `lib` folder.
+
+## `build_wheel.sh`
+This script builds the Python package. It also runs a sanity test.
\ No newline at end of file
diff --git a/tools/staticbuild/build.sh b/tools/staticbuild/build.sh
new file mode 100755
index 000000000..4f8a78d70
--- /dev/null
+++ b/tools/staticbuild/build.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+if [ $# -lt 1 ]; then
+    >&2 echo "Usage: build.sh <variant> <target>"
+    exit 1
+fi
+
+export CURDIR=$PWD
+export DEPS_PATH=$PWD/staticdeps
+export VARIANT=$(echo $1 | tr '[:upper:]' '[:lower:]')
+export STATIC_BUILD_TARGET=$(echo $2 | tr '[:upper:]' '[:lower:]')
+export PLATFORM=$(uname | tr '[:upper:]' '[:lower:]')
+
+if [[ $VARIANT == darwin* ]]; then
+    export VARIANT="darwin"
+fi
+
+NUM_PROC=1
+if [[ ! -z $(command -v nproc) ]]; then
+    NUM_PROC=$(nproc)
+elif [[ ! -z $(command -v sysctl) ]]; then
+    NUM_PROC=$(sysctl -n hw.ncpu)
+else
+    >&2 echo "Can't discover number of cores."
+fi
+export NUM_PROC
+>&2 echo "Using $NUM_PROC parallel jobs for building."
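+
+# Compose the make command used for all dependency builds and the mxnet build itself:
+# always parallel over $NUM_PROC jobs, and quiet unless DEBUG=1 is set in the environment.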
+if [[ $DEBUG -eq 1 ]]; then
+    export ADD_MAKE_FLAG="-j $NUM_PROC"
+else
+    export ADD_MAKE_FLAG="--quiet -j $NUM_PROC"
+fi
+export MAKE="make $ADD_MAKE_FLAG"
+
+export CC="gcc -fPIC"
+export CXX="g++ -fPIC"
+export FC="gfortran"
+export PKG_CONFIG_PATH=$DEPS_PATH/lib/pkgconfig:$DEPS_PATH/lib64/pkgconfig:$DEPS_PATH/lib/x86_64-linux-gnu/pkgconfig:$PKG_CONFIG_PATH
+export CPATH=$DEPS_PATH/include:$CPATH
+
+if [[ $PLATFORM == 'linux' && $VARIANT == cu* ]]; then
+    source tools/setup_gpu_build_tools.sh $VARIANT $DEPS_PATH
+fi
+
+mkdir -p $DEPS_PATH
+
+# Build Dependencies
+source tools/dependencies/make_shared_dependencies.sh
+
+# Build mxnet
+source tools/staticbuild/build_lib.sh
diff --git a/tools/staticbuild/build_lib.sh b/tools/staticbuild/build_lib.sh
new file mode 100755
index 000000000..b08057321
--- /dev/null
+++ b/tools/staticbuild/build_lib.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script builds the libraries of mxnet.
+make_config=make/${STATIC_BUILD_TARGET}/${STATIC_BUILD_TARGET}_${PLATFORM}_${VARIANT}.mk
+if [[ ! -f $make_config ]]; then
+    >&2 echo "Couldn't find make config $make_config for the current settings."
+    exit 1
+fi
+
+>&2 echo "Now building mxnet modules..."
+cp $make_config config.mk
+
+git submodule update --init --recursive || true
+
+$MAKE DEPS_PATH=$DEPS_PATH DMLCCORE
+$MAKE DEPS_PATH=$DEPS_PATH $PWD/3rdparty/tvm/nnvm/lib/libnnvm.a
+$MAKE DEPS_PATH=$DEPS_PATH PSLITE
+
+if [[ $VARIANT == *mkl ]]; then
+    MKLDNN_LICENSE='license.txt'
+    if [[ $PLATFORM == 'linux' ]]; then
+        IOMP_LIBFILE='libiomp5.so'
+        MKLML_LIBFILE='libmklml_intel.so'
+        MKLDNN_LIBFILE='libmkldnn.so.0'
+    else
+        IOMP_LIBFILE='libiomp5.dylib'
+        MKLML_LIBFILE='libmklml.dylib'
+        MKLDNN_LIBFILE='libmkldnn.0.dylib'
+    fi
+    $MAKE DEPS_PATH=$DEPS_PATH mkldnn
+    cp 3rdparty/mkldnn/build/install/lib/$IOMP_LIBFILE lib
+    cp 3rdparty/mkldnn/build/install/lib/$MKLML_LIBFILE lib
+    cp 3rdparty/mkldnn/build/install/lib/$MKLDNN_LIBFILE lib
+    cp 3rdparty/mkldnn/build/install/$MKLDNN_LICENSE lib
+    cp 3rdparty/mkldnn/LICENSE ./MKLML_LICENSE
+fi
+
+>&2 echo "Now building mxnet..."
+$MAKE DEPS_PATH=$DEPS_PATH
+
+if [[ $PLATFORM == 'linux' ]]; then
+    cp -L /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.so lib/libgfortran.so.3
+    cp -L /usr/lib/x86_64-linux-gnu/libquadmath.so.0 lib/libquadmath.so.0
+fi
+
+# Print the linked objects on libmxnet.so
+>&2 echo "Checking linked objects on libmxnet.so..."
+if [[ ! -z $(command -v readelf) ]]; then
+    readelf -d lib/libmxnet.so
+    strip --strip-unneeded lib/libmxnet.so
+elif [[ ! -z $(command -v otool) ]]; then
+    otool -L lib/libmxnet.so
+    strip -u -r -x lib/libmxnet.so
+else
+    >&2 echo "Neither readelf nor otool is available; skipping the link check."
+fi
+
+ln -s staticdeps/ deps
diff --git a/tools/staticbuild/build_wheel.sh b/tools/staticbuild/build_wheel.sh
new file mode 100755
index 000000000..9c1803b3d
--- /dev/null
+++ b/tools/staticbuild/build_wheel.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This script builds the wheel for binary distribution and performs a sanity check.
+echo $(git rev-parse HEAD) >> python/mxnet/COMMIT_HASH
+cd python/
+
+# Make wheel for testing
+python setup.py bdist_wheel
+
+wheel_name=$(ls -t dist | head -n 1)
+pip install -U --user --force-reinstall dist/$wheel_name
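+
+# Sanity check (an illustrative sketch; this exact test is an assumption, not part of the
+# original script): import the freshly installed wheel and run a trivial NDArray computation.
+python -c "import mxnet as mx; print((mx.nd.ones((2, 3)) + 1).asnumpy())"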