diff --git a/docs/api/python/symbol/symbol.md b/docs/api/python/symbol/symbol.md
index 9cab2c59e862..fea746bb02f4 100644
--- a/docs/api/python/symbol/symbol.md
+++ b/docs/api/python/symbol/symbol.md
@@ -337,6 +337,7 @@ Composite multiple symbols into a new one by an operator.
     :nosignatures:
 
     Symbol.infer_type
+    Symbol.infer_type_partial
     Symbol.infer_shape
     Symbol.infer_shape_partial
 ```
diff --git a/docs/install/index.md b/docs/install/index.md
index 76f22e744075..2b7e0457e2b4 100644
--- a/docs/install/index.md
+++ b/docs/install/index.md
@@ -1188,7 +1188,7 @@ MXNet should work on any cloud provider's CPU-only instances. Follow the Python
 <div class="devices">
   <div class="raspberry-pi">
 
-MXNet supports the Debian based Raspbian ARM based operating system so you can run MXNet on Raspberry Pi Devices.
+MXNet supports the Debian based Raspbian ARM based operating system so you can run MXNet on Raspberry Pi 3B devices.
 
 These instructions will walk through how to build MXNet for the Raspberry Pi and install the Python bindings for the library.
 
@@ -1196,6 +1196,9 @@ You can do a dockerized cross compilation build on your local machine or a nativ
 
 The complete MXNet library and its requirements can take almost 200MB of RAM, and loading large models with the library can take over 1GB of RAM. Because of this, we recommend running MXNet on the Raspberry Pi 3 or an equivalent device that has more than 1 GB of RAM and a Secure Digital (SD) card that has at least 4 GB of free memory.
 
+## Quick installation
+You can use this [pre-built Python wheel](https://mxnet-public.s3.amazonaws.com/install/raspbian/mxnet-1.5.0-py2.py3-none-any.whl) on a Raspberry Pi 3B with Stretch. You will likely need to install several dependencies to get MXNet to work. Refer to the following **Build** section for details.
+
 **Cross compilation build (Experimental)**
 
 ## Docker installation
@@ -1222,11 +1225,48 @@ ci/build.py -p armv7
 
 ## Install
 
-Create a virtualenv and install the package we created previously.
+Your Pi will need several dependencies.
+
+Install MXNet dependencies with the following:
+```
+sudo apt-get update
+sudo apt-get install -y \
+    apt-transport-https \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    git \
+    libatlas-base-dev \
+    libcurl4-openssl-dev \
+    libjemalloc-dev \
+    liblapack-dev \
+    libopenblas-dev \
+    libopencv-dev \
+    libzmq3-dev \
+    ninja-build \
+    python-dev \
+    software-properties-common \
+    sudo \
+    unzip \
+    virtualenv \
+    wget
+```
+Install virtualenv with:
+```
+sudo pip install virtualenv
+```
+Create a Python 2.7 environment for MXNet with:
+```
+virtualenv -p `which python` mxnet_py27
+```
+You may use Python 3, however the [wine bottle detection example](https://mxnet.incubator.apache.org/versions/master/tutorials/embedded/wine_detector.html) for the Pi with camera requires Python 2.7.
+
+Create a virtualenv and install the wheel we created previously, or the wheel that you downloaded.
 
 ```
-virtualenv -p `which python3` mxnet_py3
-source mxnet_py3/bin/activate
+virtualenv -p `which python` mxnet_py27
+source mxnet_py27/bin/activate
 pip install mxnet-x.x.x-py2.py3-none-any.whl
 ```
 
@@ -1257,7 +1297,7 @@ Install these dependencies using the following commands in any directory:
 
 ```
     sudo apt-get update
-    sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev virtualenv
+    sudo apt-get -y install git cmake ninja-build build-essential g++-4.9 c++-4.9 liblapack* libblas* libopencv* libopenblas* python3-dev python-dev virtualenv
 ```
 
 Clone the MXNet source code repository using the following `git` command in your home directory:
diff --git a/docs/install/raspbian_setup.md b/docs/install/raspbian_setup.md
index a432d4815662..896d4721370b 100644
--- a/docs/install/raspbian_setup.md
+++ b/docs/install/raspbian_setup.md
@@ -17,9 +17,9 @@
 
 <!-- This page should be deleted after sometime (Allowing search engines
 to update links) -->
-<meta http-equiv="refresh" content="3; url=http://mxnet.io/install/index.html" />
+<meta http-equiv="refresh" content="3; url=https://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Devices&language=Python&processor=CPU" />
 <!-- Just in case redirection does not work -->
 <p>
-  <a href="http://mxnet.io/install/index.html">
+  <a href="https://mxnet.incubator.apache.org/versions/master/install/index.html?platform=Devices&language=Python&processor=CPU">
     This content is moved to a new MXNet install page. Redirecting... </a>
 </p>
diff --git a/docs/install/scala_setup.md b/docs/install/scala_setup.md
index bc069a14e6b3..15a2def1ef38 100644
--- a/docs/install/scala_setup.md
+++ b/docs/install/scala_setup.md
@@ -20,6 +20,7 @@
 The following instructions are provided for macOS and Ubuntu. Windows is not yet available.
 
 **Note:** If you use IntelliJ or a similar IDE, you may want to follow the [MXNet-Scala on IntelliJ tutorial](../tutorials/scala/mxnet_scala_on_intellij.html) instead of these instructions.
+**Note:** Currently, only Scala 2.11 is supported.
 
 <hr>
 
@@ -114,6 +115,33 @@ mvn install
 
 <hr>
 
+## Interpreter
+
+To run the Scala interpreter, first download and install Scala 2.11.x (run `scala -version` to make sure you have the right version installed).
+
+### Installing the Interpreter
+
+**Ubuntu**
+
+```
+sudo apt-get install scala
+```
+
+**macOS**
+
+```
+brew install scala@2.11
+```
+
+Then, add scala to your path by following the instructions output by homebrew.
+
+### Running the Interpreter
+
+To run the interpreter, download the appropriate mxnet jar from [the maven repository](https://search.maven.org/search?q=g:org.apache.mxnet) or build from source following the instructions above.
+
+Then, run `scala -cp {path/to/mxnet-full_2.11-os-version.jar}` to start it.
+If you receive a "NumberFormatException" when running the interpreter, run `export TERM=xterm-color` before starting the interpreter.
+
 ## Documentation
 
 Scaladocs are generated as part of the docs build pipeline. You can find them published in the [Scala API](http://mxnet.incubator.apache.org/api/scala/index.html) section of the website or by going to the [scaladocs output](https://mxnet.incubator.apache.org/api/scala/docs/index.html#org.apache.mxnet.package) directly.
diff --git a/docs/tutorials/scala/char_lstm.md b/docs/tutorials/scala/char_lstm.md
index 972661bc81ef..aca08dc79920 100644
--- a/docs/tutorials/scala/char_lstm.md
+++ b/docs/tutorials/scala/char_lstm.md
@@ -71,11 +71,7 @@ In this tutorial, you will accomplish the following:
 
 ## Prerequisites
 
-To complete this tutorial, you need:
-
-- MXNet. See the instructions for your operating system in [Setup and Installation](http://mxnet.io/install/index.html)
-- [Scala 2.11.8](https://www.scala-lang.org/download/2.11.8.html)
-- [Maven 3](https://maven.apache.org/install.html)
+To complete this tutorial, set up and run the Scala interpreter by following the [instructions](https://mxnet.incubator.apache.org/install/scala_setup.html#interpreter).
 
 ## Download the Data
 
diff --git a/example/captcha/mxnet_captcha.R b/example/captcha/mxnet_captcha.R
index 8988d25104d5..43e819f8c264 100644
--- a/example/captcha/mxnet_captcha.R
+++ b/example/captcha/mxnet_captcha.R
@@ -39,9 +39,9 @@ label <- mx.symbol.Reshape(data = label, target_shape = c(0))
 captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax")
 
 mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) {
-    ypred <- max.col(t(pred)) - 1
+    ypred <- max.col(t(data.matrix(pred))) - 1
     ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE)
-    return(sum(colSums(label == ypred) == 4) / ncol(label))
+    return(sum(colSums(data.matrix(label) == ypred) == 4) / ncol(label))
   })
 
 data.shape <- c(80, 30, 3)
diff --git a/example/gan/CGAN_mnist_R/README.md b/example/gan/CGAN_mnist_R/README.md
new file mode 100644
index 000000000000..bf0bb08b1147
--- /dev/null
+++ b/example/gan/CGAN_mnist_R/README.md
@@ -0,0 +1,168 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Conditional Generative Adversarial Network with MXNet R package 
+
+This tutorial shows how to build and train a Conditional Generative Adversarial Network (CGAN) on MNIST images.
+
+## How GAN works
+A Generative Adversarial Model simultaneously trains two models: a generator that learns to output fake samples from an unknown distribution and a discriminator that learns to distinguish fake from real samples.
+
+The CGAN is a conditional variation of the GAN where the generator is instructed to generate a real sample having specific characteristics rather than a generic sample from full distribution. Such condition could be the label associated with an image like in this tutorial or a more detailed tag as shown in the example below:
+
+![Image credit: Scott Reed (https://github.com/reedscot/icml2016)](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/dcgan_network.jpg)
+
+## Initial setup
+
+The following packages are needed to run the tutorial:
+
+```
+require("imager")
+require("dplyr")
+require("readr")
+require("mxnet")
+```
+
+The full demo is comprised of the two following scripts:
+
+```CGAN_mnist_setup.R```: prepare data and define the model structure
+```CGAN_train.R```: execute the training
+
+## Data preparation
+
+The MNIST dataset is available [here](https://www.kaggle.com/c/digit-recognizer/data). Once train.csv is downloaded into the data/ folder, we can import it into R.
+
+```train <- read_csv('data/train.csv')
+train <- data.matrix(train)
+
+train_data <- train[,-1]
+train_data <- t(train_data/255*2-1)
+train_label <- as.integer(train[,1])
+
+dim(train_data) <- c(28, 28, 1, ncol(train_data))
+```
+Custom iterators are defined in ```iterators.R``` and imported by ```CGAN_mnist_setup.R```
+
+## Generator
+The generator is a network that creates novel samples (MNIST images) from 2 inputs:
+
+- Noise vector
+- Labels defining the object condition (which digit to produce)
+
+The noise vector provides the building blocks to the Generator model, which will learn how to structure that noise into a sample. The mx.symbol.Deconvolution operator is used to upsample the initial input from a 1x1 shape up to a 28x28 image.
+
+The information on the label for which to generate a fake sample is provided by a one-hot encoding of the label indices that is appended to the random noise. For MNIST, the 0-9 indices are therefore converted into a binary vector of length 10. More complex applications would require embeddings rather than simple one-hot to encode the condition.
+
+## Discriminator
+The discriminator attempts to distinguish between fake samples produced by the generator and real ones sampled from MNIST training data.
+
+In a conditional GAN, the labels associated with the samples are also provided to the Discriminator. In this demo, this information is again provided as a one-hot encoding of the label that is broadcast to match the image dimensions (10 -> 28x28x10).
+
+## Training logic
+The training process of the discriminator is the most obvious: the loss is simply a binary TRUE/FALSE response and that loss is propagated back into the CNN network. It can therefore be understood as a simple binary classification problem.
+
+```### Train loop on fake
+mx.exec.update.arg.arrays(exec_D, arg.arrays = 
+  list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(0, batch_size))), 
+  match.name=TRUE)
+mx.exec.forward(exec_D, is.train=T)
+mx.exec.backward(exec_D)
+update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
+mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE)
+
+### Train loop on real
+mx.exec.update.arg.arrays(exec_D, arg.arrays = 
+  list(data=D_data_real, digit=D_digit_real, label=mx.nd.array(rep(1, batch_size))), 
+  match.name=TRUE)
+mx.exec.forward(exec_D, is.train=T)
+mx.exec.backward(exec_D)
+update_args_D<- updater_D(weight = exec_D$ref.arg.arrays, grad = exec_D$ref.grad.arrays)
+mx.exec.update.arg.arrays(exec_D, update_args_D, skip.null=TRUE)
+```
+
+The generator loss comes from the backpropagation of the discriminator loss into its generated output. By faking the generator labels to be real samples into the discriminator, the discriminator back-propagated loss provides the generator with the information on how to best adapt its parameters to trick the discriminator into believing the fake samples are real.
+
+This requires backpropagating the gradients up to the input data of the discriminator (whereas this input gradient is typically ignored in a vanilla feedforward network).
+
+```### Update Generator weights - use a seperate executor for writing data gradients
+exec_D_back <- mxnet:::mx.symbol.bind(symbol = D_sym, 
+  arg.arrays = exec_D$arg.arrays, 
+  aux.arrays = exec_D$aux.arrays, grad.reqs = rep("write", length(exec_D$arg.arrays)), 
+  ctx = devices)
+
+mx.exec.update.arg.arrays(exec_D_back, arg.arrays = 
+  list(data=D_data_fake, digit=D_digit_fake, label=mx.nd.array(rep(1, batch_size))), 
+  match.name=TRUE)
+mx.exec.forward(exec_D_back, is.train=T)
+mx.exec.backward(exec_D_back)
+D_grads <- exec_D_back$ref.grad.arrays$data
+mx.exec.backward(exec_G, out_grads=D_grads)
+
+update_args_G <- updater_G(weight = exec_G$ref.arg.arrays, grad = exec_G$ref.grad.arrays)
+mx.exec.update.arg.arrays(exec_G, update_args_G, skip.null=TRUE)
+```
+
+The above training steps are executed in the ```CGAN_train.R``` script.
+
+## Monitor the training
+
+During training, the [imager](http://dahtah.github.io/imager/) package facilitates the visual quality assessment of the fake samples.
+
+```if (iteration==1 | iteration %% 100==0){
+  par(mfrow=c(3,3), mar=c(0.1,0.1,0.1,0.1))
+  for (i in 1:9) {
+    img <- as.array(exec_G$ref.outputs$G_sym_output)[,,,i]
+    plot(as.cimg(img), axes=F)
+  }
+}
+```
+Below are samples obtained at different stages of the training.
+
+Starting from noise:
+
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_1.png)
+
+Slowly getting it - iteration 200:
+
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_200.png)
+
+Generate specified digit images on demand - iteration 2400:
+
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_2400.png)
+
+## Inference
+
+Once the model is trained, synthetic images of the desired digit can be produced by feeding the generator with fixed labels rather than the randomly generated ones used during the training.
+
+Here we will generate fake ```9```:
+
+```digit <- mx.nd.array(rep(9, times=batch_size))
+data <- mx.nd.one.hot(indices = digit, depth = 10)
+data <- mx.nd.reshape(data = data, shape = c(1,1,-1, batch_size))
+
+exec_G <- mx.simple.bind(symbol = G_sym, data=data_shape_G, ctx = devices, grad.req = "null")
+mx.exec.update.arg.arrays(exec_G, G_arg_params, match.name=TRUE)
+mx.exec.update.arg.arrays(exec_G, list(data=data), match.name=TRUE)
+mx.exec.update.aux.arrays(exec_G, G_aux_params, match.name=TRUE)
+
+mx.exec.forward(exec_G, is.train=F)
+```
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gan/CGAN_mnist_R/CGAN_infer_9.png)
+
+Further details of the CGAN methodology can be found in the paper [Generative Adversarial Text to Image Synthesis](https://arxiv.org/abs/1605.05396).
+
+
diff --git a/example/gluon/actor_critic/README.md b/example/gluon/actor_critic/README.md
new file mode 100644
index 000000000000..7f3a6a73e972
--- /dev/null
+++ b/example/gluon/actor_critic/README.md
@@ -0,0 +1,61 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Actor Critic Model
+
+This example shows an actor critic model that consists of a critic that measures how good an action taken is and an actor that controls the agent's behavior. 
+In our example actor and critic use the same model:
+
+```
+class Policy(gluon.Block):
+    def __init__(self, **kwargs):
+        super(Policy, self).__init__(**kwargs)
+        with self.name_scope():
+            self.dense = nn.Dense(16, in_units=4, activation='relu')
+            self.action_pred = nn.Dense(2, in_units=16)
+            self.value_pred = nn.Dense(1, in_units=16)
+
+    def forward(self, x):
+        x = self.dense(x)
+        probs = self.action_pred(x)
+        values = self.value_pred(x)
+        return F.softmax(probs), values
+```
+The example uses [Gym](https://gym.openai.com/docs/), which is a toolkit for developing and comparing reinforcement learning algorithms. The model is running an instance of [CartPole-v0](https://gym.openai.com/envs/CartPole-v0/) that simulates a pole that is attached by an un-actuated joint to a cart, which moves along a frictionless track. The goal is to prevent it from falling over. 
+
+
+The example provides the following commandline options:
+```
+MXNet actor-critic example
+
+optional arguments:
+  -h, --help        show this help message and exit
+  --gamma G         discount factor (default: 0.99)
+  --seed N          random seed (default: 1)
+  --render          render the environment
+  --log-interval N  interval between training status logs (default: 10)
+
+```
+
+To run the model, execute:
+```
+python actor_critic.py --render
+```
+
+You will get an output like the following:
+![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gluon/actor_critic/actor_critic.gif)
+
diff --git a/example/gluon/audio/README.md b/example/gluon/audio/README.md
new file mode 100644
index 000000000000..cb2b53eb3b83
--- /dev/null
+++ b/example/gluon/audio/README.md
@@ -0,0 +1,115 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Urban Sounds Classification in MXNet Gluon
+
+This example provides an end-to-end pipeline for a common datahack competition - [Urban Sounds Classification Example](https://datahack.analyticsvidhya.com/contest/practice-problem-urban-sound-classification/).
+
+After logging in, the data set can be downloaded.
+The details of the dataset and the link to download it are given below:
+
+
+## Urban Sounds Dataset:
+### Description
+  The dataset contains 8732 wav files which are audio samples (<= 4s) of street sounds like engine_idling, car_horn, children_playing, dog_barking and so on.
+  The task is to classify these audio samples into one of the following 10 labels:
+  ```
+  siren,
+  street_music,
+  drilling,
+  dog_bark,
+  children_playing,
+  gun_shot,
+  engine_idling,
+  air_conditioner,
+  jackhammer,
+  car_horn
+  ```
+
+To be able to run this example:
+
+1. `pip install -r requirements.txt`
+
+    If you are in the directory where the requirements.txt file lies,
+    this step installs the required libraries to run the example.
+    The main dependency that is required is: Librosa. 
+    The version used to test the example is: `0.6.2`
+    For more details, refer here:
+https://librosa.github.io/librosa/install.html
+
+2. Download the dataset(train.zip, test.zip) required for this example from the location:
+https://drive.google.com/drive/folders/0By0bAi7hOBAFUHVXd1JCN3MwTEU
+
+3. Extract both the zip archives into the **current directory** - after unzipping you would get 2 new folders namely,
+   **Train** and **Test** and two csv files - **train.csv**, **test.csv**
+
+   Assuming you are in a directory *"UrbanSounds"*, after downloading and extracting train.zip, the folder structure should be:
+   
+   ```
+        UrbanSounds        
+                    - Train
+                        - 0.wav, 1.wav ...
+                    - train.csv
+                    - train.py
+                    - predict.py ...
+    ```
+
+4. Apache MXNet is installed on the machine. For instructions, go to the link: https://mxnet.incubator.apache.org/install/
+
+
+
+For information on the current design of how the AudioFolderDataset is implemented, refer below:
+https://cwiki.apache.org/confluence/display/MXNET/Gluon+-+Audio
+
+### Usage 
+
+For training:
+
+- Arguments
+  - train : The folder/directory that contains the audio(wav) files locally. Default = "./Train"
+  - csv: The file name of the csv file that contains audio file name to label mapping. Default = "train.csv"
+  - epochs : Number of epochs to train the model. Default = 30
+  - batch_size : The batch size for training. Default = 32
+
+
+###### To use the default arguments, use:
+```
+python train.py
+``` 
+or
+
+###### To pass command-line arguments for training data directory, epochs, batch_size, csv file name, use :
+```
+python train.py --train ./Train --csv train.csv --batch_size 32 --epochs 30 
+```
+
+For prediction:
+
+- Arguments
+  - pred : The folder/directory that contains the audio(wav) files which are to be classified. Default = "./Test"
+
+
+###### To use the default arguments, use:
+```
+python predict.py
+``` 
+or
+
+###### To pass command-line arguments for test data directory, use :
+```
+python predict.py --pred ./Test
+```
diff --git a/example/gluon/house_prices/README.md b/example/gluon/house_prices/README.md
new file mode 100644
index 000000000000..1393a0e3869d
--- /dev/null
+++ b/example/gluon/house_prices/README.md
@@ -0,0 +1,29 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# House Prices: Advanced Regression Techniques
+
+This example shows how to predict house prices and it is based on the [House Price Kaggle challenge](https://www.kaggle.com/c/house-prices-advanced-regression-techniques#description)
+
+First you need to download train and test data set from here:
+```
+https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/train.csv
+https://www.kaggle.com/c/house-prices-advanced-regression-techniques/download/test.csv
+```
+Afterwards you can execute the script with  ```python kaggle_k_fold_cross_validation.py```
+
+For a detailed explanation of the code, you can check out this [chapter](http://d2l.ai/chapter_deep-learning-basics/kaggle-house-price.html) of the Dive into Deep Learning book.
diff --git a/example/gluon/lstm_crf/README.md b/example/gluon/lstm_crf/README.md
new file mode 100644
index 000000000000..519c3b89f9fd
--- /dev/null
+++ b/example/gluon/lstm_crf/README.md
@@ -0,0 +1,36 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# BiLSTM CRF model
+This example demonstrates how a [BiLSTM-CRF model](https://arxiv.org/pdf/1508.01991v1.pdf) can be implemented in Gluon to perform noun-phrase chunking as a sequence labeling task.  In this example we define the following training sample:
+```
+georgia tech is a university in georgia
+B I O O O O B
+```
+The second line is the IOB representation of the above sentence that is learnt by the model. **I** stands for inside a chunk, **O** for outside a chunk and **B** for the beginning of a chunk.
+
+The model consists of an LSTM layer with 2 hidden units and a CRF layer. The CRF layer has a state transition matrix which allows it to take past and future tags into account when predicting the current tag. The bidirectional LSTM is reading the word sequence from beginning to end and vice versa. It produces a vector representation for the words. The following image is taken from https://arxiv.org/pdf/1508.01991v1.pdf and shows the model architecture:
+
+![Image taken from https://arxiv.org/pdf/1508.01991v1.pdf](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gluon/lstm_crf/bi-lstm_crf.png)
+
+You can run the example by executing 
+```
+python lstm_crf.py
+```
+The example code does not take any commandline arguments. If you want to change the number of hidden units or the size of vectors embeddings, then you need to change the variables ```EMBEDDING_DIM``` and ```HIDDEN_DIM```.
+
+
diff --git a/example/gluon/mnist/README.md b/example/gluon/mnist/README.md
new file mode 100644
index 000000000000..c053364fad3c
--- /dev/null
+++ b/example/gluon/mnist/README.md
@@ -0,0 +1,55 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# MNIST classification example
+
+This script shows a simple example of how to do image classification with Gluon.
+The model is trained on MNIST digits image dataset and the goal is to classify the digits ```0-9```.  The model has the following layout:
+```
+net = nn.Sequential()
+with net.name_scope():
+    net.add(nn.Dense(128, activation='relu'))
+    net.add(nn.Dense(64, activation='relu'))
+    net.add(nn.Dense(10))
+```
+
+The script provides the following commandline arguments: 
+
+
+```
+MXNet Gluon MNIST Example
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --batch-size BATCH_SIZE
+                        batch size for training and testing (default: 100)
+  --epochs EPOCHS       number of epochs to train (default: 10)
+  --lr LR               learning rate (default: 0.1)
+  --momentum MOMENTUM   SGD momentum (default: 0.9)
+  --cuda                Train on GPU with CUDA
+  --log-interval N      how many batches to wait before logging training
+                        status
+```
+
+After one epoch we get the following output vector for the given test image:
+
+<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/example/gluon/mnist/test_image.png" width="250" height="250">
+
+[-5.461655  -4.745     -1.8203478 -0.5705207  8.923972  -2.2358544 -3.3020825 -2.409004   4.0074944 10.362008] 
+
+As we can see the highest activation is 10.362 which corresponds to label `9`.
+
diff --git a/example/gluon/super_resolution/README.md b/example/gluon/super_resolution/README.md
new file mode 100644
index 000000000000..ddcbe8b0a202
--- /dev/null
+++ b/example/gluon/super_resolution/README.md
@@ -0,0 +1,45 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Superresolution
+
+This example trains a convolutional neural network to enhance the resolution of images (also known as super-resolution).
+The script takes the following command-line arguments:
+
+```
+Super-resolution using an efficient sub-pixel convolution neural network.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --upscale_factor UPSCALE_FACTOR
+                        super resolution upscale factor. default is 3.
+  --batch_size BATCH_SIZE
+                        training batch size, per device. default is 4.
+  --test_batch_size TEST_BATCH_SIZE
+                        test batch size
+  --epochs EPOCHS       number of training epochs
+  --lr LR               learning Rate. default is 0.001.
+  --use-gpu             whether to use GPU.
+  --seed SEED           random seed to use. Default=123
+  --resolve_img RESOLVE_IMG
+                        input image to use
+```
+
+Once the network is trained, you can use the following command to increase the resolution of your image:
+```
+python  super_resolution.py --resolve_img myimage.jpg
+```
diff --git a/example/gluon/data.py b/example/gluon/super_resolution/data.py
similarity index 100%
rename from example/gluon/data.py
rename to example/gluon/super_resolution/data.py
diff --git a/example/model-parallel/README.md b/example/model-parallel/README.md
new file mode 100644
index 000000000000..537562070a62
--- /dev/null
+++ b/example/model-parallel/README.md
@@ -0,0 +1,20 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Run parts of a model on different devices
+
+This folder contains the example [matrix_factorization](https://github.com/apache/incubator-mxnet/tree/master/example/model-parallel/matrix_factorization) that demonstrates the basic usage of `group2ctxs`.
diff --git a/example/sparse/README.md b/example/sparse/README.md
new file mode 100644
index 000000000000..8f1302950d22
--- /dev/null
+++ b/example/sparse/README.md
@@ -0,0 +1,24 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Examples using Sparse Symbol API
+This folder contains examples that demonstrate the usage of the [Sparse Symbol API](https://mxnet.incubator.apache.org/api/python/symbol/sparse.html).
+- [Factorization Machine](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/factorization_machine) uses sparse weights
+- [Linear Classification Using Sparse Matrix Multiplication](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification) shows how to use a sparse data loader, sparse dot operator and sparse gradient updaters
+- [Matrix Factorization w/ Sparse Embedding](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/matrix_factorization) uses sparse weights
+- [Wide and Deep Learning](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/wide_deep) shows how to run sparse wide and deep classification
+
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 13ee903407b3..76a4995d15c0 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -1562,6 +1562,38 @@ MXNET_DLL int MXSymbolInferType(SymbolHandle sym,
                                 const int **aux_type_data,
                                 int *complete);
 
+/*!
+ * \brief partially infer type of unknown input types given the known one.
+ *
+ *  Return partially inferred results if not all types could be inferred.
+ *  The types are passed in arg_type_data; NOTE(review): this signature has no arg_ind_ptr, confirm the layout.
+ *  The call will be treated as a kwargs call if keys != nullptr or num_args==0, otherwise it is positional.
+ *
+ * \param sym symbol handle
+ * \param num_args number of input arguments.
+ * \param keys the key of keyword args (optional)
+ * \param arg_type_data the content of the CSR
+ * \param in_type_size sizeof the returning array of in_types
+ * \param in_type_data returning array of pointers to head of the input type.
+ * \param out_type_size sizeof the returning array of out_types
+ * \param out_type_data returning array of pointers to head of the output type.
+ * \param aux_type_size sizeof the returning array of aux_types
+ * \param aux_type_data returning array of pointers to head of the auxiliary type.
+ * \param complete whether infer type completes or more information is needed.
+ * \return 0 when success, -1 when failure happens
+ */
+MXNET_DLL int MXSymbolInferTypePartial(SymbolHandle sym,
+                                       mx_uint num_args,
+                                       const char** keys,
+                                       const int *arg_type_data,
+                                       mx_uint *in_type_size,
+                                       const int **in_type_data,
+                                       mx_uint *out_type_size,
+                                       const int **out_type_data,
+                                       mx_uint *aux_type_size,
+                                       const int **aux_type_data,
+                                       int *complete);
+
 /*!
  * \brief Convert a symbol into a quantized symbol where FP32 operators are replaced with INT8
  * \param sym_handle symbol to be converted
diff --git a/perl-package/AI-MXNet/Changes b/perl-package/AI-MXNet/Changes
index 8bd43f3be205..e67fe39b49ab 100644
--- a/perl-package/AI-MXNet/Changes
+++ b/perl-package/AI-MXNet/Changes
@@ -1,5 +1,11 @@
 Revision history for Perl extension AI::MXNet
 
+1.4     Mon Feb 18 11:54:07 PST 2019
+        - Two more gluon loss classes
+        - Visualization fixes
+        - Gluon rnn rework, including hybridization
+        - Exposed GPU memory info to perl level.
+
 1.33    Thu Oct  4 13:25:56 PDT 2018
         - Added randn function.
         - Internal SELU function on C++ layer.
diff --git a/perl-package/AI-MXNet/META.json b/perl-package/AI-MXNet/META.json
index bbbea734ccf8..37c573c279f5 100644
--- a/perl-package/AI-MXNet/META.json
+++ b/perl-package/AI-MXNet/META.json
@@ -30,7 +30,7 @@
       },
       "runtime" : {
          "requires" : {
-            "AI::MXNetCAPI" : "1.33",
+            "AI::MXNetCAPI" : "1.4",
             "AI::NNVMCAPI" : "1.3",
             "Function::Parameters" : "1.0705",
             "Hash::Ordered" : "0.012",
@@ -45,5 +45,5 @@
       }
    },
    "release_status" : "stable",
-   "version" : "1.33"
+   "version" : "1.4"
 }
diff --git a/perl-package/AI-MXNet/META.yml b/perl-package/AI-MXNet/META.yml
index 26e37b572600..692ca0307948 100644
--- a/perl-package/AI-MXNet/META.yml
+++ b/perl-package/AI-MXNet/META.yml
@@ -34,7 +34,7 @@ no_index:
     - t
     - inc
 requires:
-  AI::MXNetCAPI: '1.33'
+  AI::MXNetCAPI: '1.4'
   AI::NNVMCAPI: '1.3'
   Function::Parameters: '1.0705'
   Hash::Ordered: '0.012'
@@ -42,4 +42,4 @@ requires:
   Mouse: v2.1.0
   PDL: '2.007'
   PDL::CCS: '1.23.4'
-version: '1.33'
+version: '1.4'
diff --git a/perl-package/AI-MXNet/Makefile.PL b/perl-package/AI-MXNet/Makefile.PL
index 6d70b21344c2..19aba3fee4a5 100644
--- a/perl-package/AI-MXNet/Makefile.PL
+++ b/perl-package/AI-MXNet/Makefile.PL
@@ -36,7 +36,7 @@ my %WriteMakefileArgs = (
   "LICENSE" => "apache_2_0",
   "NAME" => "AI::MXNet",
   "PREREQ_PM" => {
-    "AI::MXNetCAPI" => "1.33",
+    "AI::MXNetCAPI" => "1.4",
     "AI::NNVMCAPI" => "1.3",
     "Function::Parameters" => "1.0705",
     "Hash::Ordered" => "0.012",
@@ -46,7 +46,7 @@ my %WriteMakefileArgs = (
     "GraphViz" => "2.14"
   },
   "TEST_REQUIRES" => {},
-  "VERSION" => "1.33",
+  "VERSION" => "1.4",
   "test" => {
     "TESTS" => "t/*.t"
   }
diff --git a/perl-package/AI-MXNet/README b/perl-package/AI-MXNet/README
index f370db3804e9..4935b6384071 100644
--- a/perl-package/AI-MXNet/README
+++ b/perl-package/AI-MXNet/README
@@ -1,5 +1,5 @@
 This archive contains the distribution AI-MXNet,
-version 1.33:
+version 1.4:
 
   Perl interface to MXNet machine learning library
 
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm
index 6a559a394a9f..80699b14311c 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet.pm
@@ -51,7 +51,7 @@ use AI::MXNet::Gluon;
 use AI::MXNet::NDArray::Sparse;
 use AI::MXNet::Symbol::Sparse;
 use AI::MXNet::Engine;
-our $VERSION = '1.33';
+our $VERSION = '1.4';
 
 sub import
 {
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
index 826e7baf905b..7ae99be7b99e 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Context.pm
@@ -190,6 +190,30 @@ method num_gpus()
     return scalar(check_call(AI::MXNetCAPI::GetGPUCount()));
 }
 
+=head2 gpu_memory_info
+
+    Query CUDA for the free and total bytes of GPU global memory.
+
+    Parameters
+    ----------
+    $device_id=0 : int, optional
+        The device id of the GPU device.
+
+    Raises
+    ------
+    Will raise an exception on any CUDA error.
+
+    Returns
+    -------
+    ($free, $total) : (int, int)
+        Free and total memory in bytes.
+=cut
+
+method gpu_memory_info($device_id=0)
+{
+    return check_call(AI::MXNetCAPI::GetGPUMemoryInformation64($device_id)); # check_call's result is handed back to the caller unchanged
+}
+
 method current_ctx()
 {
     return $AI::MXNet::current_ctx;
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
index 7dea68ffa16d..3eb62eb5a2ef 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/Loss.pm
@@ -824,4 +824,175 @@ method hybrid_forward(
 
 __PACKAGE__->register('AI::MXNet::Gluon::Loss');
 
+package AI::MXNet::Gluon::PoissonNLLLoss;
+use AI::MXNet::Gluon::Mouse;
+extends 'AI::MXNet::Gluon::Loss';
+has 'from_logits'  => (is => 'ro', isa => 'Bool', default => 1);
+has 'compute_full' => (is => 'ro', isa => 'Bool', default => 0);
+
+=head1 NAME
+
+    AI::MXNet::Gluon::PoissonNLLLoss
+=cut
+
+=head1 DESCRIPTION
+
+    For a target (Random Variable) in a Poisson distribution, the function calculates the Negative
+    Log likelihood loss.
+    PoissonNLLLoss measures the loss accrued from a poisson regression prediction made by the model.
+
+    .. math::
+        L = \text{pred} - \text{target} * \log(\text{pred}) +\log(\text{target!})
+
+    `pred`, `target` can have arbitrary shape as long as they have the same number of elements.
+
+    Parameters
+    ----------
+    from_logits : boolean, default True
+        indicating whether log(predicted) value has already been computed. If True, the loss is computed as
+        :math:`\exp(\text{pred}) - \text{target} * \text{pred}`, and if False, then loss is computed as
+        :math:`\text{pred} - \text{target} * \log(\text{pred}+\text{epsilon})`. The default value is True.
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents mini-batch.
+    compute_full: boolean, default False
+        Indicates whether to add an approximation(Stirling factor) for the Factorial term in the formula for the loss.
+        The Stirling factor is:
+        :math:`\text{target} * \log(\text{target}) - \text{target} + 0.5 * \log(2 * \pi * \text{target})`
+    epsilon: float, default 1e-08
+        This is to avoid calculating log(0) which is not defined.
+
+    Inputs:
+        - **pred**:   Predicted value
+        - **target**: Random variable(count or number) which belongs to a Poisson distribution.
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as pred. For example, if pred has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: Average loss (shape=(1,1)) of the loss tensor with shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $pred, GluonInput $target,
+    Maybe[GluonInput] $sample_weight=, Maybe[Num] $epsilon=1e-08
+)
+{
+    $target = __PACKAGE__->_reshape_like($F, $target, $pred);
+    my $loss;
+    if($self->from_logits)
+    {
+        $loss = $F->exp($pred) - $target * $pred;   # $pred is taken to be log(prediction) already
+    }
+    else
+    {
+        $loss = $pred - $target * $F->log($pred + $epsilon);   # $epsilon keeps log() away from 0
+    }
+    # The Stirling term and the weighting apply to both branches above, not only to the non-logits one.
+    if($self->compute_full)
+    {
+        my $stirling_factor = $target * $F->log($target) - $target + 0.5 * $F->log(2 * $target * 3.1415926);
+        $stirling_factor *= ($target > 1);   # keep the term only where target > 1
+        $loss += $stirling_factor;
+    }
+    $loss = __PACKAGE__->_apply_weighting($F, $loss, $self->weight, $sample_weight);
+    return $F->mean($loss);
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
+
+package AI::MXNet::Gluon::CosineEmbeddingLoss;
+use AI::MXNet::Gluon::Mouse;
+extends 'AI::MXNet::Gluon::Loss';
+has 'margin' => (is => 'rw', isa => 'Num', default => 0);
+
+=head1 NAME
+
+    AI::MXNet::Gluon::CosineEmbeddingLoss
+=cut
+
+=head1 DESCRIPTION
+
+    For a target label 1 or -1, vectors input1 and input2, the function computes the cosine distance
+    between the vectors. This can be interpreted as how similar/dissimilar two input vectors are.
+
+    .. math::
+
+        L = \sum_i \begin{cases} 1 - {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = 1\\
+                         {cos\_sim({input1}_i, {input2}_i)} & \text{ if } {label}_i = -1 \end{cases}\\
+        cos\_sim(input1, input2) = \frac{{input1}_i.{input2}_i}{||{input1}_i||.||{input2}_i||}
+
+    `input1`, `input2` can have arbitrary shape as long as they have the same number of elements.
+
+    Parameters
+    ----------
+    weight : float or None
+        Global scalar weight for loss.
+    batch_axis : int, default 0
+        The axis that represents mini-batch.
+    margin : float
+        Margin of separation between correct and incorrect pair.
+
+
+    Inputs:
+        - **input1**: a tensor with arbitrary shape
+        - **input2**: another tensor with same shape as input1 to which input1 is
+          compared for similarity and loss calculation
+        - **label**: A 1-D tensor indicating for each pair input1 and input2, target label is 1 or -1
+        - **sample_weight**: element-wise weighting tensor. Must be broadcastable
+          to the same shape as input1. For example, if input1 has shape (64, 10)
+          and you want to weigh each sample in the batch separately,
+          sample_weight should have shape (64, 1).
+
+    Outputs:
+        - **loss**: The loss tensor with shape (batch_size,).
+=cut
+
+method hybrid_forward(
+    GluonClass $F, GluonInput $input1, GluonInput $input2, GluonInput $label, Maybe[GluonInput] $sample_weight=
+)
+{
+    # $input1 goes through _reshape_like against $input2; $label becomes a single column.
+    $input1 = __PACKAGE__->_reshape_like($F, $input1, $input2);
+    $label = $label->reshape([-1, 1]);
+    my $similarity = $self->_cosine_similarity($F, $input1, $input2);
+    # Masks selecting the entries whose label equals 1 and -1 respectively.
+    my $positive_mask = $label == 1;
+    my $negative_mask = $label == -1;
+    my $positive_term = (1 - $similarity) * $positive_mask;
+
+    # Zero operand for the maximum below; it is built with array() when $F is the
+    # NDArray class and with zeros() otherwise.
+    my $zero = ($F eq 'AI::MXNet::NDArray') ? $F->array([0]) : $F->zeros([1, 1]);
+    my $negative_term = $F->broadcast_maximum(
+        $zero, $negative_mask * ($similarity - $self->margin), { axis=>1 }
+    );
+
+    # Add both terms, then apply the global weight and the optional sample weight.
+    my $total = $positive_term + $negative_term;
+    my $weighted = __PACKAGE__->_apply_weighting($F, $total, $self->weight, $sample_weight);
+    return $weighted;
+}
+
+method _cosine_similarity($F, $x, $y, $axis=-1)
+{
+    # Norms of both inputs and the sum of their element-wise product along $axis,
+    # each reshaped to a single column.
+    my $norm_x = $F->norm($x, axis=>$axis)->reshape([-1, 1]);
+    my $norm_y = $F->norm($y, axis=>$axis)->reshape([-1, 1]);
+    my $dot = $F->sum($x*$y, axis=>$axis)->reshape([-1, 1]);
+
+    # Lower bound of 1e-12 for the denominator, built according to the class $F names.
+    my $floor = ($F eq 'AI::MXNet::NDArray')
+        ? $F->array([1e-12])
+        : $F->full([1, 1], 1e-12);
+    my $denominator = $F->broadcast_maximum($norm_x * $norm_y, $floor);
+
+    return ($dot / $denominator);
+}
+
+__PACKAGE__->register('AI::MXNet::Gluon::Loss');
+
 1;
\ No newline at end of file
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
index c14b792e77d7..89493c7b8bfb 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Cell.pm
@@ -57,6 +57,7 @@ method _get_begin_state(GluonClass $F, $begin_state, GluonInput $inputs, $batch_
     return $begin_state;
 }
 
+
 method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
 {
     assert(
@@ -118,7 +119,7 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
         if($merge)
         {
             $inputs  = [map { $F->expand_dims($_, axis => $axis) } @{ $inputs }];
-            $inputs  = $F->concat(@{ $inputs }, dim => $axis);
+            $inputs  = $F->stack(@{ $inputs }, axis => $axis);
             $in_axis = $axis;
         }
     }
@@ -129,6 +130,54 @@ method _format_sequence($length, $inputs, $layout, $merge, $in_layout=)
     return ($inputs, $axis, $F, $batch_size);
 }
 
+method _mask_sequence_variable_length($F, $data, $length, $valid_length, $time_axis, $merge)
+{
+    # Applies SequenceMask to $data using $valid_length along $time_axis.
+    assert(defined $valid_length);
+    # an unblessed value is treated as a list of steps and stacked along the time axis first
+    $data = $F->stack(@$data, axis=>$time_axis) unless blessed $data;
+    my $masked = $F->SequenceMask(
+        $data,
+        { sequence_length=>$valid_length, use_sequence_length=>1, axis=>$time_axis }
+    );
+    return $masked if $merge;
+
+    # not merging: split into $length outputs and always hand back an array reference
+    my $steps = $F->split(
+        $masked,
+        { num_outputs=>$length, axis=>$time_axis, squeeze_axis=>1 }
+    );
+    $steps = [$steps] unless ref($steps) eq 'ARRAY';
+    return $steps;
+}
+
+method _reverse_sequences($sequences, $unroll_step, $valid_length=)
+{
+    # Returns the steps of $sequences in reverse order; when $valid_length is
+    # given the reversal is delegated to SequenceReverse.
+
+    # The operator class is chosen from the type of the first element.
+    my $F = $sequences->[0]->isa('AI::MXNet::Symbol')
+        ? 'AI::MXNet::Symbol'
+        : 'AI::MXNet::NDArray';
+
+    # No lengths supplied: flip the Perl list itself.
+    return [reverse(@$sequences)] unless defined $valid_length;
+
+    # Otherwise stack the steps along axis 0, reverse with the lengths, and
+    # split the result back into $unroll_step pieces.
+    my $flipped = $F->SequenceReverse(
+        $F->stack(@$sequences, axis=>0),
+        {sequence_length=>$valid_length, use_sequence_length=>1}
+    );
+
+    my $pieces = $F->split(
+        $flipped,
+        {axis=>0, num_outputs=>$unroll_step, squeeze_axis=>1}
+    );
+    return $pieces;
+}
+
 =head1 NAME
 
     AI::MXNet::Gluon::RNN::RecurrentCell
@@ -280,21 +329,39 @@ method unroll(
     Maybe[GluonInput] $inputs,
     Maybe[GluonInput] :$begin_state=,
     Str :$layout='NTC',
-    Maybe[Bool] :$merge_outputs=
+    Maybe[Bool] :$merge_outputs=,
+    Maybe[Bool] :$valid_length=
 )
 {
     $self->reset();
-    my ($F, $batch_size);
-    ($inputs, undef, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0);
+    my ($F, $batch_size, $axis);
+    ($inputs, $axis, $F, $batch_size) = $self->_format_sequence($length, $inputs, $layout, 0);
     $begin_state //= $self->_get_begin_state($F, $begin_state, $inputs, $batch_size);
 
     my $states = $begin_state;
     my $outputs = [];
+    my $all_states = [];
     for my $i (0..$length-1)
     {
         my $output;
         ($output, $states) = $self->($inputs->[$i], $states);
         push @$outputs, $output;
+        if(defined $valid_length)
+        {
+            push @$all_states, $states;
+        }
+    }
+    if(defined $valid_length)
+    {
+        $states = [];
+        for(zip(@$all_states))
+        {
+            push @$states, $F->SequenceLast($F->stack(@$_, axis=>0),
+                                     sequence_length=>$valid_length,
+                                     use_sequence_length=>1,
+                                     axis=>0);
+        }
+        $outputs = $self->_mask_sequence_variable_length($F, $outputs, $length, $valid_length, $axis, 1);
     }
     ($outputs) = $self->_format_sequence($length, $outputs, $layout, $merge_outputs);
     return ($outputs, $states);
@@ -304,8 +371,17 @@ method _get_activation(GluonClass $F, GluonInput $inputs, Activation $activation
 {
     if(not blessed $activation)
     {
+        my %act = map { $_ => 1 } qw(tanh relu sigmoid softsign);
+        if(exists $act{$activation})
+        {
+            return $F->$activation($inputs, %kwargs)
+        }
         return $F->Activation($inputs, act_type=>$activation, %kwargs);
     }
+    elsif(ref($activation) =~ /LeakyReLU/)
+    {
+        return $F->LeakyReLU($inputs, act_type=>'leaky', slope => $activation->alpha, %kwargs);
+    }
     else
     {
         return $activation->($inputs, %kwargs);
@@ -430,7 +506,7 @@ has [qw/
 method python_constructor_arguments()
 {
     [qw/
-        hidden_size activation 
+        hidden_size activation
         i2h_weight_initializer h2h_weight_initializer
         i2h_bias_initializer h2h_bias_initializer
         input_size
@@ -476,16 +552,17 @@ method hybrid_forward(
 {
     my $prefix = "t${\ $self->counter}_";
     my $i2h = $F->FullyConnected(
-        $inputs, $i2h_weight, $i2h_bias,
+        data => $inputs, weight => $i2h_weight, bias => $i2h_bias,
         num_hidden => $self->hidden_size,
         name => "${prefix}i2h"
     );
     my $h2h = $F->FullyConnected(
-        $states->[0], $h2h_weight, $h2h_bias,
+        data => $states->[0], weight => $h2h_weight, bias => $h2h_bias,
         num_hidden => $self->hidden_size,
         name => "${prefix}h2h"
     );
-    my $output = $self->_get_activation($F, $i2h + $h2h, $self->activation, name => "${prefix}out");
+    my $i2h_plus_h2h = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
+    my $output = $self->_get_activation($F, $i2h_plus_h2h, $self->activation, name => "${prefix}out");
     return ($output, [$output]);
 }
 
@@ -555,6 +632,7 @@ method python_constructor_arguments()
     /];
 }
 
+
 sub BUILD
 {
     my $self = shift;
@@ -606,14 +684,18 @@ method hybrid_forward(
         num_hidden => $self->hidden_size*4,
         name => "${prefix}h2h"
     );
-    my $gates = $i2h + $h2h;
+    my $gates = $F->elemwise_add($i2h, $h2h, name => "${prefix}plus0");
     my @slice_gates = @{ $F->SliceChannel($gates, num_outputs => 4, name => "${prefix}slice") };
     my $in_gate = $F->Activation($slice_gates[0], act_type=>"sigmoid", name => "${prefix}i");
     my $forget_gate = $F->Activation($slice_gates[1], act_type=>"sigmoid", name => "${prefix}f");
     my $in_transform = $F->Activation($slice_gates[2], act_type=>"tanh", name => "${prefix}c");
     my $out_gate = $F->Activation($slice_gates[3], act_type=>"sigmoid", name => "${prefix}o");
-    my $next_c = $F->_plus($forget_gate * $states->[1], $in_gate * $in_transform, name => "${prefix}state");
-    my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh"), name => "${prefix}out");
+    my $next_c = $F->_plus(
+        $F->elemwise_mul($forget_gate, $states->[1], name => "${prefix}mul0"),
+        $F->elemwise_mul($in_gate, $in_transform, name => "${prefix}mul1"),
+        name => "${prefix}state"
+    );
+    my $next_h = $F->_mul($out_gate, $F->Activation($next_c, act_type=>"tanh", name => "${prefix}activation0"), name => "${prefix}out");
     return ($next_h, [$next_h, $next_c]);
 }
 
@@ -735,10 +817,29 @@ method hybrid_forward(
     my ($i2h_r, $i2h_z, $h2h_r, $h2h_z);
     ($i2h_r, $i2h_z, $i2h) = @{ $F->SliceChannel($i2h, num_outputs => 3, name => "${prefix}i2h_slice") };
     ($h2h_r, $h2h_z, $h2h) = @{ $F->SliceChannel($h2h, num_outputs => 3, name => "${prefix}h2h_slice") };
-    my $reset_gate  = $F->Activation($i2h_r + $h2h_r, act_type=>"sigmoid", name => "${prefix}r_act");
-    my $update_gate = $F->Activation($i2h_z + $h2h_z, act_type=>"sigmoid", name => "${prefix}z_act");
-    my $next_h_tmp = $F->Activation($i2h + $reset_gate * $h2h, act_type => "tanh", name => "${prefix}h_act");
-    my $next_h = $F->_plus((1 - $update_gate) * $next_h_tmp, $update_gate * $prev_state_h, name => "${prefix}out");
+    my $reset_gate  = $F->Activation($F->elemwise_add($i2h_r, $h2h_r, name => "${prefix}plus0"), act_type=>"sigmoid", name => "${prefix}r_act");
+    my $update_gate = $F->Activation($F->elemwise_add($i2h_z, $h2h_z, name => "${prefix}plus1"), act_type=>"sigmoid", name => "${prefix}z_act");
+    my $next_h_tmp = $F->Activation(
+        $F->elemwise_add(
+            $i2h,
+            $F->elemwise_mul(
+                $reset_gate, $h2h, name => "${prefix}mul0"
+            ),
+            name => "${prefix}plus2"
+        ),
+        act_type => "tanh",
+        name => "${prefix}h_act"
+    );
+    my $ones = $F->ones_like($update_gate, name => "${prefix}ones_like0");
+    my $next_h = $F->_plus(
+        $F->elemwise_mul(
+            $F->elemwise_sub($ones, $update_gate, name => "${prefix}minus0"),
+            $next_h_tmp,
+            name => "${prefix}mul1"
+        ),
+        $F->elemwise_mul($update_gate, $prev_state_h, name => "${prefix}mul2"),
+        name => "${prefix}out"
+    );
     return ($next_h, [$next_h]);
 }
 
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
index 2b6e8a5bdae4..08212ab20f6d 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Gluon/RNN/Layer.pm
@@ -21,7 +21,7 @@ package AI::MXNet::Gluon::RNN::Layer;
 use AI::MXNet::Function::Parameters;
 use AI::MXNet::Gluon::Mouse;
 use AI::MXNet::Base;
-extends 'AI::MXNet::Gluon::Block';
+extends 'AI::MXNet::Gluon::HybridBlock';
 
 has 'hidden_size'   => (is => 'rw', isa => 'Int');
 has 'num_layers'    => (is => 'rw', isa => 'Int');
@@ -29,18 +29,19 @@ has 'layout'        => (is => 'rw', isa => 'Str');
 has 'dropout'       => (is => 'rw', isa => 'Num');
 has 'bidirectional' => (is => 'rw', isa => 'Bool');
 has 'input_size'    => (is => 'rw', isa => 'Int', default => 0);
+has 'projection_size' => (is => 'rw', isa => 'Maybe[Int]');
+has [qw/lstm_state_clip_min
+        lstm_state_clip_max/] => (is => 'rw', isa => 'Maybe[Num]');
+has 'lstm_state_clip_nan' => (is => 'rw', isa => 'Bool', default => 0);
 has [qw/
     i2h_weight_initializer
     h2h_weight_initializer
     i2h_bias_initializer
     h2h_bias_initializer
+    h2r_weight_initializer
     /]              => (is => 'rw', isa => 'Maybe[Initializer]');
 has 'mode'          => (is => 'rw', isa => 'Str');
 has [qw/dir gates
-    i2h_weight
-    h2h_weight
-    i2h_bias
-    h2h_bias
     unfused/]       => (is => 'rw', init_arg => undef);
 
 method python_constructor_arguments()
@@ -50,7 +51,8 @@ method python_constructor_arguments()
         dropout bidirectional input_size
         i2h_weight_initializer h2h_weight_initializer
         i2h_bias_initializer h2h_bias_initializer
-        mode
+        mode projection_size h2r_weight_initializer
+        lstm_state_clip_min lstm_state_clip_max lstm_state_clip_nan
     /];
 }
 
@@ -61,41 +63,76 @@ sub BUILD
         ($self->layout eq 'TNC' or $self->layout eq 'NTC'),
         "Invalid layout [${\ $self->layout }]; must be one of ['TNC' or 'NTC']"
     );
-    $self->i2h_weight([]);
-    $self->h2h_weight([]);
-    $self->i2h_bias([]);
-    $self->h2h_bias([]);
     $self->dir($self->bidirectional ? 2 : 1);
     $self->gates({qw/rnn_relu 1 rnn_tanh 1 lstm 4 gru 3/}->{$self->mode});
     my ($ng, $ni, $nh) = ($self->gates, $self->input_size, $self->hidden_size);
-    for my $i (0..$self->num_layers-1)
+    if(not $self->projection_size)
     {
-        for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+        for my $i (0..$self->num_layers-1)
         {
-            push @{ $self->i2h_weight }, $self->params->get(
-                "$j${i}_i2h_weight", shape=>[$ng*$nh, $ni],
-                init => $self->i2h_weight_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->h2h_weight }, $self->params->get(
-                "$j${i}_h2h_weight", shape=>[$ng*$nh, $nh],
-                init => $self->h2h_weight_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->i2h_bias }, $self->params->get(
-                "$j${i}_i2h_bias", shape=>[$ng*$nh],
-                init => $self->i2h_bias_initializer,
-                allow_deferred_init => 1
-            );
-            push @{ $self->h2h_bias }, $self->params->get(
-                "$j${i}_h2h_bias", shape=>[$ng*$nh],
-                init => $self->h2h_bias_initializer,
-                allow_deferred_init => 1
-            );
+            for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+            {
+                $self->_register_param(
+                    "$j${i}_i2h_weight", [$ng*$nh, $ni],
+                    $self->i2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_weight", [$ng*$nh, $nh],
+                    $self->h2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_i2h_bias", [$ng*$nh],
+                    $self->i2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_bias", [$ng*$nh],
+                    $self->h2h_bias_initializer,
+                );
+            }
+            $ni = $nh * $self->dir;
+        }
+    }
+    else
+    {
+        my $np = $self->projection_size;
+        for my $i (0..$self->num_layers-1)
+        {
+            for my $j ($self->dir == 2 ? ('l', 'r') : ('l'))
+            {
+                $self->_register_param(
+                    "$j${i}_i2h_weight", [$ng*$nh, $ni],
+                    $self->i2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_weight", [$ng*$nh, $np],
+                    $self->h2h_weight_initializer
+                );
+                $self->_register_param(
+                    "$j${i}_i2h_bias", [$ng*$nh],
+                    $self->i2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2h_bias", [$ng*$nh],
+                    $self->h2h_bias_initializer,
+                );
+                $self->_register_param(
+                    "$j${i}_h2r_weight", [$np, $nh],
+                    $self->h2r_weight_initializer,
+                );
+            }
+            $ni = $np * $self->dir;
         }
-        $ni = $nh * $self->dir;
     }
-    $self->unfused($self->_unfuse());
+}
+
+method _register_param($name, $shape, $init)
+{
+    # Obtains the parameter $name through params->get with deferred init allowed,
+    # then passes it to the method called $name on this object and returns it.
+    my @spec = (shape=>$shape, init=>$init, allow_deferred_init=>1);
+    my $param = $self->params->get($name, @spec);
+    $self->$name($param);
+    return $param;
+}
 
 use overload '""' => sub {
@@ -119,15 +156,55 @@ use overload '""' => sub {
     return $s;
 };
 
+method _collect_params_with_prefix(Str $prefix='')
+{
+    # Returns a hashref of this layer's registered parameters (and its children's), keyed as
+    # "<prefix>_unfused.<layer>[.<l|r>_cell].<i2h|h2h|h2r>_<weight|bias>".
+    $prefix .= '.' if($prefix);
+
+    # h2r is accepted so that a projection weight named "<l|r><layer>_h2r_weight" matches too.
+    my $pattern = qr/(l|r)(\d+)_(i2h|h2h|h2r)_(weight|bias)$/;
+    my %params = %{ $self->_reg_params };
+
+    # Capture into lexicals and fail loudly: after a failed match $1..$4 do not
+    # describe the current key and would silently yield wrong or colliding names.
+    my %parts;
+    for my $k (keys %params)
+    {
+        my @m = $k =~ $pattern or confess("Unexpected RNN parameter name '$k'");
+        $parts{$k} = \@m;
+    }
+    # Any right-direction ("r") parameter switches every key to the per-direction layout.
+    my $bidirectional = grep { $_->[0] eq 'r' } values %parts;
+
+    my %ret;
+    for my $k (keys %params)
+    {
+        my ($d, $l, $g, $t) = @{ $parts{$k} };
+        my $cell = $bidirectional ? "${d}_cell." : '';
+        $ret{"${prefix}_unfused.$l.$cell${g}_$t"} = $params{$k};
+    }
+    my $iter = $self->_children->iterator;
+    while(my ($name, $child) = $iter->())
+    {
+        %ret = (%ret, %{ $child->_collect_params_with_prefix("$prefix$name") });
+    }
+    return \%ret;
+}
+
 method state_info($batch_size=0)
 {
     confess('NotImplementedError');
 }
 
-# Unfuses the fused RNN in to a stack of rnn cells.
 
 method _unfuse()
 {
+    assert((not $self->projection_size), "_unfuse does not support projection layer yet!");
+    assert(
+        (not $self->lstm_state_clip_min and not $self->lstm_state_clip_max),
+        "_unfuse does not support state clipping yet!"
+    );
     my $get_cell = {
         rnn_relu => sub {
             my %kwargs = @_;
@@ -218,89 +295,105 @@ method begin_state(
 }
 
 use Data::Dumper;
-method forward(GluonInput $inputs, Maybe[GluonInput] $states=)
+method hybrid_forward(GluonClass $F, GluonInput $inputs, @args)
 {
-    my $batch_size = $inputs->shape->[index($self->layout, 'N')];
-    my $skip_states = not defined $states;
-    if($skip_states)
+    my $states;
+    if(@args)
     {
-        $states = $self->begin_state($batch_size, ctx=>$inputs->context);
+        if(not defined $args[0] or ref $args[0])
+        {
+            $states = shift(@args);
+            undef $states if(ref $states eq 'ARRAY' and not @$states);
+        }
     }
-    if(blessed $states and $states->isa('AI::MXNet::NDArray'))
+    use Data::Dumper;
+
+    my $batch_size;
+    if($F eq 'AI::MXNet::NDArray')
     {
-        $states = [$states];
+        $batch_size = $inputs->shape->[index($self->layout, 'N')];
     }
-    for(zip($states, $self->state_info($batch_size))) {
-        my ($state, $info) = @$_;
-        if(Dumper($state->shape) ne Dumper($info->{shape}))
+    my $skip_states = not defined $states;
+    if($skip_states)
+    {
+        if($F eq 'AI::MXNet::NDArray')
         {
-            my @state_shape = @{ $state->shape };
-            confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape.");
+            $states = $self->begin_state($batch_size, ctx=>$inputs->context, dtype=>$inputs->dtype);
         }
-    }
-    if($self->input_size == 0)
-    {
-        for my $i (0..$self->dir-1)
+        else
         {
-            $self->i2h_weight->[$i]->shape([$self->gates*$self->hidden_size, $inputs->shape->[2]]);
-            $self->i2h_weight->[$i]->_finish_deferred_init();
+            $states = $self->begin_state(0, func=>sub { return AI::MXNet::Symbol->zeros(@_) });
         }
     }
-    my $out;
-    if($inputs->context->device_type eq 'gpu')
+    if(blessed $states and ($states->isa('AI::MXNet::NDArray') or $states->isa('AI::MXNet::Symbol')))
     {
-        $out = $self->_forward_gpu($inputs, $states);
+        $states = [$states];
     }
-    else
+    if($F eq 'AI::MXNet::NDArray')
     {
-        $out = $self->_forward_cpu($inputs, $states);
+        for(zip($states, $self->state_info($batch_size)))
+        {
+            my ($state, $info) = @$_;
+            if(Dumper($state->shape) ne Dumper($info->{shape}))
+            {
+                my @state_shape = @{ $state->shape };
+                confess("Invalid recurrent state shape. Expecting @{$info->{shape}}, got @state_shape.");
+            }
+        }
     }
-
-    # out is (output, state)
+    my $out = $self->_forward_kernel($F, $inputs, $states, @args);
     return $skip_states ? $out->[0] : $out;
 }
 
-method _forward_cpu($inputs, $states)
+method _forward_kernel($F, $inputs, $states, %kwargs)
 {
-    my $ns = @{ $states };
-    my $axis = index($self->layout, 'T');
-    $states = [map { @{$_} } @{ $states }];
-    my $outputs;
-    ($outputs, $states) = $self->unfused->unroll(
-        $inputs->shape->[$axis], $inputs, begin_state => $states,
-        layout => $self->layout, merge_outputs => 1
-    );
-    my @new_states;
-    for my $i (0..$ns-1)
+    if($self->layout eq 'NTC')
     {
-        my @tmp;
-        for (my $j = $i; $j < @{ $states }; $j += $ns)
+        $inputs = $F->swapaxes($inputs, dim1=>0, dim2=>1);
+    }
+    my @params;
+    if(not defined $self->projection_size)
+    {
+        for my $t ('weight', 'bias')
         {
-            push @tmp, $states->[$j];
+            for my $l (0..$self->num_layers-1)
+            {
+                for my $d ($self->dir == 2 ? ('l', 'r') : ('l'))
+                {
+                    for my $g ('i2h', 'h2h')
+                    {
+                        push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1]);
+                    }
+                }
+            }
         }
-        my $state = AI::MXNet::NDArray->concat((map { $_->reshape([1, @{ $_->shape }]) } @tmp), dim => 0);
-        push @new_states, $state;
     }
-    return [$outputs, \@new_states];
-}
-
-method _forward_gpu($inputs, $states)
-{
-    if($self->layout eq 'NTC')
+    else
     {
-        $inputs = $inputs->swapaxes(dim1 => 0, dim2 => 1);
+        for my $t ('weight', 'bias')
+        {
+            for my $l (0..$self->num_layers-1)
+            {
+                for my $d ($self->dir == 2 ? ('l', 'r') : ('l'))
+                {
+                    for my $g ('i2h', 'h2h', 'h2r')
+                    {
+                        push @params, $kwargs{"$d${l}_${g}_$t"}->reshape([-1])
+                            unless($g eq 'h2r' and $t eq 'bias');
+                    }
+                }
+            }
+        }
     }
-    my $ctx = $inputs->context;
-    my @params = map { $_->data($ctx)->reshape([-1]) } map { @{ $_ } } (
-        $self->i2h_weight, $self->h2h_weight,
-        $self->i2h_bias, $self->h2h_bias
-    );
-    my $params = AI::MXNet::NDArray->concat(@params, dim => 0);
-    my $rnn = AI::MXNet::NDArray->RNN(
-        $inputs, $params, @{ $states }, state_size => $self->hidden_size,
+    my $params = $F->_rnn_param_concat(@params, dim=>0);
+    my $rnn = $F->RNN(
+        $inputs, $params, @{ $states }, { state_size => $self->hidden_size,
         num_layers => $self->num_layers, bidirectional => $self->dir == 2 ? 1 : 0,
-        p => $self->dropout, state_outputs => 1, mode => $self->mode
-    );
+        p => $self->dropout, state_outputs => 1, mode => $self->mode,
+        (defined $self->lstm_state_clip_min ? (lstm_state_clip_min=>$self->lstm_state_clip_min) : ()),
+        (defined $self->lstm_state_clip_max ? (lstm_state_clip_max=>$self->lstm_state_clip_max) : ()),
+        (defined $self->lstm_state_clip_nan ? (lstm_state_clip_nan=>$self->lstm_state_clip_nan) : ())
+    });
     my $outputs;
     my @rnn = @{$rnn};
     if($self->mode eq 'lstm')
@@ -318,7 +411,6 @@ method _forward_gpu($inputs, $states)
     return [$outputs, $states];
 }
 
-
 package AI::MXNet::Gluon::RNN::RNN;
 
 =head1 NAME
@@ -552,7 +644,10 @@ method state_info(DimSize $batch_size=0)
 {
     return [
         {
-            shape => [$self->num_layers * $self->dir, $batch_size, $self->hidden_size],
+            shape => [
+                $self->num_layers * $self->dir, $batch_size, 
+                defined $self->projection_size ? $self->projection_size : $self->hidden_size
+            ],
             __layout__ => 'LNC'
         },
         {
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
index 0359cc3640d4..75c8b1e3dad1 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Initializer.pm
@@ -191,6 +191,16 @@ method call(Str|AI::MXNet::InitDesc $desc, AI::MXNet::NDArray $arr)
             $self->$method($desc, $arr);
             $self->_verbose_print($desc, $1, $arr);
         }
+        elsif($desc =~ /min$/)
+        {
+            $self->_init_zero($desc, $arr);
+            $self->_verbose_print($desc, 'min', $arr);
+        }
+        elsif($desc =~ /max$/)
+        {
+            $self->_init_one($desc, $arr);
+            $self->_verbose_print($desc, 'max', $arr);
+        }
         else
         {
             $self->_init_default($desc, $arr)
@@ -250,6 +260,14 @@ method _legacy_init(Str $name, AI::MXNet::NDArray $arr)
     {
         $self->_init_zero($name, $arr);
     }
+    elsif($name =~ /min$/)
+    {
+        $self->_init_zero($name, $arr);
+    }
+    elsif($name =~ /max$/)
+    {
+        $self->_init_one($name, $arr);
+    }
     else
     {
         $self->_init_default($name, $arr);
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
index 3a7b6bab2e2c..72f6cc772178 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/NDArray.pm
@@ -1226,6 +1226,9 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway
     :$repeat=1 : number, optional
         The repeating time of all elements.
         E.g repeat=3, the element a will be repeated three times --> a, a, a.
+    :$infer_range=0 : Bool
+        When set to 1, infer stop position from start, step, repeat, and
+        output tensor size.
     :$ctx : Context, optional
         The context of the NDArray, defaultw to current default context.
     :$dtype : data type, optional
@@ -1237,7 +1240,7 @@ method concatenate(ArrayRef[AI::MXNet::NDArray] $arrays, Index :$axis=0, :$alway
         The created NDArray
 =cut
 
-method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1,
+method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$repeat=1, Bool :$infer_range=0,
               AI::MXNet::Context :$ctx=AI::MXNet::Context->current_ctx, Dtype :$dtype='float32')
 {
     return __PACKAGE__->_arange({
@@ -1246,6 +1249,7 @@ method arange(Index :$start=0, Maybe[Index] :$stop=, Index :$step=1, Index :$rep
                 step => $step,
                 repeat => $repeat,
                 dtype => $dtype,
+                infer_range => $infer_range,
                 ctx => "$ctx"
     });
 }
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
index 57bfdf1d977c..04dd1cbfc441 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Symbol.pm
@@ -1411,16 +1411,19 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St
 
     Parameters
     ----------
-    start : number
+    :$start=0 : number
         Start of interval. The interval includes this value. The default start value is 0.
-    stop : number, optional
+    :$stop= : number, optional
         End of interval. The interval does not include this value.
-    step : number, optional
+    :$step=1.0 : number, optional
         Spacing between values
-    repeat : int, optional
+    :$repeat=1 : int, optional
         "The repeating time of all elements.
         E.g repeat=3, the element a will be repeated three times --> a, a, a.
-    dtype : type, optional
+    :$infer_range=0 : Bool
+        When set to 1, infer stop position from start, step, repeat, and
+        output tensor size.
+    :$dtype='float32' : type, optional
         The value type of the NDArray, default to np.float32
 
     Returns
@@ -1429,11 +1432,12 @@ method ones(Shape :$shape, Dtype :$dtype='float32', Maybe[Str] :$name=, Maybe[St
         The created Symbol
 =cut
 
-method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Maybe[Str] :$name=, Dtype :$dtype='float32')
+method arange(Index :$start=0, Index :$stop=, Num :$step=1.0, Index :$repeat=1, Bool :$infer_range=0, Maybe[Str] :$name=, Dtype :$dtype='float32')
 {
     return __PACKAGE__->_arange({
                  start => $start, (defined $stop ? (stop => $stop) : ()),
-                 step => $step, repeat => $repeat, name => $name, dtype => $dtype
+                 step => $step, repeat => $repeat, name => $name, dtype => $dtype,
+                 infer_range => $infer_range
     });
 }
 
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
index 20811f10fedf..1574ea58307f 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
@@ -172,6 +172,10 @@ method print_summary(
                 $cur_param = $num_filter * 2;
             }
         }
+        elsif($op eq 'Embedding')
+        {
+            $cur_param = $node->{attrs}{input_dim} * $node->{attrs}{output_dim};
+        }
         my $first_connection;
         if(not $pre_node)
         {
diff --git a/perl-package/AI-MXNet/t/test_gluon_rnn.t b/perl-package/AI-MXNet/t/test_gluon_rnn.t
index 83b294d110ce..51e6ad53e171 100644
--- a/perl-package/AI-MXNet/t/test_gluon_rnn.t
+++ b/perl-package/AI-MXNet/t/test_gluon_rnn.t
@@ -14,6 +14,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
 use strict;
 use warnings;
 use Test::More tests => 77;
@@ -276,14 +277,15 @@ sub check_rnn_layer_forward
     $inputs->attach_grad;
     my $out;
     mx->autograd->record(sub {
-        $out = $layer->($inputs, $states);
         if(defined $states)
         {
+            $out = $layer->($inputs, $states);
             ok(@$out == 2);
             $out = $out->[0];
         }
         else
         {
+            $out = $layer->($inputs);
             ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         $out->backward();
@@ -292,21 +294,19 @@ sub check_rnn_layer_forward
     my $pdl_out = $out->aspdl;
     my $pdl_dx = $inputs->grad->aspdl;
     $layer->hybridize;
-
     mx->autograd->record(sub {
-        $out = $layer->($inputs, $states);
         if(defined $states)
         {
-            ok(@$out == 2);
-            $out = $out->[0]
+            ($out, $states) = $layer->($inputs, $states);
+            ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         else
         {
+            $out = $layer->($inputs, $states);
             ok(blessed $out and $out->isa('AI::MXNet::NDArray'));
         }
         $out->backward();
     });
-
     ok(almost_equal($pdl_out, $out->aspdl, 1e-3));
     ok(almost_equal($pdl_dx, $inputs->grad->aspdl, 1e-3));
 }
@@ -314,21 +314,12 @@ sub check_rnn_layer_forward
 sub test_rnn_layers
 {
     check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->RNN(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10]));
+    check_rnn_layer_forward(gluon->rnn->RNN(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
     check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2), mx->nd->ones([8, 3, 20]), [mx->nd->ones([2, 3, 10]), mx->nd->ones([2, 3, 10])]);
+    check_rnn_layer_forward(gluon->rnn->LSTM(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), [mx->nd->ones([4, 3, 10]), mx->nd->ones([4, 3, 10])]);
     check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]));
-    check_rnn_layer_forward(gluon->rnn->GRU(10, 2), mx->nd->ones([8, 3, 20]), mx->nd->ones([2, 3, 10]));
-
-#    my $net = gluon->nn->Sequential();
-#    $net->add(gluon->rnn->LSTM(10, 2, bidirectional=>1));
-#    $net->add(gluon->nn->BatchNorm(axis=>2));
-#    $net->add(gluon->nn->Flatten());
-#    $net->add(gluon->nn->Dense(3, activation=>'relu'));
-#    $net->collect_params()->initialize();
-#    mx->autograd->record(sub {
-#        $net->(mx->nd->ones([2, 3, 10]))->backward();
-#    });
+    check_rnn_layer_forward(gluon->rnn->GRU(10, 2, bidirectional=>1), mx->nd->ones([8, 3, 20]), mx->nd->ones([4, 3, 10]));
 }
 
 test_rnn_layers();
+
diff --git a/perl-package/AI-MXNet/t/test_loss.t b/perl-package/AI-MXNet/t/test_loss.t
index 7fc7ee81d0de..5a9e413bbfaf 100644
--- a/perl-package/AI-MXNet/t/test_loss.t
+++ b/perl-package/AI-MXNet/t/test_loss.t
@@ -17,7 +17,7 @@
 
 use strict;
 use warnings;
-use Test::More tests => 30;
+use Test::More tests => 32;
 use AI::MXNet 'mx';
 use AI::MXNet::Gluon 'gluon';
 use AI::MXNet::TestUtils 'almost_equal';
@@ -435,3 +435,47 @@ sub test_triplet_loss
 
 test_triplet_loss();
 
+sub test_cosine_loss
+{
+    # Compare CosineEmbeddingLoss with the same formula written out in NDArray ops.
+    my $input1 = mx->nd->random->randn(3, 2);
+    my $input2 = mx->nd->random->randn(3, 2);
+    my $label  = mx->nd->sign(mx->nd->random->randn($input1->shape->[0]));
+
+    my $criterion = gluon->loss->CosineEmbeddingLoss();
+    my $loss = $criterion->($input1, $input2, $label);
+
+    # cosine = sum(input1 * input2) / (sqrt(sum(input1**2)) * sqrt(sum(input2**2))), over axis 1
+    my $dot   = mx->nd->sum($input1 * $input2, keepdims => 1, axis => 1);
+    my $norm1 = mx->nd->sqrt(mx->nd->sum($input1**2, axis=>1, keepdims=>1));
+    my $norm2 = mx->nd->sqrt(mx->nd->sum($input2**2, axis=>1, keepdims=>1));
+    my $cosine = $dot / ($norm1 * $norm2);
+    my $floor  = mx->nd->broadcast_maximum(mx->nd->array([0]), $cosine, { axis=>1 });
+    my $expected = mx->nd->where(($label == 1), 1 - $cosine, $floor);
+    ok(almost_equal($loss->aspdl, $expected->aspdl));
+}
+
+test_cosine_loss();
+
+sub test_poisson_nllloss
+{
+    # Fit exp(get_net(1)) under PoissonNLLLoss and require a final loss below 0.05.
+    my $num_samples = 1000;
+    mx->random->seed(1234);
+    srand(1234);
+    my $features = mx->random->poisson(shape=>[$num_samples, 2]);
+    my $targets = mx->random->poisson(lam=>4, shape=>[$num_samples, 1]);
+    my $batches = mx->io->NDArrayIter($features, $targets, batch_size=>20, label_name=>'label', shuffle=>1);
+    my $rate = mx->sym->exp(get_net(1));
+    my $label_var = mx->symbol->Variable('label');
+    my $criterion = gluon->loss->PoissonNLLLoss(from_logits=>0);
+    my $objective = mx->sym->make_loss($criterion->($rate, $label_var));
+    my $module = mx->mod->Module($objective, data_names=>['data'], label_names=>['label']);
+    local($AI::MXNet::Logging::silent) = 1;
+    my @fit_args = (num_epoch=>20, optimizer_params=>{learning_rate => 0.01});
+    push @fit_args, initializer=>mx->init->Normal(sigma=>0.1), eval_metric=>mx->metric->Loss();
+    $module->fit($batches, @fit_args, optimizer=>'adam');
+    ok($module->score($batches, mx->metric->Loss())->{loss} < 0.05);
+}
+
+test_poisson_nllloss;
diff --git a/perl-package/AI-MXNetCAPI/Changes b/perl-package/AI-MXNetCAPI/Changes
index 08ad085abce9..cdbbdab57cdf 100644
--- a/perl-package/AI-MXNetCAPI/Changes
+++ b/perl-package/AI-MXNetCAPI/Changes
@@ -1,5 +1,8 @@
 Revision history for Perl extension AI::MXNetCAPI
 
+1.4     Mon Feb 18 11:54:07 PST 2019
+        - Support for 64bit integers
+
 1.33    Thu Oct  4 13:25:56 PDT 2018
         - Gluon: Better sparse support for KVStore.
         - Gpu memory info via mxnet api call.
diff --git a/perl-package/AI-MXNetCAPI/META.json b/perl-package/AI-MXNetCAPI/META.json
index 1849e6b3bc18..82bee1ace8f8 100644
--- a/perl-package/AI-MXNetCAPI/META.json
+++ b/perl-package/AI-MXNetCAPI/META.json
@@ -37,5 +37,5 @@
       }
    },
    "release_status" : "stable",
-   "version" : "1.33"
+   "version" : "1.4"
 }
diff --git a/perl-package/AI-MXNetCAPI/META.yml b/perl-package/AI-MXNetCAPI/META.yml
index d870f05fbe52..bd4af4047378 100644
--- a/perl-package/AI-MXNetCAPI/META.yml
+++ b/perl-package/AI-MXNetCAPI/META.yml
@@ -36,4 +36,4 @@ no_index:
     - inc
 requires:
   Test::More: '0'
-version: '1.33'
+version: '1.4'
diff --git a/perl-package/AI-MXNetCAPI/README b/perl-package/AI-MXNetCAPI/README
index 67b77ccd1614..848b4d03ab21 100644
--- a/perl-package/AI-MXNetCAPI/README
+++ b/perl-package/AI-MXNetCAPI/README
@@ -1,4 +1,4 @@
-AI-MXNetCAPI version 1.33
+AI-MXNetCAPI version 1.4
 =====================
 
 Swig interface to MXNet c api.
diff --git a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
index bc7676047d76..e3b71f8efc92 100644
--- a/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
+++ b/perl-package/AI-MXNetCAPI/lib/AI/MXNetCAPI.pm
@@ -18,7 +18,7 @@
 package AI::MXNetCAPI;
 use base qw(DynaLoader);
 bootstrap AI::MXNetCAPI;
-our $VERSION = '1.33';
+our $VERSION = '1.4';
 1;
 __END__
 
diff --git a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
index 68e11ca74e1a..50296c2aaba5 100644
--- a/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
+++ b/perl-package/AI-MXNetCAPI/mxnet_typemaps.i
@@ -115,7 +115,7 @@
     }
 }
 
-%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp)
+%typemap(in,numinputs=0) (int *out) (int temp), (bool *out) (bool temp), (uint64_t *out) (uint64_t temp)
 {
     temp = 0;
     $1 = &temp;
@@ -131,6 +131,17 @@
     }
 }
 
+%typemap(argout) (uint64_t *out)
+{   /* Emit the uint64_t output argument as an extra Perl return value. */
+    if(!result)   /* only when the wrapped call's result is zero */
+    {
+        $result = newSVnv((double)(*$1));   /* NOTE(review): goes through double, so values above 2^53 are not exact - confirm acceptable */
+        sv_2mortal($result);
+        argvi++;
+    }
+}
+
+
 %typemap(in,numinputs=0) (const int **out_stypes) (int* temp)
 {
     temp = NULL;
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
index a7cef7674496..1a8d2cea9cd6 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py
@@ -62,48 +62,22 @@ def sample_multinomial(attrs, inputs, proto_obj):
     new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(attrs.get('dtype', 6))]
     return 'sample_multinomial', new_attrs, inputs
 
-
 # Arithmetic Operations
 def add(attrs, inputs, proto_obj):
     """Adding two tensors"""
-    new_attr = {}
-    if 'broadcast' in attrs and attrs['broadcast'] == 1:
-        broadcast_axis = attrs['axis']
-        op_value = translation_utils._fix_broadcast('broadcast_add', inputs,
-                                                    broadcast_axis, proto_obj)
-        return op_value, new_attr, inputs
-    return 'broadcast_add', new_attr, inputs
+    return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_add')
 
 def subtract(attrs, inputs, proto_obj):
     """Subtracting two tensors"""
-    new_attr = {}
-    if 'broadcast' in attrs and attrs['broadcast'] == 1:
-        broadcast_axis = attrs['axis']
-        op_value = translation_utils._fix_broadcast('broadcast_sub', inputs,
-                                                    broadcast_axis, proto_obj)
-        return op_value, new_attr, inputs
-    return 'broadcast_sub', new_attr, inputs
-
+    return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_sub')
 
 def multiply(attrs, inputs, proto_obj):
     """Multiply two tensors"""
-    new_attr = {}
-    if 'broadcast' in attrs and attrs['broadcast'] == 1:
-        broadcast_axis = attrs['axis']
-        op_value = translation_utils._fix_broadcast('broadcast_mul', inputs,
-                                                    broadcast_axis, proto_obj)
-        return op_value, new_attr, inputs
-    return 'broadcast_mul', new_attr, inputs
+    return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_mul')
 
 def divide(attrs, inputs, proto_obj):
     """Divide two tensors"""
-    new_attr = {}
-    if 'broadcast' in attrs and attrs['broadcast'] == 1:
-        broadcast_axis = attrs['axis']
-        op_value = translation_utils._fix_broadcast('broadcast_div', inputs,
-                                                    broadcast_axis, proto_obj)
-        return op_value, new_attr, inputs
-    return 'broadcast_div', new_attr, inputs
+    return translation_utils.broadcast_arithmetic_helper(attrs, inputs, proto_obj, 'broadcast_div')
 
 def mean(attrs, inputs, proto_obj):
     """Mean of all the input tensors."""
diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
index 6fd52665ca31..0c6730513d4b 100644
--- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
+++ b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py
@@ -221,7 +221,7 @@ def get_input_shape(sym, proto_obj):
     model_input_shape = [data[1] for data  in proto_obj.model_metadata.get('input_tensor_data')]
     data_names = [data[0] for data  in proto_obj.model_metadata.get('input_tensor_data')]
 
-    #creating dummy inputs
+    # creating dummy inputs
     inputs = []
     for  in_shape in model_input_shape:
         inputs.append(nd.ones(shape=in_shape))
@@ -245,3 +245,17 @@ def get_input_shape(sym, proto_obj):
     result = mod.get_outputs()[0].asnumpy()
 
     return result.shape
+
+def broadcast_arithmetic_helper(attrs, inputs, proto_obj, current_op_name):
+    """Return ``(op, new_attr, inputs)`` for an ONNX broadcast arithmetic op.
+
+    ``op`` is ``_fix_broadcast(...)`` when ``attrs['broadcast'] == 1`` and ``inputs[0]`` is named
+    after a batchnorm/convolution/deconvolution op (bias reshape), else ``current_op_name``.
+    """
+    new_attr = {}
+    # One prefix per entry: a single comma-joined string only matches that literal text.
+    op_names = ('batchnorm', 'convolution', 'deconvolution')
+    if attrs.get('broadcast') == 1 and inputs[0].name.startswith(op_names):
+        op_value = _fix_broadcast(current_op_name, inputs, attrs['axis'], proto_obj)
+        return op_value, new_attr, inputs
+    return current_op_name, new_attr, inputs
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index c08077cc65f4..efb51096c368 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -181,8 +181,10 @@ def _update_params(param_arrays, grad_arrays, updater, num_device,
             w, g = p
             updates[k].append((index*num_device+k, g, w))
     for dev_updates in updates:
-        i, w, g = zip(*dev_updates)
-        updater(i, w, g)
+        # update params if param_arrays and grad_arrays are not empty
+        if dev_updates:
+            i, w, g = zip(*dev_updates)
+            updater(i, w, g)
 
 
 def _multiple_callbacks(callbacks, *args, **kwargs):
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 43de0c9d7535..3e3e79ed59f7 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -882,6 +882,81 @@ def infer_type(self, *args, **kwargs):
             List of auxiliary state types.
             The order is same as the order of list_auxiliary_states().
         """
+        try:
+            res = self._infer_type_impl(False, *args, **kwargs)
+            if res[1] is None:
+                arg_shapes, _, _ = self._infer_type_impl(True, *args, **kwargs)
+                arg_names = self.list_arguments()
+                unknowns = []
+                for name, dtype in zip(arg_names, arg_shapes):
+                    if not dtype:
+                        if len(unknowns) >= 10:
+                            unknowns.append('...')
+                            break
+                        unknowns.append('%s: %s' % (name, str(dtype)))
+                warnings.warn(
+                    "Cannot decide type for the following arguments. " +
+                    "Consider providing them as input:\n\t" +
+                    "\n\t".join(unknowns), stacklevel=2)
+            return res
+        except MXNetError:
+            print("infer_type error. Arguments:")
+            for i, arg in enumerate(args):
+                print("  #%d: %s" % (i, arg))
+            for k, v in kwargs.items():
+                print("  %s: %s" % (k, v))
+            raise
+
+    def infer_type_partial(self, *args, **kwargs):
+        """Infers the type partially.
+
+        This function works the same way as `infer_type`,
+        except that this function can return partial results.
+
+        In the following example, the type of `prev` is not available. So, `infer_type`
+        will return a tuple of `None` values but `infer_type_partial` will return partial values.
+
+        Example
+        -------
+        >>> data = mx.sym.Variable('data')
+        >>> prev = mx.sym.Variable('prev')
+        >>> casted_prev  = mx.sym.cast(prev, dtype='float32')
+        >>> out  = mx.sym.Activation(data=mx.sym.elemwise_add(data, casted_prev), act_type='relu')
+        >>> out.list_arguments()
+        ['data', 'prev']
+        >>> out.infer_type(data='float32')
+        (None, None, None)
+        >>> out.infer_type_partial(data='float32')
+        ([numpy.float32, None], [numpy.float32], [])
+        >>> # infers type if you give information about prev
+        >>> out.infer_type(data='float32', prev='float16')
+        ([numpy.float32, numpy.float16], [numpy.float32], [])
+
+        Parameters
+        ----------
+        *args :
+            Type of known arguments in a positional way.
+            Unknown type can be marked as None.
+
+        **kwargs :
+            Keyword arguments of known types.
+
+        Returns
+        -------
+        arg_types : list of numpy.dtype or None
+            List of argument types.
+            The order is same as the order of list_arguments().
+        out_types : list of numpy.dtype or None
+            List of output types.
+            The order is same as the order of list_outputs().
+        aux_types : list of numpy.dtype or None
+            List of auxiliary state types.
+            The order is same as the order of list_auxiliary_states().
+        """
+        return self._infer_type_impl(True, *args, **kwargs)
+
+    def _infer_type_impl(self, partial, *args, **kwargs):
+        """The actual implementation for calling type inference API."""
         # pylint: disable=too-many-locals
         if len(args) != 0 and len(kwargs) != 0:
             raise ValueError('Can only specify known argument \
@@ -912,7 +987,11 @@ def infer_type(self, *args, **kwargs):
         aux_type_size = mx_uint()
         aux_type_data = ctypes.POINTER(ctypes.c_int)()
         complete = ctypes.c_int()
-        check_call(_LIB.MXSymbolInferType(
+        if partial:
+            infer_func = _LIB.MXSymbolInferTypePartial
+        else:
+            infer_func = _LIB.MXSymbolInferType
+        check_call(infer_func(
             self.handle,
             mx_uint(len(sdata)),
             keys,
diff --git a/scala-package/README.md b/scala-package/README.md
index 8322ab2a237f..c7d0cecf15ac 100644
--- a/scala-package/README.md
+++ b/scala-package/README.md
@@ -179,6 +179,37 @@ mvn deploy -Pstaging
 
 Examples & Usage
 -------
+Assuming you use `mvn install`, you can find the `mxnet-full_scala_version-INTERNAL.jar` e.g. `mxnet-full_2.11-INTERNAL.jar` under the path `incubator-mxnet/scala-package/assembly/target`.
+
+Add the following configuration to your `pom.xml`:
+```xml
+<dependency>
+  <groupId>org.apache.mxnet</groupId>
+  <artifactId>mxnet-full_2.11-INTERNAL</artifactId>
+  <version>1.5.0</version>
+  <scope>system</scope>
+  <systemPath>path_to_jar/mxnet-full_2.11-INTERNAL.jar</systemPath>
+</dependency>
+```
+If you see the following error message:
+```
+Error: A JNI error has occurred, please check your installation and try again
+Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/mxnet/NDArray
+        at java.lang.Class.getDeclaredMethods0(Native Method)
+        at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
+        at java.lang.Class.privateGetMethodRecursive(Class.java:3048)
+        at java.lang.Class.getMethod0(Class.java:3018)
+        at java.lang.Class.getMethod(Class.java:1784)
+        at sun.launcher.LauncherHelper.validateMainClass(LauncherHelper.java:544)
+        at sun.launcher.LauncherHelper.checkAndLoadMain(LauncherHelper.java:526)
+Caused by: java.lang.ClassNotFoundException: org.apache.mxnet.NDArray
+        at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
+        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
+        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
+        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
+```
+Please make sure your $CLASSPATH is able to find `mxnet-full_scala_version-INTERNAL.jar`.
+
 - To set up the Scala Project using IntelliJ IDE on macOS follow the instructions [here](https://mxnet.incubator.apache.org/tutorials/scala/mxnet_scala_on_intellij.html).
 - Several examples on using the Scala APIs are provided in the [Scala Examples Folder](/~https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/)
 
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 32b63c11dd9a..9f0d2834fcce 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -638,6 +638,27 @@ int MXSymbolInferType(SymbolHandle sym,
   API_END();
 }
 
+int MXSymbolInferTypePartial(SymbolHandle sym,  // thin wrapper over MXSymbolInferType
+                             mx_uint num_args,
+                             const char** keys,
+                             const int *arg_type_data,
+                             mx_uint *in_type_size,
+                             const int **in_type_data,
+                             mx_uint *out_type_size,
+                             const int **out_type_data,
+                             mx_uint *aux_type_size,
+                             const int **aux_type_data,
+                             int *complete) {
+  int succ;  // receives MXSymbolInferType's last out-argument; never read here
+  *complete = 1;  // unconditionally reported as complete, whatever succ ends up being
+  return MXSymbolInferType(sym, num_args, keys,  // all other arguments are forwarded unchanged
+                            arg_type_data,
+                            in_type_size, in_type_data,
+                            out_type_size, out_type_data,
+                            aux_type_size, aux_type_data,
+                            &succ);
+}
+
 int MXSymbolGrad(SymbolHandle sym, mx_uint num_wrt, const char** wrt, SymbolHandle* out) {
   API_BEGIN();
   LOG(FATAL) << "not implemented";
diff --git a/src/operator/contrib/adamw-inl.h b/src/operator/contrib/adamw-inl.h
index 3d76b33ae765..66bd4f3f3ba4 100644
--- a/src/operator/contrib/adamw-inl.h
+++ b/src/operator/contrib/adamw-inl.h
@@ -33,6 +33,7 @@
 #include <nnvm/op.h>
 #include <nnvm/op_attr_types.h>
 #include <vector>
+#include <cmath>
 #include "../operator_common.h"
 #include "../mshadow_op.h"
 #include "../elemwise_op_common.h"
@@ -48,7 +49,6 @@ struct AdamWParam : public dmlc::Parameter<AdamWParam> {
   float epsilon;
   float wd;
   float eta;
-  float rescale_grad;
   float clip_gradient;
   DMLC_DECLARE_PARAMETER(AdamWParam) {
     DMLC_DECLARE_FIELD(lr)
@@ -69,9 +69,6 @@ struct AdamWParam : public dmlc::Parameter<AdamWParam> {
               "The penalty scales with the square of the magnitude of each weight.");
     DMLC_DECLARE_FIELD(eta)
     .describe("Learning rate schedule multiplier");
-    DMLC_DECLARE_FIELD(rescale_grad)
-    .set_default(1.0f)
-    .describe("Rescale gradient to grad = rescale_grad*grad.");
     DMLC_DECLARE_FIELD(clip_gradient)
     .set_default(-1.0f)
     .describe("Clip gradient to the range of [-clip_gradient, clip_gradient] "
@@ -80,44 +77,138 @@ struct AdamWParam : public dmlc::Parameter<AdamWParam> {
   }
 };
 
+// Shape inference for update ops whose last input (index total_in - 1) is rescale_grad. Example:
+// n_in = 2: weight, grad (fp16); forwarded to ElemwiseAttr as its n_in
+// n_out = 1: weight (fp16)
+// total_in = 6: weight, grad, mean, var, weight32, rescale_grad (fp32)
+template<int n_in, int n_out, int total_in>
+inline bool MPUpdateInferShape(const nnvm::NodeAttrs& attrs,
+                               std::vector<TShape> *in_attrs,
+                               std::vector<TShape> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), static_cast<size_t>(total_in)) << " in operator " << attrs.name;
+  CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out)) << " in operator " << attrs.name;
+  // the last input, rescale_grad, is assigned/checked against shape (1,)
+  SHAPE_ASSIGN_CHECK(*in_attrs, total_in - 1, mshadow::Shape1(1));
+  return ElemwiseAttr<TShape, shape_is_none, shape_assign, true, shape_string, n_in, n_out>(
+      attrs, in_attrs, out_attrs, TShape());
+}
+
+// Type inference for update ops whose last input (index total_in - 1) is rescale_grad. Example:
+// n_in = 2: weight, grad (fp16); forwarded to ElemwiseAttr as its n_in
+// n_out = 1: weight (fp16)
+// total_in = 6: weight, grad, mean, var, weight32, rescale_grad (fp32)
+template<int n_in, int n_out, int total_in>
+inline bool MPUpdateInferType(const nnvm::NodeAttrs& attrs,
+                              std::vector<int> *in_attrs,
+                              std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), static_cast<size_t>(total_in)) << " in operator " << attrs.name;
+  CHECK_EQ(out_attrs->size(), static_cast<size_t>(n_out)) << " in operator " << attrs.name;
+  for (int i = n_in; i < total_in; ++i) {  // inputs n_in .. total_in - 1, rescale_grad included
+    TYPE_ASSIGN_CHECK(*in_attrs, i, mshadow::kFloat32);  // assigned/checked as float32
+  }
+  return ElemwiseAttr<int, type_is_none, type_assign, true, type_string, n_in, n_out>(
+      attrs, in_attrs, out_attrs, -1);
+}
+
+template<int req>  // req: write mode handed to KERNEL_ASSIGN below
+struct MPAdamWKernel {  // per-element AdamW step; all arithmetic is carried out in float
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out_data, float* mean_data,
+    float* var_data, const DType* weight_data, const DType* grad_data, float* weight32,
+    const float param_clip_gradient, const float param_beta1, const float param_beta2,
+    const float param_eta, const float param_lr, const float param_wd,
+    const float param_rescale_grad, const float param_epsilon) {
+    float w = weight32[i];  // start from the float weight; weight_data is not read in this body
+    float mean = mean_data[i];
+    float var = var_data[i];
+    float scaled_grad = param_rescale_grad*static_cast<float>(grad_data[i]);  // rescale first
+    if (param_clip_gradient >= 0.0f) {  // a negative clip_gradient skips clipping
+      mean = param_beta1 * mean +
+             (1 - param_beta1) * mshadow_op::clip::Map(scaled_grad, param_clip_gradient);
+      var = param_beta2 * var + (1 - param_beta2) *
+            mshadow_op::square::Map(mshadow_op::clip::Map(scaled_grad, param_clip_gradient));
+    } else {
+      mean = param_beta1 * mean + (1 - param_beta1) * scaled_grad;
+      var = param_beta2 * var + (1 - param_beta2) * mshadow_op::square::Map(scaled_grad);
+    }
+    mean_data[i] = mean;  // both moment estimates are written back in place
+    var_data[i] = var;
+    w = w - param_eta * (param_lr * mean / (mshadow_op::square_root::Map(var) + param_epsilon)
+                         + param_wd * w);
+    weight32[i] = w;  // float weight is updated in place
+    KERNEL_ASSIGN(out_data[i], req, w);  // the same value goes to the DType output per req
+  }
+};
+
+
+template<typename xpu>
+struct MPAdamWUpdate {  // launches MPAdamWKernel once per element of the weight tensor
+  static inline void Forward(const nnvm::NodeAttrs& attrs,
+               const OpContext &ctx,
+               const std::vector<TBlob> &inputs,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &outputs,
+               const float rescale_grad) {
+    using namespace mxnet_op;
+    const AdamWParam& param = nnvm::get<AdamWParam>(attrs.parsed);  // bind by reference, no copy
+    Stream<xpu>* s = ctx.get_stream<xpu>();
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {  // DType follows inputs[0] (weight)
+      Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, float> mean = inputs[2].FlatTo2D<xpu, float>(s);  // inputs 2-4: float
+      Tensor<xpu, 2, float> var = inputs[3].FlatTo2D<xpu, float>(s);
+      Tensor<xpu, 2, float> weight32 = inputs[4].FlatTo2D<xpu, float>(s);
+      Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {  // req[0] becomes the kernel's template arg
+        Kernel<MPAdamWKernel<req_type>, xpu>::Launch(s, weight.shape_.Size(), out.dptr_, mean.dptr_,
+          var.dptr_, weight.dptr_, grad.dptr_, weight32.dptr_, param.clip_gradient, param.beta1,
+          param.beta2, param.eta, param.lr, param.wd, rescale_grad, param.epsilon);
+      });
+    });
+  }
+};
+
 /*
  * \brief adam_w update.
  */
 template<typename xpu>
-inline void AdamWUpdate(const nnvm::NodeAttrs& attrs,
-                        const OpContext &ctx,
-                        const std::vector<TBlob> &inputs,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &outputs) {
-  using namespace mshadow;
-  using namespace mshadow::expr;
-  using namespace mshadow_op;
-  const AdamWParam& param = nnvm::get<AdamWParam>(attrs.parsed);
-  Stream<xpu>* s = ctx.get_stream<xpu>();
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
+struct AdamWUpdate {  // AdamW step in which weight, grad, mean and var all share DType
+  static inline void Forward(const nnvm::NodeAttrs& attrs,
+                             const OpContext &ctx,
+                             const std::vector<TBlob> &inputs,
+                             const std::vector<OpReqType> &req,
+                             const std::vector<TBlob> &outputs,
+                             const float rescale_grad) {  // scalar supplied by the caller
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    using namespace mshadow_op;
+    const AdamWParam& param = nnvm::get<AdamWParam>(attrs.parsed);
+    Stream<xpu>* s = ctx.get_stream<xpu>();
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {  // DType follows inputs[0] (weight)
+      Tensor<xpu, 2, DType> weight = inputs[0].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> grad = inputs[1].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> mean = inputs[2].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
+      Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
 
-    grad = scalar<DType>(param.rescale_grad) * grad;
-    if (param.clip_gradient >= 0.0f) {
-      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
-          F<clip>(grad, DType(param.clip_gradient));
-      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2)*F<square>(
-          F<clip>(grad, DType(param.clip_gradient)));
-    } else {
-      mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
-      var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
-    }
-    Assign(out, req[0],
-           weight -
-           scalar<DType>(param.eta) * (scalar<DType>(param.lr) *
-           mean / (F<square_root>(var) + scalar<DType>(param.epsilon)) +
-           (scalar<DType>(param.wd) * weight)));
-  });
-}
+      grad = scalar<DType>(rescale_grad) * grad;  // NOTE(review): overwrites inputs[1]? confirm
+      if (param.clip_gradient >= 0.0f) {  // a negative clip_gradient skips clipping
+        mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
+            F<clip>(grad, DType(param.clip_gradient));
+        var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2)*F<square>(
+            F<clip>(grad, DType(param.clip_gradient)));
+      } else {
+        mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
+        var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
+      }
+      Assign(out, req[0],  // weight - eta * (lr * mean / (sqrt(var) + eps) + wd * weight)
+             weight -
+             scalar<DType>(param.eta) * (scalar<DType>(param.lr) *
+             mean / (F<square_root>(var) + scalar<DType>(param.epsilon)) +
+             (scalar<DType>(param.wd) * weight)));
+    });
+  }
+};
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/contrib/adamw.cc b/src/operator/contrib/adamw.cc
index 94623fe08a9e..2fbc39743c93 100644
--- a/src/operator/contrib/adamw.cc
+++ b/src/operator/contrib/adamw.cc
@@ -24,12 +24,76 @@
  * \author Haibin Lin
  */
 #include "./adamw-inl.h"
+#include "../optimizer_op-inl.h"
 
 namespace mxnet {
 namespace op {
 
 DMLC_REGISTER_PARAMETER(AdamWParam);
 
+template<template <typename xpu> class F>
+inline void MPUpdateCPU(const nnvm::NodeAttrs& attrs,
+                        const OpContext &ctx,
+                        const std::vector<TBlob> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &outputs) {
+  // the last input is rescale_grad; read it directly and skip the update if it is NaN, Inf or 0
+  TBlob scale_blob = inputs[inputs.size() - 1];
+  MSHADOW_REAL_TYPE_SWITCH(scale_blob.type_flag_, DType, {
+    float scalef = static_cast<float>(*scale_blob.dptr<DType>());
+    if (!std::isfinite(scalef) || scalef == 0) return;  // leave without calling F<cpu>::Forward
+    std::vector<TBlob> inputs_wo_scale;  // every input except the trailing rescale_grad
+    size_t num_in = inputs.size();
+    inputs_wo_scale.reserve(num_in - 1);
+    for (size_t i = 0; i < num_in - 1; i++) inputs_wo_scale.emplace_back(inputs[i]);
+    F<cpu>::Forward(attrs, ctx, inputs_wo_scale, req, outputs, scalef);  // scale passed as float
+  });
+}
+
+NNVM_REGISTER_OP(_contrib_mp_adamw_update)
+.describe(R"code(Update function for multi-precision AdamW optimizer.
+
+AdamW is seen as a modification of Adam by decoupling the weight decay from the
+optimization steps taken w.r.t. the loss function.
+
+Adam update consists of the following steps, where g represents gradient and m, v
+are 1st and 2nd order moment estimates (mean and variance).
+
+.. math::
+
+ g_t = \nabla J(W_{t-1})\\
+ m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
+ v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
+ W_t = W_{t-1} - \eta_t (\alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon } + wd W_{t-1})
+
+It updates the weights using::
+
+ m = beta1*m + (1-beta1)*grad
+ v = beta2*v + (1-beta2)*(grad**2)
+ w -= eta * (learning_rate * m / (sqrt(v) + epsilon) + w * wd)
+
+Note that gradient is rescaled to grad = rescale_grad * grad. If rescale_grad is NaN, Inf, or 0,
+the update is skipped.
+)code" ADD_FILELINE)
+.set_num_inputs(6)  // weight, grad, mean, var, weight32, rescale_grad
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<AdamWParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", MPUpdateInferShape<2, 1, 6>)
+.set_attr<nnvm::FInferType>("FInferType", MPUpdateInferType<2, 1, 6>)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    return std::vector<uint32_t>{2, 3, 4};  // mean, var and weight32 are written by the op
+  })
+.set_attr<FCompute>("FCompute<cpu>", MPUpdateCPU<MPAdamWUpdate>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_argument("weight32", "NDArray-or-Symbol", "Weight32")
+.add_argument("rescale_grad", "NDArray-or-Symbol",
+              "Rescale gradient to rescale_grad * grad. If NaN, Inf, or 0, the update is skipped.")
+.add_arguments(AdamWParam::__FIELDS__());
+
 NNVM_REGISTER_OP(_contrib_adamw_update)
 .describe(R"code(Update function for AdamW optimizer. AdamW is seen as a modification of
 Adam by decoupling the weight decay from the optimization steps taken w.r.t. the loss function.
@@ -50,21 +114,25 @@ It updates the weights using::
  v = beta2*v + (1-beta2)*(grad**2)
  w -= eta * (learning_rate * m / (sqrt(v) + epsilon) + w * wd)
 
+Note that gradient is rescaled to grad = rescale_grad * grad. If rescale_grad is NaN, Inf, or 0,
+the update is skipped.
 )code" ADD_FILELINE)
-.set_num_inputs(4)
+.set_num_inputs(5)  // weight, grad, mean, var, rescale_grad
 .set_num_outputs(1)
 .set_attr_parser(ParamParser<AdamWParam>)
-.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
-.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<nnvm::FInferShape>("FInferShape", MPUpdateInferShape<4, 1, 5>)
+.set_attr<nnvm::FInferType>("FInferType", MPUpdateInferType<4, 1, 5>)
 .set_attr<nnvm::FMutateInputs>("FMutateInputs",
   [](const nnvm::NodeAttrs& attrs) {
     return std::vector<uint32_t>{2, 3};
   })
-.set_attr<FCompute>("FCompute<cpu>", AdamWUpdate<cpu>)
+.set_attr<FCompute>("FCompute<cpu>", MPUpdateCPU<AdamWUpdate>)  // wrapper strips rescale_grad
 .add_argument("weight", "NDArray-or-Symbol", "Weight")
 .add_argument("grad", "NDArray-or-Symbol", "Gradient")
 .add_argument("mean", "NDArray-or-Symbol", "Moving mean")
 .add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_argument("rescale_grad", "NDArray-or-Symbol",
+              "Rescale gradient to rescale_grad * grad. If NaN, Inf, or 0, the update is skipped.")
 .add_arguments(AdamWParam::__FIELDS__());
 
 }  // namespace op
diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu
index b7452f861e2d..e21b83b8aba6 100644
--- a/src/operator/contrib/adamw.cu
+++ b/src/operator/contrib/adamw.cu
@@ -28,8 +28,33 @@
 namespace mxnet {
 namespace op {
 
+template<template <typename xpu> class F>
+inline void MPUpdateGPU(const nnvm::NodeAttrs& attrs,
+                        const OpContext &ctx,
+                        const std::vector<TBlob> &inputs,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &outputs) {
+  // fetch rescale_grad (the last input) to the host; skip the update if it is NaN, Inf or 0
+  TBlob scale_blob = inputs[inputs.size() - 1];
+  MSHADOW_REAL_TYPE_SWITCH(scale_blob.type_flag_, DType, {
+    DType scale = 0;  // host-side destination of the copy
+    CUDA_CALL(cudaMemcpy(&scale, scale_blob.dptr<DType>(), sizeof(DType),
+       cudaMemcpyDeviceToHost));  // plain cudaMemcpy (not Async): the host waits for the value
+    float scalef = static_cast<float>(scale);
+    if (!std::isfinite(scalef) || scalef == 0) return;  // leave without calling F<gpu>::Forward
+    std::vector<TBlob> inputs_wo_scale;  // every input except the trailing rescale_grad
+    size_t num_in = inputs.size();
+    inputs_wo_scale.reserve(num_in - 1);
+    for (size_t i = 0; i < num_in - 1; i++) inputs_wo_scale.emplace_back(inputs[i]);
+    F<gpu>::Forward(attrs, ctx, inputs_wo_scale, req, outputs, scalef);  // scale passed as float
+  });
+}
+
 NNVM_REGISTER_OP(_contrib_adamw_update)
-.set_attr<FCompute>("FCompute<gpu>", AdamWUpdate<gpu>);
+.set_attr<FCompute>("FCompute<gpu>", MPUpdateGPU<AdamWUpdate>);
+
+NNVM_REGISTER_OP(_contrib_mp_adamw_update)
+.set_attr<FCompute>("FCompute<gpu>", MPUpdateGPU<MPAdamWUpdate>);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 6cab1990858b..d8fc5031e4ff 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -249,6 +249,48 @@ inline int get_num_threads<cpu>(const int N) {
     LOG(FATAL) << "Unknown type enum " << type;            \
   }
 
+#define MXNET_REAL_ACC_TYPE_SWITCH(type, DType, AType, ...)\
+  switch (type) {                                          \
+  case mshadow::kFloat32:                                  \
+    {                                                      \
+      typedef float DType;                                 \
+      typedef double AType;  /* wider than DType */        \
+      {__VA_ARGS__}                                        \
+    }                                                      \
+    break;                                                 \
+  case mshadow::kFloat64:                                  \
+    {                                                      \
+      typedef double DType;                                \
+      typedef double AType;  /* same as DType */           \
+      {__VA_ARGS__}                                        \
+    }                                                      \
+    break;                                                 \
+  case mshadow::kFloat16:                                  \
+    {                                                      \
+      typedef mshadow::half::half_t DType;                 \
+      typedef float AType;  /* wider than DType */         \
+      {__VA_ARGS__}                                        \
+    }                                                      \
+    break;                                                 \
+  case mshadow::kUint8:  /* integer types: LOG(FATAL) */   \
+    LOG(FATAL) << "This operation only support "           \
+                  "floating point types not uint8";        \
+    break;                                                 \
+  case mshadow::kInt8:                                     \
+    LOG(FATAL) << "This operation only support "           \
+                  "floating point types not int8";         \
+    break;                                                 \
+  case mshadow::kInt32:                                    \
+    LOG(FATAL) << "This operation only support "           \
+                  "floating point types, not int32";       \
+    break;                                                 \
+  case mshadow::kInt64:                                    \
+    LOG(FATAL) << "This operation only support "           \
+                  "floating point types, not int64";       \
+    break;                                                 \
+  default:                                                 \
+    LOG(FATAL) << "Unknown type enum " << type;            \
+  }
 
 /*!
  * \brief assign the val to out according
diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h
index c063e385f63a..90950bc9e92e 100644
--- a/src/operator/nn/softmax-inl.h
+++ b/src/operator/nn/softmax-inl.h
@@ -25,6 +25,9 @@
 #ifndef MXNET_OPERATOR_NN_SOFTMAX_INL_H_
 #define MXNET_OPERATOR_NN_SOFTMAX_INL_H_
 
+#include <algorithm>
+#include <string>
+#include <utility>
 #include <vector>
 
 #include "../mxnet_op.h"
@@ -36,23 +39,33 @@ namespace op {
 namespace mxnet_op {
 
 struct softmax_fwd {
-  template<typename DType>
-  MSHADOW_XINLINE static DType Map(DType a, DType b) {
-    return DType(expf(a)/b);
+  template<typename AType>
+  MSHADOW_XINLINE static AType Map(float a, AType b) {  // float a: single-precision expf
+    return AType(expf(a)/b);  // quotient is converted to the type of b
+  }
+
+  template<typename AType>
+  MSHADOW_XINLINE static AType Map(double a, AType b) {  // double a: double-precision exp
+    return AType(exp(a)/b);
   }
 };
 
 
 struct log_softmax_fwd {
   template<typename DType>
-  MSHADOW_XINLINE static DType Map(DType a, DType b) {
-    return DType(a - logf(b));
+  MSHADOW_XINLINE static float Map(DType a, float b) {  // float b: logf, float result
+    return a - logf(b);
+  }
+
+  template<typename DType>
+  MSHADOW_XINLINE static double Map(DType a, double b) {  // double b: log, double result
+    return a - log(b);
   }
 };
 
 
-template<typename OP, bool negate, typename DType, int ndim>
-inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
+template<typename OP, bool negate, typename AType, typename DType, typename OType, int ndim>
+inline void Softmax(Stream<cpu> *s, DType *in, OType *out,
                     Shape<ndim> shape, int axis, const DType temperature) {
   index_t M = shape[axis];
   index_t N = shape.Size()/M;
@@ -72,10 +85,9 @@ inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
       if (mmax < val) mmax = val;
     }
 
-    DType sum = DType(0);
+    AType sum = AType(0);
     DType in_val;
-    // By default temperature is 1.0, and only in reinforcement training
-    // users would set it to other values.
+    // By default temperature is 1.0.
     // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
     if (temperature == 1.0) {
       for (index_t j = 0; j < M; ++j) {
@@ -103,23 +115,29 @@ inline void Softmax(Stream<cpu> *s, DType *in, DType *out,
 
 
 struct softmax_bwd {
-  template<typename DType>
-  MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
-    return DType(out * (ograd - sum));
+  template<typename DType, typename AType>
+  MSHADOW_XINLINE static AType Map(DType ograd, DType out, AType sum) {  // sum has its own type
+    return AType(out * (ograd - sum));  // result is returned in the type of sum
   }
 };
 
 
 struct log_softmax_bwd {
-  template<typename DType>
-  MSHADOW_XINLINE static DType Map(DType ograd, DType out, DType sum) {
-    return DType(ograd - expf(out)*sum);
+  template<typename AType>
+  MSHADOW_XINLINE static AType Map(float ograd, float out, AType sum) {  // float: expf
+    return AType(ograd - expf(out)*sum);  // result is returned in the type of sum
+  }
+
+  template<typename AType>
+  MSHADOW_XINLINE static AType Map(double ograd, double out, AType sum) {  // double: exp
+    return AType(ograd - exp(out)*sum);
   }
 };
 
 
-template<typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
-inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
+template<typename OP1, typename OP2, int Req, bool negate,
+  typename AType, typename DType, typename OType, int ndim>
+inline void SoftmaxGrad(Stream<cpu> *s, OType *out, OType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis,
                         const DType temperature) {
   index_t M = shape[axis];
@@ -133,13 +151,12 @@ inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
   for (int i = 0; i < static_cast<int>(N); ++i) {
     index_t base = unravel_dot(i, sshape, stride);
 
-    DType sum = DType(0);
+    AType sum = AType(0);
     for (index_t j = 0; j < M; ++j) {
       sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
     }
 
-    // By default temperature is 1.0, and only in reinforcement training
-    // users would set it to other values.
+    // By default temperature is 1.0.
     // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
     DType final_result;
     if (temperature == 1.0) {
@@ -162,19 +179,20 @@ inline void SoftmaxGrad(Stream<cpu> *s, DType *out, DType *ograd,
 
 
 #ifdef __CUDACC__
-template<int x_bits, typename OP, bool negate, typename DType, int ndim>
-__global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axis,
+template<int x_bits, typename OP, bool negate, typename AType, int ndim,
+  typename DType, typename OType>
+__global__ void softmax_compute_kernel(DType *in, OType *out, index_t M, int axis,
                                        Shape<ndim> sshape, Shape<ndim> stride,
                                        const double temperature) {
   const unsigned x_size = 1 << x_bits;
-  __shared__ DType smem[x_size];
+  __shared__ AType smem[x_size];
   index_t sa = stride[axis];
   index_t base = unravel_dot(blockIdx.x, sshape, stride);
   index_t x = threadIdx.x;
 
   red::maximum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
-    red::maximum::Reduce(smem[x], negate ? -in[base + i*sa] : in[base + i*sa]);
+    smem[x] = ::max(smem[x], negate ? -in[base + i*sa] : in[base + i*sa]);
   }
   __syncthreads();
   cuda::Reduce1D<red::maximum, x_bits>(smem);
@@ -186,13 +204,12 @@ __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axi
   DType val;
   for (index_t i = x; i < M; i += x_size) {
     val = negate ? -in[base + i*sa]:in[base + i*sa];
-    red::sum::Reduce(
-      smem[x], static_cast<DType>(expf((val - smax) / static_cast<DType>(temperature))));
+    smem[x] += static_cast<AType>(expf((val - smax) / static_cast<AType>(temperature)));
   }
   __syncthreads();
   cuda::Reduce1D<red::sum, x_bits>(smem);
   __syncthreads();
-  DType ssum = smem[0];
+  AType ssum = smem[0];
   __syncthreads();
 
   for (index_t i = x; i < M; i += x_size) {
@@ -201,8 +218,8 @@ __global__ void softmax_compute_kernel(DType *in, DType *out, index_t M, int axi
   }
 }
 
-template<typename OP, bool negate, typename DType, int ndim>
-inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
+template<typename OP, bool negate, typename AType, typename DType, typename OType, int ndim>
+inline void Softmax(Stream<gpu> *s, DType *in, OType *out,
                     Shape<ndim> shape, int axis, const double temperature) {
   const int x_bits = 7;
   const int x_size = 1 << x_bits;
@@ -212,31 +229,32 @@ inline void Softmax(Stream<gpu> *s, DType *in, DType *out,
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
 
-  softmax_compute_kernel<x_bits, OP, negate, DType, ndim>
+  softmax_compute_kernel<x_bits, OP, negate, AType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       in, out, M, axis, sshape, stride, temperature);
   MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel);
 }
 
 
-template<int x_bits, typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
-__global__ void softmax_gradient_kernel(DType *out, DType *ograd, DType *igrad,
+template<int x_bits, typename OP1, typename OP2, int Req, bool negate, typename AType, int ndim,
+  typename DType, typename OType>
+__global__ void softmax_gradient_kernel(OType *out, OType *ograd, DType *igrad,
                                         index_t M, int axis, Shape<ndim> sshape,
                                         Shape<ndim> stride, const double temperature) {
   const unsigned x_size = 1 << x_bits;
-  __shared__ DType smem[x_size];
+  __shared__ AType smem[x_size];
   index_t sa = stride[axis];
   index_t base = unravel_dot(blockIdx.x, sshape, stride);
   index_t x = threadIdx.x;
 
   red::sum::SetInitValue(smem[x]);
   for (index_t i = x; i < M; i += x_size) {
-    red::sum::Reduce(smem[x], OP1::Map(ograd[base + i*sa], out[base + i*sa]));
+    smem[x] += OP1::Map(ograd[base + i*sa], out[base + i*sa]);
   }
   __syncthreads();
   cuda::Reduce1D<red::sum, x_bits>(smem);
   __syncthreads();
-  DType ssum = smem[0];
+  AType ssum = smem[0];
   __syncthreads();
 
   DType final_result;
@@ -250,8 +268,9 @@ __global__ void softmax_gradient_kernel(DType *out, DType *ograd, DType *igrad,
 }
 
 
-template<typename OP1, typename OP2, int Req, bool negate, typename DType, int ndim>
-inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
+template<typename OP1, typename OP2, int Req, bool negate, typename AType, int ndim,
+  typename DType, typename OType>
+inline void SoftmaxGrad(Stream<gpu> *s, OType *out, OType *ograd,
                         DType *igrad, Shape<ndim> shape, int axis,
                         const double temperature) {
   const int x_bits = 7;
@@ -262,7 +281,7 @@ inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
   Shape<ndim> sshape = shape;
   sshape[axis] = 1;
 
-  softmax_gradient_kernel<x_bits, OP1, OP2, Req, negate, DType, ndim>
+  softmax_gradient_kernel<x_bits, OP1, OP2, Req, negate, AType, ndim>
     <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
       out, ograd, igrad, M, axis, sshape, stride, temperature);
   MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_gradient_kernel);
@@ -275,11 +294,105 @@ inline void SoftmaxGrad(Stream<gpu> *s, DType *out, DType *ograd,
 struct SoftmaxParam : public dmlc::Parameter<SoftmaxParam> {
   int axis;
   dmlc::optional<double> temperature;
+  dmlc::optional<int> dtype;  // optional output dtype; no value means "same as the input"
   DMLC_DECLARE_PARAMETER(SoftmaxParam) {
     DMLC_DECLARE_FIELD(axis).set_default(-1)
-      .describe("The axis along which to compute softmax.");
+    .describe("The axis along which to compute softmax.");
     DMLC_DECLARE_FIELD(temperature).set_default(dmlc::optional<double>())
-      .describe("Temperature parameter in softmax");
+    .describe("Temperature parameter in softmax");
+    DMLC_DECLARE_FIELD(dtype)
+    .add_enum("float16", mshadow::kFloat16)  // only the three floating-point names are listed
+    .add_enum("float32", mshadow::kFloat32)
+    .add_enum("float64", mshadow::kFloat64)
+    .set_default(dmlc::optional<int>())  // default: no value
+    .describe("DType of the output in case this can't be inferred. "
+              "Defaults to the same as input's dtype if not defined (dtype=None).");
+  }
+};
+
+static inline bool softmax_has_dtype_override(const nnvm::NodeAttrs& attrs) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  return param.dtype.has_value() && param.dtype.value() != -1;  // -1 counts as "not set"
+}
+
+static inline bool SoftmaxOpType(const nnvm::NodeAttrs& attrs,
+                                 std::vector<int>* in_attrs,
+                                 std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+
+  if (softmax_has_dtype_override(attrs)) {  // explicit dtype: it decides the output type
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, param.dtype.value());
+    type_assign(&(*in_attrs)[0], (*out_attrs)[0]);  // result of type_assign is not checked
+    return true;
+  } else {  // no override: defer to ElemwiseType<1, 1>
+    return ElemwiseType<1, 1>(attrs, in_attrs, out_attrs);
+  }
+}
+
+static inline bool SoftmaxGradOpShape(const nnvm::NodeAttrs& attrs,
+                                      std::vector<TShape> *in_attrs,
+                                      std::vector<TShape> *out_attrs) {
+  if (softmax_has_dtype_override(attrs)) {  // with a dtype override the grad op has 3 inputs
+    return ElemwiseShape<3, 1>(attrs, in_attrs, out_attrs);
+  } else {  // otherwise 2 inputs
+    return ElemwiseShape<2, 1>(attrs, in_attrs, out_attrs);
+  }
+}
+
+static inline bool SoftmaxGradOpType(const nnvm::NodeAttrs& attrs,
+                                     std::vector<int>* in_attrs,
+                                     std::vector<int>* out_attrs) {
+  CHECK_EQ(out_attrs->size(), 1);
+  if (softmax_has_dtype_override(attrs)) {  // inputs: ograd, data, output
+    CHECK_EQ(in_attrs->size(), 3);
+    int in_dtype = (*in_attrs)[1];  // dtype of "data"
+    int out_dtype = (*in_attrs)[2];  // dtype of "output"
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_dtype);  // ograd takes the dtype of "output"
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_dtype);  // the result takes the dtype of "data"
+
+    return (*out_attrs)[0] != -1 && (*in_attrs)[0] != -1;  // false while either is still -1
+  } else {  // inputs: ograd, output
+    CHECK_EQ(in_attrs->size(), 2);
+    int out_dtype = (*in_attrs)[1];  // dtype of "output"
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, out_dtype);  // result and ograd both take that dtype
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_dtype);
+
+    return (*out_attrs)[0] != -1 && (*in_attrs)[0] != -1;  // false while either is still -1
+  }
+}
+
+static inline std::vector<std::pair<int, int> >
+SoftmaxGradOpInplaceOption(const nnvm::NodeAttrs& attrs) {
+  if (softmax_has_dtype_override(attrs)) {  // 3-input form
+    return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}, {2, 0}};  // presumably {in, out}
+  } else {  // 2-input form
+    return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};  // TODO confirm pair meaning
+  }
+}
+
+static inline uint32_t SoftmaxGradOpNumInputs(const nnvm::NodeAttrs& attrs) {
+  return softmax_has_dtype_override(attrs) ? 3 : 2;  // one extra input with a dtype override
+}
+
+static inline std::vector<std::string> SoftmaxGradOpInputNames(const nnvm::NodeAttrs& attrs) {
+  if (softmax_has_dtype_override(attrs)) {  // dtype override: "data" is an additional input
+    return std::vector<std::string>{"ograd", "data", "output"};
+  } else {
+    return std::vector<std::string>{"ograd", "output"};
+  }
+}
+
+struct SoftmaxFGradient {  // functor that picks the gradient builder per node
+  const char *op_name;  // name handed on to the selected Elemwise* builder
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    if (softmax_has_dtype_override(n->attrs)) {  // dtype override: use the In+Out builder
+      return ElemwiseGradUseInOut {op_name}(n, ograds);
+    } else {  // otherwise the Out-only builder
+      return ElemwiseGradUseOut {op_name}(n, ograds);
+    }
   }
 };
 
@@ -297,16 +410,20 @@ void SoftmaxCompute(const nnvm::NodeAttrs& attrs,
   const double temperature = param.temperature.has_value() ?
     param.temperature.value() : 1.0;
   TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    if (shape.ndim() == 2) {
-      Softmax<OP, negate>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
-                          outputs[0].dptr<DType>(), shape.get<2>(), axis,
-                          static_cast<DType>(temperature));
-    } else {
-      Softmax<OP, negate>(ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
-                          outputs[0].dptr<DType>(), shape.get<3>(), axis,
-                          static_cast<DType>(temperature));
-    }
+  MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, DType, AType, {
+    MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      if (shape.ndim() == 2) {
+        Softmax<OP, negate, AType>(
+            ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
+            outputs[0].dptr<OType>(), shape.get<2>(), axis,
+            static_cast<DType>(temperature));
+      } else {
+        Softmax<OP, negate, AType>(
+            ctx.get_stream<xpu>(), inputs[0].dptr<DType>(),
+            outputs[0].dptr<OType>(), shape.get<3>(), axis,
+            static_cast<DType>(temperature));
+      }
+    });
   });
 }
 
@@ -324,17 +441,24 @@ void SoftmaxGradCompute(const nnvm::NodeAttrs& attrs,
   const double temperature = param.temperature.has_value() ?
     param.temperature.value() : 1.0;
   TShape shape = AxisShapeCompact(inputs[0].shape_, &axis, true);
-  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-    MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-      if (shape.ndim() == 2) {
-        SoftmaxGrad<OP1, OP2, Req, negate>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
-                                           inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
-                                           shape.get<2>(), axis, static_cast<DType>(temperature));
-      } else {
-        SoftmaxGrad<OP1, OP2, Req, negate>(ctx.get_stream<xpu>(), inputs[1].dptr<DType>(),
-                                           inputs[0].dptr<DType>(), outputs[0].dptr<DType>(),
-                                           shape.get<3>(), axis, static_cast<DType>(temperature));
-      }
+
+  int out_idx = softmax_has_dtype_override(attrs) ? 2 : 1;  // input slot named "output"
+
+  MXNET_REAL_ACC_TYPE_SWITCH(inputs[0].type_flag_, OType, AType, {
+    MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        if (shape.ndim() == 2) {
+          SoftmaxGrad<OP1, OP2, Req, negate, AType>(
+              ctx.get_stream<xpu>(), inputs[out_idx].dptr<OType>(),
+              inputs[0].dptr<OType>(), outputs[0].dptr<DType>(),
+              shape.get<2>(), axis, static_cast<DType>(temperature));
+        } else {
+          SoftmaxGrad<OP1, OP2, Req, negate, AType>(
+              ctx.get_stream<xpu>(), inputs[out_idx].dptr<OType>(),
+              inputs[0].dptr<OType>(), outputs[0].dptr<DType>(),
+              shape.get<3>(), axis, static_cast<DType>(temperature));
+        }
+      });
     });
   });
 }
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 81e775cac526..c88f738c356d 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -67,7 +67,7 @@ inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
 }
 #endif
 
-MXNET_OPERATOR_REGISTER_UNARY(softmax)
+NNVM_REGISTER_OP(softmax)
 .describe(R"code(Applies the softmax function.
 
 The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1.
@@ -102,15 +102,31 @@ Example::
 .set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxComputeExCPU)
 .set_attr<FInferStorageType>("FInferStorageType", SoftmaxStorageType)
 #endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmax"})
+.set_attr<nnvm::FGradient>("FGradient", SoftmaxFGradient{"_backward_softmax"})
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxOpType)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "The input array.")
 .add_arguments(SoftmaxParam::__FIELDS__());
 
-MXNET_OPERATOR_REGISTER_BINARY(_backward_softmax)
+NNVM_REGISTER_OP(_backward_softmax)
+.set_num_inputs(SoftmaxGradOpNumInputs)  // 2 inputs, or 3 with a dtype override
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", SoftmaxGradOpInputNames)
+.set_attr<nnvm::FInferShape>("FInferShape", SoftmaxGradOpShape)
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxGradOpType)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", SoftmaxGradOpInplaceOption)
+.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments")
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxGradCompute<cpu, op::mshadow_op::mul,
                                                         mxnet_op::softmax_bwd>);
 
-MXNET_OPERATOR_REGISTER_UNARY(softmin)
+NNVM_REGISTER_OP(softmin)
 .describe(R"code(Applies the softmin function.
 
 The resulting array contains elements in the range (0,1) and the elements along the given axis sum
@@ -141,15 +157,31 @@ Example::
     return std::vector<std::string>{"output"};
 })
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::softmax_fwd, true>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_softmin"})
+.set_attr<nnvm::FGradient>("FGradient", SoftmaxFGradient{"_backward_softmin"})
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxOpType)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "The input array.")
 .add_arguments(SoftmaxParam::__FIELDS__());
 
-MXNET_OPERATOR_REGISTER_BINARY(_backward_softmin)
+NNVM_REGISTER_OP(_backward_softmin)
+.set_num_inputs(SoftmaxGradOpNumInputs)  // 2 inputs, or 3 with a dtype override
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", SoftmaxGradOpInputNames)
+.set_attr<nnvm::FInferShape>("FInferShape", SoftmaxGradOpShape)
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxGradOpType)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", SoftmaxGradOpInplaceOption)
+.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments")
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxGradCompute<cpu, op::mshadow_op::mul,
                                                         mxnet_op::softmax_bwd, true>);
 
-MXNET_OPERATOR_REGISTER_UNARY(log_softmax)
+NNVM_REGISTER_OP(log_softmax)
 .describe(R"code(Computes the log softmax of the input.
 This is equivalent to computing softmax followed by log.
 
@@ -168,10 +200,26 @@ Examples::
 )code")
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxCompute<cpu, mxnet_op::log_softmax_fwd>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_log_softmax"})
+.set_attr<nnvm::FGradient>("FGradient", SoftmaxFGradient{"_backward_log_softmax"})
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxOpType)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.add_argument("data", "NDArray-or-Symbol", "The input array.")
 .add_arguments(SoftmaxParam::__FIELDS__());
 
-MXNET_OPERATOR_REGISTER_BINARY(_backward_log_softmax)
+NNVM_REGISTER_OP(_backward_log_softmax)
+.set_num_inputs(SoftmaxGradOpNumInputs)  // 2 inputs, or 3 with a dtype override
+.set_num_outputs(1)
+.set_attr<nnvm::FListInputNames>("FListInputNames", SoftmaxGradOpInputNames)
+.set_attr<nnvm::FInferShape>("FInferShape", SoftmaxGradOpShape)
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxGradOpType)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", SoftmaxGradOpInplaceOption)
+.add_argument("args", "NDArray-or-Symbol[]", "Positional input arguments")
 .set_attr_parser(ParamParser<SoftmaxParam>)
 .set_attr<FCompute>("FCompute<cpu>", SoftmaxGradCompute<cpu, mshadow_op::left,
                                                         mxnet_op::log_softmax_bwd>);
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 97c4fa55681c..28ed4215e0a7 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -1389,13 +1389,15 @@ void SliceLikeBackward(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 2U);
   CHECK_EQ(req.size(), 2U);
-  if (req[0] == kNullOp) return;
   using namespace mshadow;
   Stream<xpu>* s = ctx.get_stream<xpu>();
+  if (req[1] != kNullOp && req[1] != kAddTo) {
+    Fill(s, outputs[1], req[1], 0);  // Second input not relevant to gradients.
+  }
+  if (req[0] == kNullOp) return;
   const TBlob& ograd = inputs[0];
   const TBlob& igrad = outputs[0];
   const SliceLikeParam& param = nnvm::get<SliceLikeParam>(attrs.parsed);
-  Fill(s, outputs[1], req[1], 0);  // Second input not relavant to gradients.
   if (req[0] == kWriteTo) {
     Fill(s, igrad, req[0], 0);
   } else if (req[0] == kWriteInplace) {
diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py
index 8ff8a7e1436b..dad7bed3a923 100644
--- a/tests/python/unittest/test_contrib_optimizer.py
+++ b/tests/python/unittest/test_contrib_optimizer.py
@@ -94,6 +94,90 @@ def test_group_adagrad():
                 g_stype='row_sparse',
                 compare_states=False)
 
+def test_adamw():
+    shape = (3, 4)
+    weight = mx.nd.random.uniform(shape=shape)
+    weight_ref = weight.copy()
+    grad = mx.nd.random.uniform(shape=shape)
+    m = mx.nd.random.uniform(shape=shape)
+    v = mx.nd.random.uniform(shape=shape)
+    rescale_grad = mx.nd.array([10])
+    eta, lr, wd, epsilon = 1, 1, 0, 1e-8
+    beta1, beta2 = 0.9, 0.999
+    kwargs = {'eta': eta, 'lr': lr, 'wd': wd, 'epsilon': epsilon,
+              'beta1': beta1, 'beta2': beta2}
+
+    # update is skipped for rescale = 0
+    mx.nd.contrib.adamw_update(weight, grad, m, v,
+                               rescale_grad * 0, out=weight, **kwargs)
+    # weight remains unchanged
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+
+    # update is skipped for rescale = nan
+    mx.nd.contrib.adamw_update(weight, grad, m, v,
+                               rescale_grad * np.nan, out=weight, **kwargs)
+    # weight remains unchanged
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+
+    # update is skipped for rescale = inf
+    mx.nd.contrib.adamw_update(weight, grad, m, v,
+                               rescale_grad * np.inf, out=weight, **kwargs)
+    # weight remains unchanged
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+
+    # multi-precision update is skipped for rescale = nan
+    weight_fp16 = weight.astype('float16')
+    grad_fp16 = grad.astype('float16')
+    weight_fp16_ref = weight_fp16.copy()
+    mx.nd.contrib.mp_adamw_update(weight_fp16, grad_fp16, m, v, weight,
+                                  rescale_grad * np.nan, out=weight_fp16, **kwargs)
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+    mx.test_utils.assert_almost_equal(weight_fp16_ref.asnumpy(), weight_fp16.asnumpy())
+
+    # multi-precision update is skipped for rescale = inf
+    mx.nd.contrib.mp_adamw_update(weight_fp16, grad_fp16, m, v, weight,
+                                  rescale_grad * np.inf, out=weight_fp16, **kwargs)
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+    mx.test_utils.assert_almost_equal(weight_fp16_ref.asnumpy(), weight_fp16.asnumpy())
+
+    # multi-precision update is skipped for rescale = 0
+    mx.nd.contrib.mp_adamw_update(weight_fp16, grad_fp16, m, v, weight,
+                                  rescale_grad * 0, out=weight_fp16, **kwargs)
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight.asnumpy())
+    mx.test_utils.assert_almost_equal(weight_fp16_ref.asnumpy(), weight_fp16.asnumpy())
+
+    # reference normal update
+    grad_rescale = rescale_grad * grad
+    m_ref = beta1*m + (1-beta1)*grad_rescale
+    v_ref = beta2*v + (1-beta2)*(grad_rescale**2)
+    weight_ref = weight - eta * (1 * m_ref / (v_ref.sqrt() + epsilon) + weight * wd)
+    m_test = m.copy()
+    v_test = v.copy()
+    weight_test = weight.copy()
+    # op normal update
+    mx.nd.contrib.adamw_update(weight_test, grad, m_test, v_test,
+                               rescale_grad, out=weight_test, **kwargs)
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight_test.asnumpy())
+    mx.test_utils.assert_almost_equal(m_ref.asnumpy(), m_test.asnumpy())
+    mx.test_utils.assert_almost_equal(v_ref.asnumpy(), v_test.asnumpy())
+
+    # reference normal multi-precision update
+    m_fp32 = m.copy()
+    v_fp32 = v.copy()
+    weight_fp32 = weight.copy()
+    grad_rescale = rescale_grad * grad_fp16.astype('float32')
+    m_ref = beta1*m_fp32 + (1-beta1)*grad_rescale
+    v_ref = beta2*v_fp32 + (1-beta2)*(grad_rescale**2)
+    weight_ref = weight - eta * (1 * m_ref / (v_ref.sqrt() + epsilon) + weight * wd)
+    weight_fp16_ref = weight_ref.astype('float16')
+    # op normal multi-precision update
+    mx.nd.contrib.mp_adamw_update(weight_fp16, grad_fp16, m_fp32, v_fp32, weight_fp32,
+                                  rescale_grad, out=weight_fp16, **kwargs)
+    mx.test_utils.assert_almost_equal(m_ref.asnumpy(), m_fp32.asnumpy())
+    mx.test_utils.assert_almost_equal(v_ref.asnumpy(), v_fp32.asnumpy())
+    mx.test_utils.assert_almost_equal(weight_ref.asnumpy(), weight_fp32.asnumpy())
+    mx.test_utils.assert_almost_equal(weight_fp16_ref.asnumpy(), weight_fp16.asnumpy())
+
 
 if __name__ == '__main__':
     import nose
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index ae38a2297ded..36c1993bf0ff 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -917,6 +917,20 @@ def sym_gen(_):
     assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == 2 * batch_size)
 
 
+def test_module_update_no_pragram():
+    # test module to do update on layers without params
+    data_shape = (10, 10)
+    data = mx.sym.Variable('data')
+    out = mx.sym.Dropout(data, 0.5)
+    mod = mx.mod.Module(out)
+    mod.bind(data_shapes=[('data', data_shape)])
+    mod.init_params()
+    mod.init_optimizer()
+    data_batch = mx.io.DataBatch([nd.ones(data_shape)])
+    mod.forward_backward(data_batch)
+    mod.update()
+    assert(mod.get_outputs()[0].shape == data_shape)
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index a9b9cc8cf704..ae7dc86d566c 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -4534,6 +4534,47 @@ def softmax_forward(input_data, true_output):
     softmax_forward(mx.nd.array([[[[-3.4e38,-3.4e38]]]]), np.array([1.0,1.0]))
     softmax_forward(mx.nd.array([[[[3.4e38,3.4e38]]]]), np.array([1.0,1.0]))
 
+@with_seed()
+def test_softmax_dtype():
+    def check_dtypes_almost_equal(op_name,
+                                  atol, rtol,
+                                  grad_atol, grad_rtol,
+                                  idtype, ref_dtype, odtype=None):
+        op = getattr(mx.nd, op_name)
+        input_data = mx.random.uniform(shape=(100, 500))
+        dtype_input = input_data.astype(idtype)
+        ref_input = input_data.astype(ref_dtype)
+        dtype_input.attach_grad()
+        ref_input.attach_grad()
+        with mx.autograd.record():
+            dtype_softmax = op(dtype_input, axis=-1, dtype=odtype)
+            ref_softmax = op(ref_input, axis=-1, dtype=odtype)
+        dtype_softmax_np = dtype_softmax.asnumpy()
+        ref_softmax_np = ref_softmax.asnumpy()
+        assert_almost_equal(dtype_softmax_np, ref_softmax_np, rtol=rtol, atol=atol)
+        dtype_softmax.backward()
+        ref_softmax.backward()
+        dtype_grad_np = dtype_input.grad.asnumpy()
+        ref_grad_np = ref_input.grad.asnumpy()
+        assert_almost_equal(dtype_grad_np, ref_grad_np, rtol=grad_rtol, atol=grad_atol)
+
+    check_dtypes_almost_equal('softmax', 1e-5, 1e-5, 1e-5, 1e-5, 'float16', 'float32')
+    check_dtypes_almost_equal('softmax', 1e-5, 1e-5, 1e-5, 1e-5, 'float16', 'float32', 'float32')
+    check_dtypes_almost_equal('softmax', 1e-5, 1e-5, 1e-5, 1e-5, 'float32', 'float64')
+    check_dtypes_almost_equal('softmax', 1e-5, 1e-5, 1e-5, 1e-5, 'float32', 'float64', 'float64')
+    check_dtypes_almost_equal('softmin', 1e-5, 1e-5, 1e-5, 1e-5, 'float16', 'float32')
+    check_dtypes_almost_equal('softmin', 1e-5, 1e-5, 1e-5, 1e-5, 'float16', 'float32', 'float32')
+    check_dtypes_almost_equal('softmin', 1e-5, 1e-5, 1e-5, 1e-5, 'float32', 'float64')
+    check_dtypes_almost_equal('softmin', 1e-5, 1e-5, 1e-5, 1e-5, 'float32', 'float64', 'float64')
+    check_dtypes_almost_equal('log_softmax', 1e-2, 1e-2, 1e-2, 1e-2,
+                              'float16', 'float32')
+    check_dtypes_almost_equal('log_softmax', 1e-2, 1e-2, 1e-2, 1e-2,
+                              'float16', 'float32', 'float32')
+    check_dtypes_almost_equal('log_softmax', 1e-3, 1e-3, 1e-3, 1e-3,
+                              'float32', 'float64')
+    check_dtypes_almost_equal('log_softmax', 1e-3, 1e-3, 1e-3, 1e-3,
+                              'float32', 'float64', 'float64')
+
 @with_seed()
 def test_pick():
     def test_pick_helper(index_type=np.int32):
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index c5c1b018b081..ac4564b66fa0 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -120,6 +120,13 @@ def test_symbol_infer_type():
     assert out == [np.float32]
     assert aux == []
 
+    # partial infer type
+    arg, out, aux = mlp.infer_type_partial()
+    assert arg == [None, np.float32, np.float32, np.float32]
+    assert out == [np.float32]
+    assert aux == []
+
+
 def test_symbol_infer_shape():
     num_hidden = 128
     num_dim    = 64