diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
index 442a11d02200..d7b6821ac8cc 100644
--- a/python/mxnet/optimizer/optimizer.py
+++ b/python/mxnet/optimizer/optimizer.py
@@ -637,6 +637,8 @@ class FTML(Optimizer):
         z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight
         weight = - z / d_t
 
+    For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`.
+
     This optimizer accepts the following parameters in addition to those accepted
     by :class:`.Optimizer`.
 
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 334b7d4c0fdb..36f7897acaf3 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import itertools
 import numpy as np
 import mxnet as mx
 import mxnet.lr_scheduler as lr_scheduler
@@ -976,8 +977,8 @@ def update(self, index, weight, grad, state):
             div = grad / mx.nd.sqrt(history + self.float_stable_eps)
             weight[:] += (div + weight * wd) * -lr
 
+@with_seed()
 def test_adagrad():
-    mx.random.seed(0)
     opt1 = PyAdaGrad
     opt2 = mx.optimizer.AdaGrad
     shape = (3, 4, 5)
@@ -1002,6 +1003,76 @@ def test_adagrad():
                     compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape,
                                       dtype, g_stype='row_sparse')
 
+# AdaDelta
+class PyAdaDelta(mx.optimizer.Optimizer):
+    """The python reference of AdaDelta optimizer.
+
+    This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive
+    learning rate method*, available at https://arxiv.org/abs/1212.5701.
+
+    This optimizer updates each weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
+        cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad
+        acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
+        weight -= (cur_delta + wd * weight)
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    rho: float
+        Decay rate for both squared gradients and delta.
+    epsilon : float
+        Small value to avoid division by 0.
+    """
+    def __init__(self, rho=0.90, epsilon=1e-5, **kwargs):
+        super(PyAdaDelta, self).__init__(**kwargs)
+        self.rho = rho
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context),
+                mx.nd.zeros(weight.shape, weight.context))
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        wd = self._get_wd(index)
+
+        grad *= self.rescale_grad
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        acc_grad, acc_delta = state
+
+        acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2
+        current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) /
+                         mx.nd.sqrt(acc_grad + self.epsilon)) * grad
+        acc_delta[:] = self.rho * acc_delta + (1. - self.rho) * current_delta ** 2
+
+        # update weight
+        weight[:] -= current_delta + wd * weight
+
+@with_seed()
+def test_adadelta():
+    opt1 = PyAdaDelta
+    opt2 = mx.optimizer.AdaDelta
+    shape = (3, 4, 5)
+    rho_options = [{'rho': 0.9}]
+    eps_options = [{}, {'epsilon': 1e-8}]
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.0}]
+    for dtype in [np.float16, np.float32]:
+        for params in itertools.product(rho_options, eps_options, cg_options,
+                                        rg_options, wd_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if dtype is np.float16:
+                kwarg.update({'multi_precision': True})
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
+
 def test_factor_scheduler():
     base_lr = 1
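# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): a minimal, self-contained
# NumPy walk-through of the AdaDelta update rule quoted in PyAdaDelta's
# docstring, assuming wd == 0 and no gradient clipping. The toy weight/grad
# values below are hypothetical; only the update formulas come from the diff.
# ---------------------------------------------------------------------------
import numpy as np

rho, epsilon = 0.90, 1e-5                  # PyAdaDelta's default hyperparameters
weight = np.array([1.0, -2.0, 0.5])        # toy parameters (hypothetical)
grad = np.array([0.1, -0.3, 0.2])          # toy gradient, already rescaled

acc_grad = np.zeros_like(weight)           # running average of squared gradients
acc_delta = np.zeros_like(weight)          # running average of squared updates

# One AdaDelta step, following the docstring line by line (wd term dropped):
acc_grad = rho * acc_grad + (1. - rho) * grad ** 2
cur_delta = np.sqrt(acc_delta + epsilon) / np.sqrt(acc_grad + epsilon) * grad
acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2
weight -= cur_delta

print(weight)                              # updated parameters after one step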