apache · eric-haibin-lin · Dec 28, 2018 · Dec 25, 2018 · Dec 27, 2018 · Dec 27, 2018
diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py
@@ -27,14 +27,14 @@
 from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply)
 from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update,
                        mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update,
-                       signsgd_update, signum_update)
+                       signsgd_update, signum_update, adamw_update)
 from ..ndarray import sparse
 from ..random import normal
 
 __all__ = [
     'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LBSGD',
     'NAG', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', 'Signum',
-    'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register'
+    'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register', 'AdamW'
 ]
 
 
@@ -1018,6 +1018,70 @@ class ccSGD(SGD):
     def __init__(self, *args, **kwargs):
         super(ccSGD, self).__init__(*args, **kwargs)
 
+@register
+class AdamW(Optimizer):
+    """The Adam optimizer with fixed weight decay regularization.
+
+    This class implements the optimizer described in *Fixing Weight Decay
+    Regularization in Adam*, available at https://arxiv.org/abs/1711.05101.
+
+    Note that this is different from the original Adam optimizer which adds L2
+    regularization on the weights to the loss: it regularizes weights with large
+    gradients more than L2 regularization would, which was shown to yield better
+    training loss and generalization error in the paper above.
+
+    Updates are applied by::
+
+        rescaled_grad = clip(grad * rescale_grad, clip_gradient)
+        m = beta1 * m + (1 - beta1) * rescaled_grad
+        v = beta2 * v + (1 - beta2) * (rescaled_grad**2)
+        w = w - learning_rate * (m / (sqrt(v) + epsilon) + wd * w)
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    For details of the update algorithm, see :class:`~mxnet.ndarray.adamw_update`.
+
+    Parameters
+    ----------
+    beta1 : float, optional
+        Exponential decay rate for the first moment estimates.
+    beta2 : float, optional
+        Exponential decay rate for the second moment estimates.
+    epsilon : float, optional
+        Small value to avoid division by 0.
+    """
+    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+                 **kwargs):
+        super(AdamW, self).__init__(learning_rate=learning_rate, **kwargs)
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        return (zeros(weight.shape, weight.context, dtype=weight.dtype), #mean
+                zeros(weight.shape, weight.context, dtype=weight.dtype)) #variance
+
+    def update(self, index, weight, grad, state):
+        assert(isinstance(weight, NDArray))
+        assert(isinstance(grad, NDArray))
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        t = self._index_update_count[index]
+        coef1 = 1. - self.beta1**t
+        coef2 = 1. - self.beta2**t
+        lr *= math.sqrt(coef2)/coef1
+
+        kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon,
+                  'rescale_grad': self.rescale_grad}
+        if self.clip_gradient:
+            kwargs['clip_gradient'] = self.clip_gradient
+
+        mean, var = state
+        adamw_update(weight, grad, mean, var, out=weight, lr=lr, wd=wd, **kwargs)
+
 @register
 class Adam(Optimizer):
     """The Adam optimizer.

diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h
@@ -837,7 +837,10 @@ struct AdamParam : public dmlc::Parameter<AdamParam> {
   }
 };
 
-template<typename xpu>
+/*
+ * \brief adam and adam_w update. Set decoupled=True for adam_w.
+ */
+template<typename xpu, bool decoupled>
 inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
                        const OpContext &ctx,
                        const std::vector<TBlob> &inputs,
@@ -855,9 +858,12 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
     Tensor<xpu, 2, DType> var = inputs[3].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = outputs[0].FlatTo2D<xpu, DType>(s);
 
-    grad = scalar<DType>(param.rescale_grad) * grad +
-      scalar<DType>(param.wd) * weight;
-
+    if (decoupled) {
+      grad = scalar<DType>(param.rescale_grad) * grad;
+    } else {
+      grad = scalar<DType>(param.rescale_grad) * grad +
+        scalar<DType>(param.wd) * weight;
+    }
     if (param.clip_gradient >= 0.0f) {
       mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) *
           F<clip>(grad, DType(param.clip_gradient));
@@ -867,10 +873,18 @@ inline void AdamUpdate(const nnvm::NodeAttrs& attrs,
       mean = scalar<DType>(param.beta1)*mean + scalar<DType>(1.f-param.beta1) * grad;
       var = scalar<DType>(param.beta2)*var + scalar<DType>(1.f-param.beta2) * F<square>(grad);
     }
-    Assign(out, req[0],
-           weight -
-           scalar<DType>(param.lr) * mean /
-           (F<square_root>(var) + scalar<DType>(param.epsilon)));
+    if (decoupled) {
+      Assign(out, req[0],
+             weight -
+             scalar<DType>(param.lr) * (mean /
+             (F<square_root>(var) + scalar<DType>(param.epsilon)) +
+             (scalar<DType>(param.wd) * weight)));
+    } else {
+      Assign(out, req[0],
+             weight -
+             scalar<DType>(param.lr) * mean /
+             (F<square_root>(var) + scalar<DType>(param.epsilon)));
+    }
   });
 }
 

diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc
@@ -472,15 +472,16 @@ are 1st and 2nd order moment estimates (mean and variance).
 
 .. math::
 
- g_t = \nabla J(W_{t-1})\\
+ g_t = \nabla J(W_{t-1}) + wd W_{t-1}\\
  m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
  v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
  W_t = W_{t-1} - \alpha \frac{ m_t }{ \sqrt{ v_t } + \epsilon }
 
 It updates the weights using::
 
- m = beta1*m + (1-beta1)*grad
- v = beta2*v + (1-beta2)*(grad**2)
+ g = grad + wd*w
+ m = beta1*m + (1-beta1)*g
+ v = beta2*v + (1-beta2)*(g**2)
  w += - learning_rate * m / (sqrt(v) + epsilon)
 
 However, if grad's storage type is ``row_sparse``, ``lazy_update`` is True and the storage
@@ -507,14 +508,50 @@ only the row slices whose indices appear in grad.indices are updated (for w, m a
   [](const nnvm::NodeAttrs& attrs) {
     return std::vector<uint32_t>{2, 3};
   })
-.set_attr<FCompute>("FCompute<cpu>", AdamUpdate<cpu>)
+.set_attr<FCompute>("FCompute<cpu>", AdamUpdate<cpu, false>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", AdamUpdateEx<cpu>)
 .add_argument("weight", "NDArray-or-Symbol", "Weight")
 .add_argument("grad", "NDArray-or-Symbol", "Gradient")
 .add_argument("mean", "NDArray-or-Symbol", "Moving mean")
 .add_argument("var", "NDArray-or-Symbol", "Moving variance")
 .add_arguments(AdamParam::__FIELDS__());
 
+NNVM_REGISTER_OP(adamw_update)
+.describe(R"code(Update function for AdamW optimizer. AdamW is seen as a modification of
+Adam by decoupling the weight decay from the optimization steps taken w.r.t. the loss function.
+
+Adam update consists of the following steps, where g represents gradient and m, v
+are 1st and 2nd order moment estimates (mean and variance).
+
+.. math::
+
+ g_t = \nabla J(W_{t-1})\\
+ m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t\\
+ v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2\\
+ W_t = W_{t-1} - \alpha (\frac{ m_t }{ \sqrt{ v_t } + \epsilon } + wd W_{t-1})
+
+It updates the weights using::
+
+ m = beta1*m + (1-beta1)*grad
+ v = beta2*v + (1-beta2)*(grad**2)
+ w += - learning_rate * (m / (sqrt(v) + epsilon) + w*wd)
+
+)code" ADD_FILELINE)
+.set_num_inputs(4)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<AdamParam>)
+.set_attr<nnvm::FInferShape>("FInferShape", ElemwiseShape<4, 1>)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<4, 1>)
+.set_attr<nnvm::FMutateInputs>("FMutateInputs",
+  [](const nnvm::NodeAttrs& attrs) {
+    return std::vector<uint32_t>{2, 3};
+  })
+.set_attr<FCompute>("FCompute<cpu>", AdamUpdate<cpu, true>)
+.add_argument("weight", "NDArray-or-Symbol", "Weight")
+.add_argument("grad", "NDArray-or-Symbol", "Gradient")
+.add_argument("mean", "NDArray-or-Symbol", "Moving mean")
+.add_argument("var", "NDArray-or-Symbol", "Moving variance")
+.add_arguments(AdamParam::__FIELDS__());
 
 NNVM_REGISTER_OP(rmsprop_update)
 .describe(R"code(Update function for `RMSProp` optimizer.

diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu
@@ -246,9 +246,12 @@ NNVM_REGISTER_OP(ftml_update)
 .set_attr<FCompute>("FCompute<gpu>", FTMLUpdate<gpu>);
 
 NNVM_REGISTER_OP(adam_update)
-.set_attr<FCompute>("FCompute<gpu>", AdamUpdate<gpu>)
+.set_attr<FCompute>("FCompute<gpu>", AdamUpdate<gpu, false>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", AdamUpdateEx<gpu>);
 
+NNVM_REGISTER_OP(adamw_update)
+.set_attr<FCompute>("FCompute<gpu>", AdamUpdate<gpu, true>);
+
 NNVM_REGISTER_OP(rmsprop_update)
 .set_attr<FCompute>("FCompute<gpu>", RMSPropUpdate<gpu>);
 

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
@@ -506,12 +506,11 @@ def test_ftml():
 class PyAdam(mx.optimizer.Optimizer):
     """python reference implemenation of adam"""
     def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
-                 decay_factor=(1 - 1e-8), lazy_update=True, **kwargs):
+                 lazy_update=True, **kwargs):
         super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs)
         self.beta1 = beta1
         self.beta2 = beta2
         self.epsilon = epsilon
-        self.decay_factor = decay_factor
         self.lazy_update = lazy_update
 
     def create_state(self, index, weight):
@@ -614,6 +613,92 @@ def test_adam():
                                           dtype, w_stype='default', g_stype='row_sparse',
                                           rtol=1e-4, atol=2e-5)
 
+# ADAMW
+class PyAdamW(mx.optimizer.Optimizer):
+    """python reference implemenation of AdamW"""
+    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+                 **kwargs):
+        super(PyAdamW, self).__init__(learning_rate=learning_rate, **kwargs)
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+
+    def create_state(self, index, weight):
+        """Create additional optimizer state: mean, variance
+
+        Parameters
+        ----------
+        weight : NDArray
+        The weight data
+
+        """
+        return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
+                mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype))  # variance
+
+    def update(self, index, weight, grad, state):
+        """Update the parameters.
+
+        Parameters
+        ----------
+        index : int
+        An unique integer key used to index the parameters
+
+        weight : NDArray
+        weight ndarray
+
+        grad : NDArray
+        grad ndarray
+
+        state : NDArray or other objects returned by init_state
+        The auxiliary state used in optimization.
+        """
+        lr = self._get_lr(index)
+        self._update_count(index)
+
+        t = self._index_update_count[index]
+        mean, variance = state
+
+        wd = self._get_wd(index)
+        coef1 = 1. - self.beta1**t
+        coef2 = 1. - self.beta2**t
+        lr *= math.sqrt(coef2)/coef1
+
+        grad *= self.rescale_grad
+        # clip gradients
+        if self.clip_gradient is not None:
+            mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient, out=grad)
+        # update mean
+        mean *= self.beta1
+        mean += grad * (1. - self.beta1)
+        # update variance
+        variance *= self.beta2
+        variance += (1 - self.beta2) * mx.nd.square(grad, out=grad)
+        # update weight
+        weight -= lr * (mean/(mx.nd.sqrt(variance) + self.epsilon) + wd * weight)
+
+@with_seed()
+def test_adamw():
+    opt1 = PyAdamW
+    opt2 = mx.optimizer.AdamW
+    shape = (3, 4, 5)
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
+    for dtype in [np.float16, np.float32, np.float64]:
+        for cg_option in cg_options:
+            for rg_option in rg_options:
+                for wd_option in wd_options:
+                    for mp_option in mp_options:
+                        kwarg = {}
+                        kwarg.update(cg_option)
+                        kwarg.update(rg_option)
+                        kwarg.update(wd_option)
+                        kwarg.update(mp_option)
+                        if (dtype == np.float16 and
+                            ('multi_precision' not in kwarg or not kwarg['multi_precision'])):
+                            continue
+                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
 
 # AdaMax
 class PyAdamax(mx.optimizer.Optimizer):