From 8d2208dccf57a066cd86ad6f30f363ed6760ddb1 Mon Sep 17 00:00:00 2001
From: Denisa Roberts
Date: Thu, 29 Nov 2018 15:58:34 -0500
Subject: [PATCH 1/3] Add a test for AdaMax optimizer

---
 tests/python/unittest/test_optimizer.py | 83 ++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 334b7d4c0fdb..7d370bf3b62a 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -501,7 +501,6 @@ def test_ftml():
 
 
 # ADAM
-
 class PyAdam(mx.optimizer.Optimizer):
     """python reference implemenation of adam"""
     def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
@@ -613,6 +612,88 @@ def test_adam():
                               dtype, w_stype='default', g_stype='row_sparse',
                               rtol=1e-4, atol=2e-5)
 
+
+# AdaMax
+class PyAdamax(mx.optimizer.Optimizer):
+    """The python reference of AdaMax optimizer.
+
+    This class implements the AdaMax optimizer, a variant of Adam based on the infinity norm,
+    available at http://arxiv.org/abs/1412.6980 Section 7.
+
+    The optimizer updates the weight by::
+
+        grad = clip(grad * rescale_grad + wd * weight, clip_gradient)
+        m = beta1 * m_t + (1 - beta1) * grad
+        u = maximum(beta2 * u, abs(grad))
+        weight -= lr / (1 - beta1**t) * m / u
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    beta1 : float, optional
+        Exponential decay rate for the first moment estimates.
+    beta2 : float, optional
+        Exponential decay rate for the second moment estimates.
+    """
+    def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs):
+        super(PyAdamax, self).__init__(learning_rate=learning_rate, **kwargs)
+        self.beta1 = beta1
+        self.beta2 = beta2
+
+    def create_state(self, index, weight):
+        return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype),  # mean
+                mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype))  # variance
+
+    def update(self, index, weight, grad, state):
+        self._update_count(index)
+        lr = self._get_lr(index)
+        wd = self._get_wd(index)
+
+        t = self._index_update_count[index]
+        lr /= (1. - self.beta1**t)
+
+        # preprocess grad
+        grad = grad * self.rescale_grad + wd * weight
+        if self.clip_gradient is not None:
+            grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient)
+
+        # update m_t and u_t
+        m_t, u_t = state
+        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
+        u_t[:] = mx.nd.maximum(self.beta2 * u_t, mx.nd.abs(grad))
+
+        # update weight
+        weight[:] -= lr * m_t / u_t
+
+
+@with_seed()
+def test_adamax():
+    opt1 = PyAdamax
+    opt2 = mx.optimizer.Adamax
+    shape = (3, 4, 5)
+    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
+    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
+    for dtype in [np.float16, np.float32, np.float64]:
+        for cg_option in cg_options:
+            for rg_option in rg_options:
+                for wd_option in wd_options:
+                    for mp_option in mp_options:
+                        kwarg = {}
+                        kwarg.update(cg_option)
+                        kwarg.update(rg_option)
+                        kwarg.update(wd_option)
+                        kwarg.update(mp_option)
+                        if (dtype == np.float16 and
+                                ('multi_precision' not in kwarg or
+                                 not kwarg['multi_precision'])):
+                            continue
+                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
+                                          rtol=1e-4, atol=2e-5)
+
+
 # Signum
 class PySignum(mx.optimizer.Optimizer):
     """The python reference of Signum optimizer.
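The PyAdamax docstring in PATCH 1/3 spells out the AdaMax step in pseudocode. The snippet below is a minimal NumPy-only sketch of a single step with made-up values; it is an illustration accompanying the patch, not part of it. With zero-initialized state and t = 1 the update reduces exactly to weight -= lr * sign(grad).

    import numpy as np

    # One AdaMax step on a toy weight vector, mirroring the docstring pseudocode:
    #   m = beta1 * m + (1 - beta1) * grad
    #   u = maximum(beta2 * u, abs(grad))
    #   weight -= lr / (1 - beta1**t) * m / u
    lr, beta1, beta2, t = 0.002, 0.9, 0.999, 1
    weight = np.array([0.5, -0.3, 0.1])
    grad = np.array([0.01, -0.2, 0.05])
    m = np.zeros_like(weight)   # first moment
    u = np.zeros_like(weight)   # exponentially weighted infinity norm

    m = beta1 * m + (1. - beta1) * grad
    u = np.maximum(beta2 * u, np.abs(grad))
    weight -= lr / (1. - beta1 ** t) * m / u
    print(weight)  # [0.498, -0.298, 0.098]: each coordinate moved by lr against the gradient sign
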
From 560cfc3293d52b7b3e0c30679838128459000c7f Mon Sep 17 00:00:00 2001
From: Denisa Roberts
Date: Thu, 29 Nov 2018 17:29:05 -0500
Subject: [PATCH 2/3] Modify nested for loop with itertools.product and left tolerance to default

---
 tests/python/unittest/test_optimizer.py | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 7d370bf3b62a..14dd20ad8b9c 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -16,6 +16,7 @@
 # under the License.
 
 import numpy as np
+import itertools
 import mxnet as mx
 import mxnet.lr_scheduler as lr_scheduler
 from mxnet import gluon
@@ -677,21 +678,13 @@ def test_adamax():
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
     mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
     for dtype in [np.float16, np.float32, np.float64]:
-        for cg_option in cg_options:
-            for rg_option in rg_options:
-                for wd_option in wd_options:
-                    for mp_option in mp_options:
-                        kwarg = {}
-                        kwarg.update(cg_option)
-                        kwarg.update(rg_option)
-                        kwarg.update(wd_option)
-                        kwarg.update(mp_option)
-                        if (dtype == np.float16 and
-                                ('multi_precision' not in kwarg or
-                                 not kwarg['multi_precision'])):
-                            continue
-                        compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype,
-                                          rtol=1e-4, atol=2e-5)
+        for params in itertools.product(cg_options, rg_options, wd_options, mp_options):
+            kwarg = {k: v for param in params for k, v in param.items()}
+            if (dtype == np.float16 and
+                    ('multi_precision' not in kwarg or
+                     not kwarg['multi_precision'])):
+                continue
+            compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype)
 
 
 # Signum
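The refactor in PATCH 2/3 replaces the four nested option loops with itertools.product over the option lists and a dict comprehension that merges each chosen combination into a single keyword-argument dict. The short standalone sketch below uses hypothetical option lists (it is not part of the patch) to show what that merge produces.

    import itertools

    # Hypothetical option lists, in the same shape as cg_options / wd_options in the test.
    cg_options = [{}, {'clip_gradient': 0.4}]
    wd_options = [{}, {'wd': 0.03}]

    for params in itertools.product(cg_options, wd_options):
        # Merge the chosen option dicts into one kwargs dict,
        # the same pattern as the comprehension in test_adamax.
        kwarg = {k: v for param in params for k, v in param.items()}
        print(kwarg)
    # Prints the four combinations:
    # {}, {'wd': 0.03}, {'clip_gradient': 0.4}, {'clip_gradient': 0.4, 'wd': 0.03}
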
From 2378b11567e8c39964143c28d1382a55099dfd99 Mon Sep 17 00:00:00 2001
From: Denisa Roberts
Date: Tue, 4 Dec 2018 08:22:24 -0500
Subject: [PATCH 3/3] Trigger

---
 tests/python/unittest/test_optimizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 14dd20ad8b9c..b03dcdcfba44 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -618,7 +618,7 @@ def test_adam():
 class PyAdamax(mx.optimizer.Optimizer):
     """The python reference of AdaMax optimizer.
 
-    This class implements the AdaMax optimizer, a variant of Adam based on the infinity norm,
+    This class implements the AdaMax optimizer, one variant of Adam based on the infinity norm,
     available at http://arxiv.org/abs/1412.6980 Section 7.
 
     The optimizer updates the weight by::
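For context, the optimizer validated by this test series ships as mx.optimizer.Adamax and, as far as the registry naming convention goes, is reachable under the string name 'adamax', so it can also be exercised end to end through the Gluon training API. The sketch below is a rough usage example with a throwaway one-layer model; it is illustrative only and not part of the patches.

    import mxnet as mx
    from mxnet import gluon

    # Tiny stand-in model, defined here only so the sketch is self-contained.
    net = gluon.nn.Dense(1)
    net.initialize()

    # 'adamax' resolves to mx.optimizer.Adamax; the learning rate is set explicitly here.
    trainer = gluon.Trainer(net.collect_params(), 'adamax', {'learning_rate': 0.002})

    data = mx.nd.random.uniform(shape=(8, 4))
    label = mx.nd.random.uniform(shape=(8, 1))
    loss_fn = gluon.loss.L2Loss()

    with mx.autograd.record():
        loss = loss_fn(net(data), label)
    loss.backward()
    trainer.step(batch_size=data.shape[0])  # applies one AdaMax update to the parameters
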