Add seed management to python tests (#9791)

* Adds with_seed() decorator to unittests. Will fail CI due to demo of known failing seeds.
* Fix failing tests, remove hard-coded bad seeds.
* Adds with_seed() decorator to gpu tests. Will fail CI due to demo of known failing seeds.
* Fix failing test, add test of 'with random_seed()'
* Add with_seed() to test_gluon_model_zoo_gpu.py
* Increase atol for test_adam
* Added more 'with random_seed()' testing. Hardcode bad test_training seed.
* test_training put back on random seeding
* Switched test_dropout to fixed seed, created issue
* Hardcoding seed that caused a test_rsp_push_pull CI core-dump
* test_rsp_push_pull CI core-dump not reproduced, so unsetting seed
apache · Feb 18, 2018 · f33591f · f33591f
1 parent 44ead24
commit f33591f
Show file tree

Hide file tree

Showing 25 changed files with 994 additions and 173 deletions.
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
@@ -49,8 +49,6 @@
 from .ndarray import array
 from .symbol import Symbol
 
-_rng = np.random.RandomState(1234)
-
 
 def default_context():
     """Get default context for regression test."""
@@ -844,7 +842,7 @@ def random_projection(shape):
         """
         # random_projection should not have elements too small,
         # otherwise too much precision is lost in numerical gradient
-        plain = _rng.rand(*shape) + 0.1
+        plain = np.random.rand(*shape) + 0.1
         return plain
 
     location = _parse_location(sym=sym, location=location, ctx=ctx, dtype=dtype)
@@ -876,8 +874,9 @@ def random_projection(shape):
     location = dict(list(location.items()) +
                     [("__random_proj", mx.nd.array(random_projection(out_shape[0]),
                                                    ctx=ctx, dtype=dtype))])
-    args_grad_npy = dict([(k, _rng.normal(0, 0.01, size=location[k].shape)) for k in grad_nodes]
-                         + [("__random_proj", _rng.normal(0, 0.01, size=out_shape[0]))])
+    args_grad_npy = dict([(k, np.random.normal(0, 0.01, size=location[k].shape))
+                          for k in grad_nodes]
+                         + [("__random_proj", np.random.normal(0, 0.01, size=out_shape[0]))])
 
     args_grad = {k: mx.nd.array(v, ctx=ctx, dtype=dtype) for k, v in args_grad_npy.items()}
     if grad_stype_dict is not None:
@@ -1068,7 +1067,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
     if isinstance(expected, (list, tuple)):
         expected = {k:v for k, v in zip(sym.list_arguments(), expected)}
 
-    args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()}
+    args_grad_npy = {k:np.random.normal(size=v.shape) for k, v in expected.items()}
     args_grad_data = {}
     for k, v in args_grad_npy.items():
         nd = mx.nd.array(v, ctx=ctx, dtype=dtype)
@@ -1162,7 +1161,7 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole",
         grad_req = 'write'
     if location is None:
         exe = sym.simple_bind(grad_req=grad_req, ctx=ctx, **kwargs)
-        location = {k: _rng.normal(size=arr.shape, scale=1.0) for k, arr in
+        location = {k: np.random.normal(size=arr.shape, scale=1.0) for k, arr in
                     exe.arg_dict.items()}
     else:
         assert isinstance(location, dict), "Expect dict, get \"location\"=%s" %str(location)

diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
@@ -15,10 +15,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import sys
 import os
 import numpy as np
 import mxnet as mx
 from mxnet.test_utils import *
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../unittest'))
+from common import setup_module, with_seed
 from mxnet.gluon import utils
 
 def _get_model():
@@ -51,6 +55,7 @@ def _get_data(shape):
                    path='data/inception-v3-dump.npz',
                    sha1_hash=hash_inception_v3)
 
+@with_seed()
 def test_consistency(dump=False):
     shape = (299, 299)
     _get_model()

diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -23,7 +23,11 @@
 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.test_utils import assert_almost_equal
 import sys
+import os
 import unittest
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../unittest'))
+from common import setup_module, with_seed
 
 def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
@@ -34,6 +38,7 @@ def download_data():
         'http://data.mxnet.io/data/val-5k-256.rec', VAL_DATA)
 
 @unittest.skip("test fails intermittently. temporarily disabled.")
+@with_seed()
 def test_inference():
     all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3',
                   'densenet201', 'squeezenet1.0', 'mobilenet0.25']
@@ -91,6 +96,10 @@ def get_nn_model(name):
     else:
         return get_model(name)
 
+# Seed 1521019752 produced a failure on the Py2 MKLDNN-GPU CI runner
+# on 2/16/2018 that was not reproducible.  Problem could be timing related or
+# based on non-deterministic algo selection.
+@with_seed()
 def test_training():
     # We use network models without dropout for testing.
     # TODO(zhengda) mobilenet can't pass this test even without MKLDNN.

diff --git a/tests/python/gpu/test_kvstore_gpu.py b/tests/python/gpu/test_kvstore_gpu.py
@@ -16,10 +16,15 @@
 # under the License.
 
 # pylint: skip-file
+import sys
+import os
 import mxnet as mx
 import numpy as np
 import unittest
 from mxnet.test_utils import assert_almost_equal, default_context
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../unittest'))
+from common import setup_module, with_seed
 
 shape = (4, 4)
 keys = [5, 7, 11]
@@ -35,7 +40,9 @@ def init_kv_with_str(stype='default', kv_type='local'):
     kv.init(str_keys, [mx.nd.zeros(shape=shape, stype=stype)] * len(keys))
     return kv
 
-
+# Test seed 89411477 (module seed 1829754103) resulted in a py3-gpu CI runner core dump.
+# Not reproducible, so this test is back on random seeds.
+@with_seed()
 def test_rsp_push_pull():
     def check_rsp_push_pull(kv_type, is_push_cpu=True):
         kv = init_kv_with_str('row_sparse', kv_type)