diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 6dc04f7c729c..3dc80815d965 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 6dc04f7c729cd5c6c6210d5d4d2026a26ce0bfbf
+Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index ce61beb125d3..146836c28459 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -4000,6 +4000,48 @@ def test_cast():
             assert_almost_equal(exe.grad_arrays[0].asnumpy(), X.astype(dsttype).astype(srctype), rtol=1e-3, atol=1e-5)
 
 
+# Checks Cast float32->float16 against numpy; requires all platforms to round with the same round-to-nearest-even policy.
+@with_seed()
+def test_cast_float32_to_float16():
+    FP16_FRACTION_BITS = 10  # explicit fraction (mantissa) bits of float16
+    FP32_FRACTION_BITS = 23  # explicit fraction (mantissa) bits of float32
+    FP32_EXP_MIN = -126  # smallest exponent of a normal float32
+    FP32_EXP_MAX = 127  # largest exponent of a finite float32
+    # Generate test cases near representable float16 mantissas and mid-way between them, over the
+    # full range of float32 exponents (plus one step beyond the subnormal and overflow limits).
+    def get_data():
+        for sign_bit in [0, 1]:
+            for exponent in range(FP32_EXP_MIN - FP32_FRACTION_BITS - 1, FP32_EXP_MAX + 2):
+                denominator = 2**(FP16_FRACTION_BITS + 1)  # half-steps of the float16 fraction
+                for numerator in range(0, denominator):
+                    fraction = numerator / float(denominator)
+                    for y in [-1.0, 0.0, 1.0]:
+                        small_delta = y / 2**FP32_FRACTION_BITS  # one float32 fraction step below/at/above
+                        val = (-1.0)**sign_bit * 2.0**exponent * (1.0 + fraction + small_delta)
+                        yield val
+        # Finish with np.nan so that NaN propagation through the cast is checked as well
+        yield np.nan
+
+    input_np = np.array(list(get_data())).astype(np.float32)
+    # The intermediate cast to np.float64 below works around a numpy rounding bug that is fixed
+    # as of numpy 1.17 by PR https://github.com/numpy/numpy/pull/12722
+    expected_output = input_np.astype(np.float64).astype(np.float16)
+
+    x = mx.sym.Variable('x', dtype=np.float32)
+    sym = mx.sym.Cast(x, dtype=np.float16)
+    ctx = default_context()
+    exe = sym.bind(ctx, {'x' : mx.nd.array(input_np, dtype=np.float32, ctx=ctx)})
+    assert exe.arg_arrays[0].dtype == np.float32
+    assert exe.outputs[0].dtype == np.float16
+    exe.forward(is_train=False)
+    sym_output = exe.outputs[0].asnumpy()
+    for fp32_val, model_fp16_val, np_fp16_val in zip(input_np, sym_output, expected_output):  # NaN != NaN, hence isnan below
+        assert (model_fp16_val == np_fp16_val) or \
+               (np.isnan(model_fp16_val) and np.isnan(np_fp16_val)), \
+            'fp32->fp16 cast mismatch: with fp32 value {}, model_fp16 = {}, numpy_fp16 = {}'.format(
+                fp32_val, model_fp16_val, np_fp16_val)
+
+
 @with_seed()
 def test_repeat():
     def test_repeat_forward():