From 10d745843477b8277e46e93173c6f7bfde8eda63 Mon Sep 17 00:00:00 2001
From: Hao Jin
Date: Thu, 22 Mar 2018 10:52:44 -0700
Subject: [PATCH] [MXNET-101] Support float16 in LeakyReLU operator (#10169)

* support for any datatype in leaky ReLU
* test for LeakyReLU operators
* make lint
* clean up unnecessary prints
* fix for amalgamation build failure
* add InferType for Leaky ReLU and slight modification to the tests
---
 src/operator/leaky_relu-inl.h          | 136 ++++++++++++++++++-------
 src/operator/leaky_relu.cc             |  13 ++-
 src/operator/leaky_relu.cu             |   8 +-
 src/operator/mshadow_op.h              |  15 ++-
 src/operator/operator_tune.cc          |   4 +
 tests/python/unittest/test_operator.py |  71 +++++++++++++
 6 files changed, 201 insertions(+), 46 deletions(-)

diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index 77eba43155c8..c99280ac7eaf 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -34,8 +34,11 @@
 #include
 #include
 #include
+#include "../common/random_generator.h"
 #include "./operator_common.h"
 #include "./mshadow_op.h"
+#include "./random/sampler.h"
+#include "./random/sample_op.h"
 
 namespace mxnet {
 namespace op {
@@ -75,7 +78,7 @@ struct prelu_grad {
   }
 };
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class LeakyReLUOp : public Operator {
  public:
   explicit LeakyReLUOp(LeakyReLUParam param) {
@@ -92,25 +95,25 @@ class LeakyReLUOp : public Operator {
     size_t expected = param_.act_type == leakyrelu::kPReLU ? 2 : 1;
     CHECK_EQ(in_data.size(), expected);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> data;
-    Tensor<xpu, 3> out;
-    Tensor<xpu, 3> mask;
-    Tensor<xpu, 1> weight;
+    Tensor<xpu, 3, DType> data;
+    Tensor<xpu, 3, DType> out;
+    Tensor<xpu, 3, DType> mask;
+    Tensor<xpu, 1, DType> weight;
     int n = in_data[leakyrelu::kData].shape_[0];
     int k = in_data[leakyrelu::kData].shape_[1];
     Shape<3> dshape = Shape3(n, k, in_data[leakyrelu::kData].Size()/n/k);
-    data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
-    out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
-    if (param_.act_type == leakyrelu::kRReLU) {
-      mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, real_t>(dshape, s);
-    }
+    data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
+    out = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
     switch (param_.act_type) {
       case leakyrelu::kLeakyReLU: {
-        Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, param_.slope));
+        MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
+            s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(param_.slope));
+        });
         break;
       }
       case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
+        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
         if (weight.shape_.Size() == 1) {
           Assign(out, req[leakyrelu::kOut],
                  F<mshadow_op::xelu>(data, mshadow::expr::broadcast_scalar(weight, out.shape_)));
@@ -122,18 +125,43 @@
       }
       case leakyrelu::kRReLU: {
         if (ctx.is_train) {
-          Random<xpu>* prnd = ctx.requested[leakyrelu::kRandom].get_random<xpu, real_t>(s);
-          mask = prnd->uniform(mask.shape_);
-          mask = mask * (param_.upper_bound - param_.lower_bound) + param_.lower_bound;
-          Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, mask));
+          mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, DType>(dshape, s);
+          mxnet::op::UniformSampler<xpu> sampler;
+          Tensor<xpu, 1, DType> low, high;
+          mxnet::op::GetSamplingTempData(DType(0.0f), DType(1.0f), ctx, &low, &high);
+          mxnet::common::random::RandGenerator<xpu, DType> *pgen =
+            ctx.requested[0].get_parallel_random<xpu, DType>();
+          Tensor<xpu, 1, DType> out = mask.FlatTo1D();
+          sampler.Sample(low, high, out, pgen, s);
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::mul, Req>, xpu>::Launch(
+              s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
+              DType(param_.upper_bound - param_.lower_bound));
+          });
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kMask], Req, {
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::plus, Req>, xpu>::Launch(
+              s, mask.size(0) * mask.size(1) * mask.size(2), mask.dptr_, mask.dptr_,
+              DType(param_.lower_bound));
+          });
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
+              s, mask.size(0) * mask.size(1) * mask.size(2), out.dptr_, data.dptr_, mask.dptr_);
+          });
         } else {
           const float slope = (param_.lower_bound + param_.upper_bound) / 2.0f;
-          Assign(out, req[leakyrelu::kOut], F<mshadow_op::xelu>(data, slope));
+          MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::xelu, Req>, xpu>::Launch(
+              s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(slope));
+          });
         }
         break;
       }
       case leakyrelu::kELU: {
-        Assign(out, req[leakyrelu::kOut], F<mshadow_op::elu>(data, param_.slope));
+        MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+          mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::elu, Req>, xpu>::Launch(
+            s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_,
+            DType(param_.slope));
+        });
         break;
       }
       default:
@@ -155,33 +183,38 @@ class LeakyReLUOp : public Operator {
     CHECK_EQ(req.size(), expected);
     CHECK_EQ(in_data.size(), expected);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> output;
-    Tensor<xpu, 3> data;
-    Tensor<xpu, 3> gdata;
-    Tensor<xpu, 3> grad;
-    Tensor<xpu, 3> mask;
-    Tensor<xpu, 1> weight;
-    Tensor<xpu, 1> grad_weight;
+    Tensor<xpu, 3, DType> output;
+    Tensor<xpu, 3, DType> data;
+    Tensor<xpu, 3, DType> gdata;
+    Tensor<xpu, 3, DType> grad;
+    Tensor<xpu, 3, DType> mask;
+    Tensor<xpu, 1, DType> weight;
+    Tensor<xpu, 1, DType> grad_weight;
     int n = out_grad[leakyrelu::kOut].shape_[0];
     int k = out_grad[leakyrelu::kOut].shape_[1];
     Shape<3> dshape = Shape3(n, k, out_grad[leakyrelu::kOut].Size()/n/k);
-    grad = out_grad[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
-    gdata = in_grad[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
-    output = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, real_t>(dshape, s);
+    grad = out_grad[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
+    gdata = in_grad[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
+    output = out_data[leakyrelu::kOut].get_with_shape<xpu, 3, DType>(dshape, s);
     if (param_.act_type == leakyrelu::kRReLU) {
-      mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, real_t>(dshape, s);
+      mask = out_data[leakyrelu::kMask].get_with_shape<xpu, 3, DType>(dshape, s);
     }
     if (param_.act_type == leakyrelu::kPReLU) {
-      data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, real_t>(dshape, s);
+      data = in_data[leakyrelu::kData].get_with_shape<xpu, 3, DType>(dshape, s);
     }
     switch (param_.act_type) {
      case leakyrelu::kLeakyReLU: {
-        Assign(gdata, req[leakyrelu::kData], F<mshadow_op::xelu_grad>(output, param_.slope) * grad);
+        MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
+          mxnet_op::Kernel<mxnet_op::op_with_req<
+            mxnet_op::backward_grad<mshadow_op::xelu_grad>, Req>, xpu>::Launch(
+            s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
+            output.dptr_, DType(param_.slope));
+        });
        break;
      }
      case leakyrelu::kPReLU: {
-        weight = in_data[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
-        grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, real_t>(s);
+        weight = in_data[leakyrelu::kGamma].get<xpu, 1, DType>(s);
+        grad_weight = in_grad[leakyrelu::kGamma].get<xpu, 1, DType>(s);
        if (weight.shape_.Size() == 1) {
          Shape<4> gshape = Shape4(1, grad.shape_[0], grad.shape_[1], grad.shape_[2]);
          Assign(grad_weight, req[leakyrelu::kGamma],
@@ -204,7 +237,12 @@
         break;
       }
       case leakyrelu::kELU: {
-        Assign(gdata, req[leakyrelu::kData], F<mshadow_op::elu_grad>(output, param_.slope) * grad);
+        MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
+          mxnet_op::Kernel<mxnet_op::op_with_req<
+            mxnet_op::backward_grad<mshadow_op::elu_grad>, Req>, xpu>::Launch(
+            s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
+            output.dptr_, DType(param_.slope));
+        });
         break;
       }
       default:
@@ -217,7 +255,7 @@
 };  // class LeakyReLUOp
 
 template<typename xpu>
-Operator* CreateOp(LeakyReLUParam type);
+Operator* CreateOp(LeakyReLUParam type, int dtype);
 
 #if DMLC_USE_CXX11
 class LeakyReLUProp : public OperatorProperty {
@@ -256,6 +294,26 @@
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    int dtype = -1;
+    for (const int& type : *in_type) {
+      type_assign(&dtype, type);
+    }
+    for (const int& type : *out_type) {
+      type_assign(&dtype, type);
+    }
+
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      TYPE_ASSIGN_CHECK(*in_type, i, dtype);
+    }
+    for (size_t i = 0; i < out_type->size(); ++i) {
+      TYPE_ASSIGN_CHECK(*out_type, i, dtype);
+    }
+    return dtype != -1;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new LeakyReLUProp();
     ptr->param_ = param_;
@@ -338,7 +396,13 @@
     }
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   LeakyReLUParam param_;
diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc
index 6e6fa53ce6eb..99b6ba362f75 100644
--- a/src/operator/leaky_relu.cc
+++ b/src/operator/leaky_relu.cc
@@ -30,12 +30,17 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<cpu>(LeakyReLUParam param) {
-  return new LeakyReLUOp<cpu>(param);
+Operator *CreateOp<cpu>(LeakyReLUParam param, int dtype) {
+  Operator* op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new LeakyReLUOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator *LeakyReLUProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator *LeakyReLUProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                          std::vector<int> *in_type) const {
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(LeakyReLUParam);
diff --git a/src/operator/leaky_relu.cu b/src/operator/leaky_relu.cu
index 9de237c5734d..74b444d87597 100644
--- a/src/operator/leaky_relu.cu
+++ b/src/operator/leaky_relu.cu
@@ -29,8 +29,12 @@
 namespace mxnet {
 namespace op {
 template<>
-Operator *CreateOp<gpu>(LeakyReLUParam param) {
-  return new LeakyReLUOp<gpu>(param);
+Operator *CreateOp<gpu>(LeakyReLUParam param, int dtype) {
+  Operator* op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new LeakyReLUOp<gpu, DType>(param);
+  });
+  return op;
 }
 
 }  // namespace op
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 1d4284e1ac2a..5606c64369ad 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -89,6 +89,13 @@ MXNET_UNARY_MATH_OP_NC(identity, a);
 
 MXNET_UNARY_MATH_OP(identity_grad, 1);
 
+struct identity_with_cast {
+  template<typename DTypeIn, typename DTypeOut>
+  MSHADOW_XINLINE static void Map(int i, DTypeOut *out, DTypeIn *in) {
+    out[i] = DTypeOut(in[i]);
+  }
+};
+
 MXNET_BINARY_MATH_OP_NC(left, a);
 
 MXNET_BINARY_MATH_OP_NC(right, b);
@@ -119,13 +126,13 @@ MXNET_UNARY_MATH_OP_NC(relu, a > DType(0) ? a : DType(0));
 
 MXNET_UNARY_MATH_OP_NC(relu_grad, a > DType(0) ? DType(1) : DType(0));
 
-MXNET_BINARY_MATH_OP(xelu, a > DType(0) ? math::id(a) :
-                     math::id(a) * math::id(b));
+MXNET_BINARY_MATH_OP_NC(xelu, a > DType(0) ? a :
+                        DType(static_cast<float>(a) * static_cast<float>(b)));
 
 MXNET_BINARY_MATH_OP_NC(xelu_grad, a > DType(0) ? DType(1) : b);
 
-MXNET_BINARY_MATH_OP(elu, a > DType(0) ? math::id(a) :
-                     math::id(b) * math::expm1(a));
+MXNET_BINARY_MATH_OP_NC(elu, a > DType(0) ? a :
+                        DType(math::id(b) * math::expm1(a)));
 
 MXNET_BINARY_MATH_OP_NC(elu_grad, a > DType(0) ? DType(1) : DType(b + a));
 
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index c13f1ac2fae1..c48d83a3be87 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -314,9 +314,13 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::right);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::right);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::power);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rpower);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::xelu);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::elu);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::xelu_grad);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::elu_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot);  // NOLINT()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index c1df291f31c5..240c06a5d7a2 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -489,6 +489,77 @@ def frelu_grad(x):
     check_symbolic_backward(y, [xa], [np.ones(shape)], [ga])
 
 
+@with_seed(1234)
+def test_leaky_relu():
+    def fleaky_relu(x, act_type, slope=0.25):
+        neg_indices = x < 0
+        out = x.copy()
+        if act_type == 'elu':
+            out[neg_indices] = slope * (np.exp(out[neg_indices]) - 1.)
+        elif act_type == 'leaky':
+            out[neg_indices] = slope * out[neg_indices]
+        return out
+    def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
+        neg_indices = x < 0
+        out = np.ones(x.shape)
+        if act_type == 'elu':
+            out[neg_indices] = y[neg_indices] + slope
+        elif act_type == 'leaky':
+            out[neg_indices] = slope
+        return out * grad
+    shape = (3, 4)
+    x = mx.symbol.Variable("x")
+    slp = 0.0625
+    for dtype in [np.float16, np.float32, np.float64]:
+        xa = np.random.uniform(low=-1.0,high=-0.2,size=shape).astype(dtype)
+        eps = 1e-4
+        xa[abs(xa) < eps] = 1.0
+        # eps = 1e-2 if dtype is np.float16 else 1e-4
+        for act_type in ['leaky']:
+            y = mx.symbol.LeakyReLU(data=x, slope=slp, act_type=act_type)
+            ya = fleaky_relu(xa, slope=slp, act_type=act_type)
+            ga = fleaky_relu_grad(np.ones(shape), xa, ya, slope=slp, act_type=act_type)
+            check_numeric_gradient(y, [xa], numeric_eps=eps, rtol=1e-4, atol=1e-4)
+            check_symbolic_forward(y, [xa], [ya], rtol=eps, atol=1e-5, dtype=dtype)
+            check_symbolic_backward(y, [xa], [np.ones(shape)], [ga], rtol=eps, atol=1e-5, dtype=dtype)
+
+
+@with_seed(1234)
+def test_prelu():
+    def fprelu(x, gamma):
+        pos_indices = x > 0
+        out = x.copy()
+        out = np.multiply(out, gamma)
+        out[pos_indices] = x[pos_indices]
+        return out
+    def fprelu_grad(x, y, gamma):
+        pos_indices = x > 0
+        grad_x = np.multiply(np.ones(x.shape), gamma)
+        grad_gam = np.zeros(gamma.shape)
+        copy_x = x.copy()
+        copy_x[pos_indices] = 0.0
+        grad_x[pos_indices] = 1.0
+        if gamma.shape[0] == 1:
+            grad_gam = np.sum(np.sum(copy_x))
+        elif gamma.shape[0] > 1:
+            grad_gam = np.sum(copy_x, axis=0)
+        return (grad_x, grad_gam)
+    shape = (3,4)
+    x = mx.symbol.Variable("x")
+    gamma = mx.symbol.Variable("gamma")
+    for dtype in [np.float16, np.float32, np.float64]:
+        for gam in [np.array([0.1], dtype=dtype), np.array([0.1, 0.2, 0.3, 0.4], dtype=dtype)]:
+            xa = np.random.uniform(low=-1.0,high=1.0,size=shape).astype(dtype)
+            eps = 1e-4
+            xa[abs(xa) < eps] = 1.0
+            y = mx.symbol.LeakyReLU(data=x, gamma=gamma, act_type='prelu')
+            ya = fprelu(xa, gam)
+            g_xa, g_gam = fprelu_grad(xa, ya, gamma=gam)
+            check_numeric_gradient(y, [xa, gam], numeric_eps=eps, rtol=1e-3, atol=1e-4)
+            check_symbolic_forward(y, [xa, gam], [ya], rtol=1e-3, atol=1e-20)
+            check_symbolic_backward(y, [xa, gam], [np.ones(shape)], [g_xa], rtol=1e-3, atol=1e-20)
+
+
 @with_seed()
 def test_sigmoid():
     def fsigmoid(a):
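
A minimal usage sketch (not part of the patch): with this change applied, LeakyReLU
accepts float16 input directly and the output keeps the input dtype. The snippet below
mirrors the 'leaky' case exercised by the new test; the tolerance is an assumption
chosen for float16 precision, not a value taken from the patch.

    import mxnet as mx
    import numpy as np

    xa = np.random.uniform(-1.0, 1.0, (3, 4)).astype(np.float16)
    x = mx.nd.array(xa, dtype=np.float16)
    # 'leaky': negative inputs are scaled by `slope`, positive inputs pass through.
    y = mx.nd.LeakyReLU(data=x, act_type='leaky', slope=0.0625)
    assert y.dtype == np.float16
    expected = np.where(xa > 0, xa, np.float16(0.0625) * xa)
    assert np.allclose(y.asnumpy(), expected, atol=1e-3)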