From c588f341de7b4912a35a4c20aa2f3abf91c2c945 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Sun, 17 Mar 2019 13:20:06 -0700 Subject: [PATCH] Speedup _contrib_index_copy (#14359) * speedup _contrib_index_copy * use copy in backward * add support for kAddTo req type for grad * change template to argument for req types --- src/operator/contrib/index_copy-inl.h | 93 +------------------- src/operator/contrib/index_copy.cc | 120 ++++++++++++++++++++++++++ src/operator/contrib/index_copy.cu | 110 ++++++++++++++++++++++- 3 files changed, 231 insertions(+), 92 deletions(-) diff --git a/src/operator/contrib/index_copy-inl.h b/src/operator/contrib/index_copy-inl.h index d93bf47949a8..903dee13272b 100644 --- a/src/operator/contrib/index_copy-inl.h +++ b/src/operator/contrib/index_copy-inl.h @@ -37,108 +37,19 @@ namespace mxnet { namespace op { -template -struct index_copy_forward { - template - MSHADOW_XINLINE static void Map(int i, - int dim, - IType* index, - DType* new_tensor, - DType* out_tensor) { - DType* out_ptr = out_tensor + static_cast(index[i]) * dim; - DType* new_ptr = new_tensor + i * dim; - for (int idx = 0; idx < dim; ++idx) { - KERNEL_ASSIGN(out_ptr[idx], req, new_ptr[idx]); - } - } -}; - template void IndexCopyForward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 3U); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - mshadow::Stream *s = ctx.get_stream(); - const TBlob& out = outputs[0]; - const TBlob& original_tensor = inputs[0]; - const TBlob& idx_vector = inputs[1]; - const TBlob& copied_tensor = inputs[2]; - int dim = inputs[2].Size() / inputs[1].Size(); - // copy original tensor to output - mxnet_op::copy(s, out, original_tensor); - // index copy - MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { - MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, { - MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, { - mxnet_op::Kernel, xpu>::Launch(s, - idx_vector.Size(), dim, - idx_vector.dptr(), - copied_tensor.dptr(), - out.dptr()); - }); - }); - }); -} - -struct index_copy_backward { - template - MSHADOW_XINLINE static void Map(int i, - int dim, - int index_size, - int req1, int req2, - DType* out_grad, - IType* index, - DType* in_grad_1, - DType* in_grad_2) { - // Copy to in_grad_2 - for (int p = 0; p < index_size; ++p) { - int idx = static_cast(index[p]); - if (i >= idx*dim && i < (idx+1)*dim) { - int offset = i - idx*dim; - KERNEL_ASSIGN(in_grad_2[p*dim+offset], req2, out_grad[i]); - return; - } - } - // Copy to in_grad_1 - KERNEL_ASSIGN(in_grad_1[i], req1, out_grad[i]); - } -}; + const std::vector& outputs); template void IndexCopyBackward(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 4U); - CHECK_EQ(outputs.size(), 3U); - mshadow::Stream *s = ctx.get_stream(); - const TBlob& out_grad = inputs[0]; - const TBlob& index = inputs[2]; - const TBlob& in_grad_1 = outputs[0]; - const TBlob& in_grad_2 = outputs[2]; - int dim = inputs[3].Size() / inputs[2].Size(); - int index_size = inputs[2].Size(); - Fill(s, outputs[0], req[0], 0); - Fill(s, outputs[2], req[2], 0); - // index_copy_backward - MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { - MSHADOW_TYPE_SWITCH(index.type_flag_, IType, { - mxnet_op::Kernel::Launch(s, - out_grad.Size(), - dim, index_size, - req[0], req[2], - out_grad.dptr(), - index.dptr(), - in_grad_1.dptr(), - in_grad_2.dptr()); - }); - }); -} + const std::vector& outputs); inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs, mxnet::ShapeVector *in_attrs, diff --git a/src/operator/contrib/index_copy.cc b/src/operator/contrib/index_copy.cc index bcf6c02d3d37..f272a8860d85 100644 --- a/src/operator/contrib/index_copy.cc +++ b/src/operator/contrib/index_copy.cc @@ -26,6 +26,122 @@ namespace mxnet { namespace op { +struct index_copy_fwd_cpu { + template + static void Map(int i, + const DType* new_tensor, + const IType* idx, + DType* out_tensor, + int dim_size) { + DType* out_ptr = out_tensor + static_cast(idx[i]) * dim_size; + const DType* new_ptr = new_tensor + i * dim_size; + std::memcpy(out_ptr, new_ptr, sizeof(DType) * dim_size); + } +}; + +template<> +void IndexCopyForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK(req[0] != kAddTo); + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + const TBlob& out = outputs[0]; + const TBlob& original_tensor = inputs[0]; + const TBlob& idx_vector = inputs[1]; + const TBlob& copied_tensor = inputs[2]; + int dim_size = inputs[2].Size() / inputs[1].Size(); + // copy original tensor to output + copy(s, out, original_tensor); + // index copy + MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, { + Kernel::Launch( + s, idx_vector.Size(), copied_tensor.dptr(), + idx_vector.dptr(), out.dptr(), dim_size); + }); + }); +} + +struct index_copy_bwd_cpu { + template + static void Map(int i, + const DType* out_tensor_grad, + DType* orig_tensor_grad, + DType* new_tensor_grad, + const IType* idx, + int dim_size, + int idx_size, + OpReqType orig_req, + OpReqType new_req) { + const int index = idx[i]; + DType* new_ptr = new_tensor_grad + i * dim_size; + DType* orig_ptr = orig_tensor_grad + index * dim_size; + const DType* src_ptr = out_tensor_grad + index * dim_size; + for (int iter = 0; iter < dim_size; ++iter) { + KERNEL_ASSIGN(new_ptr[iter], new_req, src_ptr[iter]); + } + if (orig_req == kAddTo) { + for (int iter = 0; iter < dim_size; ++iter) { + orig_ptr[iter] -= src_ptr[iter]; + } + } else if (orig_req == kNullOp) { + return; + } else { + std::memset(orig_ptr, 0, sizeof(DType) * dim_size); + } + } +}; + +template<> +void IndexCopyBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 4U); + CHECK_EQ(outputs.size(), 3U); + Stream *s = ctx.get_stream(); + const TBlob& out_grad = inputs[0]; + const TBlob& index = inputs[2]; + const TBlob& in_grad_1 = outputs[0]; + const TBlob& in_grad_2 = outputs[2]; + int dim_size = inputs[3].Size() / inputs[2].Size(); + int index_size = inputs[2].Size(); + OpReqType orig_req = req[0]; + OpReqType new_req = req[2]; + // index_copy_backward + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(index.type_flag_, IType, { + switch (orig_req) { + case kNullOp: + break; + case kWriteTo: + case kWriteInplace: + copy(s, in_grad_1, out_grad); + break; + case kAddTo: + Kernel, cpu>::Launch( + s, out_grad.Size(), in_grad_1.dptr(), + out_grad.dptr(), in_grad_1.dptr()); + } + Kernel::Launch( + s, index_size, out_grad.dptr(), + in_grad_1.dptr(), in_grad_2.dptr(), + index.dptr(), dim_size, index_size, orig_req, new_req); + }); + }); +} + static bool IndexCopyType(const nnvm::NodeAttrs& attrs, std::vector *in_attrs, std::vector *out_attrs) { @@ -71,6 +187,10 @@ Examples:: .set_attr("FInferType", IndexCopyType) .set_attr("FGradient", ElemwiseGradUseIn{"_contrib_backward_index_copy"}) .set_attr("FCompute", IndexCopyForward) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"old_tensor", "index_vector", "new_tensor"}; + }) .add_argument("old_tensor", "NDArray-or-Symbol", "Old tensor") .add_argument("index_vector", "NDArray-or-Symbol", "Index vector") .add_argument("new_tensor", "NDArray-or-Symbol", "New tensor to be copied"); diff --git a/src/operator/contrib/index_copy.cu b/src/operator/contrib/index_copy.cu index dc416114b04d..53f2600aba06 100644 --- a/src/operator/contrib/index_copy.cu +++ b/src/operator/contrib/index_copy.cu @@ -18,7 +18,7 @@ */ /*! - * \file index_copy.cc + * \file index_copy.cu * \brief */ #include "./index_copy-inl.h" @@ -26,6 +26,114 @@ namespace mxnet { namespace op { +struct index_copy_fwd_gpu { + template + MSHADOW_XINLINE static void Map(int i, + const DType* new_tensor, + const IType* idx, + DType* out_tensor, + int dim_size) { + int index = static_cast(idx[i / dim_size]); + out_tensor[index * dim_size + i % dim_size] = new_tensor[i]; + } +}; + +template<> +void IndexCopyForward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK(req[0] != kAddTo); + if (req[0] == kNullOp) return; + mshadow::Stream *s = ctx.get_stream(); + const TBlob& out = outputs[0]; + const TBlob& original_tensor = inputs[0]; + const TBlob& idx_vector = inputs[1]; + const TBlob& copied_tensor = inputs[2]; + int dim_size = inputs[2].Size() / inputs[1].Size(); + // copy original tensor to output + copy(s, out, original_tensor); + // index copy + MSHADOW_TYPE_SWITCH(out.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, { + Kernel::Launch( + s, copied_tensor.Size(), copied_tensor.dptr(), + idx_vector.dptr(), out.dptr(), dim_size); + }); + }); +} + +struct index_copy_bwd_gpu { + template + MSHADOW_XINLINE static void Map(int i, + const DType* out_grad, + DType* orig_grad, + DType* new_grad, + const IType* idx, + int dim_size, + int idx_size, + OpReqType orig_req, + OpReqType new_req) { + int index = idx[i / dim_size]; + KERNEL_ASSIGN(new_grad[i], new_req, out_grad[index * dim_size + i % dim_size]); + if (orig_req == kAddTo) { + orig_grad[index * dim_size + i % dim_size] -= new_grad[i]; + } else if (orig_req == kNullOp) { + return; + } else { + orig_grad[index * dim_size + i % dim_size] = 0; + } + } +}; + +template<> +void IndexCopyBackward(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 4U); + CHECK_EQ(outputs.size(), 3U); + Stream *s = ctx.get_stream(); + const TBlob& out_grad = inputs[0]; + const TBlob& index = inputs[2]; + const TBlob& in_grad_1 = outputs[0]; + const TBlob& in_grad_2 = outputs[2]; + int dim_size = inputs[3].Size() / inputs[2].Size(); + int index_size = inputs[2].Size(); + OpReqType orig_req = req[0]; + OpReqType new_req = req[2]; + // index_copy_backward + MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, { + MSHADOW_TYPE_SWITCH(index.type_flag_, IType, { + switch (orig_req) { + case kNullOp: + break; + case kWriteTo: + case kWriteInplace: + copy(s, in_grad_1, out_grad); + break; + case kAddTo: + Kernel, gpu>::Launch( + s, out_grad.Size(), in_grad_1.dptr(), + out_grad.dptr(), in_grad_1.dptr()); + } + Kernel::Launch( + s, in_grad_2.Size(), out_grad.dptr(), + in_grad_1.dptr(), in_grad_2.dptr(), + index.dptr(), dim_size, index_size, orig_req, new_req); + }); + }); +} + NNVM_REGISTER_OP(_contrib_index_copy) .set_attr("FCompute", IndexCopyForward);