Skip to content

Commit

Permalink
Speedup _contrib_index_copy (apache#14359)
Browse files Browse the repository at this point in the history
* speedup _contrib_index_copy

* use copy in backward

* add support for kAddTo req type for grad

* change template to argument for req types
  • Loading branch information
haojin2 authored and haohuw committed Jun 23, 2019
1 parent 2180a78 commit 764dbb1
Show file tree
Hide file tree
Showing 3 changed files with 231 additions and 92 deletions.
93 changes: 2 additions & 91 deletions src/operator/contrib/index_copy-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,108 +37,19 @@
namespace mxnet {
namespace op {

/*!
 * \brief Kernel that writes one row of new_tensor into the row of out_tensor
 *        selected by the index vector.
 * \tparam req request type handed to KERNEL_ASSIGN for every element
 */
template<int req>
struct index_copy_forward {
  /*!
   * \param i          position in the index vector and row number in new_tensor
   * \param dim        number of elements in one row
   * \param index      index vector; index[i] is the destination row
   * \param new_tensor source rows, dim elements each
   * \param out_tensor destination tensor
   */
  template<typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i,
                                  int dim,
                                  IType* index,
                                  DType* new_tensor,
                                  DType* out_tensor) {
    DType* dst = out_tensor + static_cast<int>(index[i]) * dim;
    DType* src = new_tensor + i * dim;
    // walk both rows in lock-step, one element per iteration
    for (int remaining = dim; remaining > 0; --remaining, ++dst, ++src) {
      KERNEL_ASSIGN(*dst, req, *src);
    }
  }
};

/*!
 * \brief Forward pass of index_copy: the output is the original tensor with
 *        the rows named by the index vector replaced by the rows of the
 *        copied tensor.
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the device stream
 * \param inputs  {original tensor, index vector, tensor to copy from}
 * \param req     request type for the single output
 * \param outputs {result tensor}
 */
template<typename xpu>
void IndexCopyForward(const nnvm::NodeAttrs& attrs,
                      const OpContext& ctx,
                      const std::vector<TBlob>& inputs,
                      const std::vector<OpReqType>& req,
                      const std::vector<TBlob>& outputs) {
  CHECK_EQ(inputs.size(), 3U);
  CHECK_EQ(outputs.size(), 1U);
  CHECK_EQ(req.size(), 1U);
  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
  const TBlob& out = outputs[0];
  const TBlob& original_tensor = inputs[0];
  const TBlob& idx_vector = inputs[1];
  const TBlob& copied_tensor = inputs[2];
  // copy original tensor to output
  mxnet_op::copy(s, out, original_tensor);
  // An empty index vector selects no rows. Stop here so the row-length
  // computation below never divides by zero.
  if (idx_vector.Size() == 0) return;
  // number of elements in each copied row
  int dim = copied_tensor.Size() / idx_vector.Size();
  // index copy
  MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
        mxnet_op::Kernel<index_copy_forward<req_type>, xpu>::Launch(s,
                                      idx_vector.Size(), dim,
                                      idx_vector.dptr<IType>(),
                                      copied_tensor.dptr<DType>(),
                                      out.dptr<DType>());
      });
    });
  });
}

/*!
 * \brief Backward kernel, one invocation per element of out_grad.
 *        An element whose row is listed in the index vector is routed to
 *        in_grad_2; every other element is routed to in_grad_1.
 */
struct index_copy_backward {
  /*!
   * \param i          flat element position in out_grad
   * \param dim        number of elements in one row
   * \param index_size number of entries in the index vector
   * \param req1       request type used when writing in_grad_1
   * \param req2       request type used when writing in_grad_2
   * \param out_grad   incoming gradient
   * \param index      index vector
   * \param in_grad_1  gradient buffer indexed like out_grad
   * \param in_grad_2  gradient buffer with one row per index entry
   */
  template<typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i,
                                  int dim,
                                  int index_size,
                                  int req1, int req2,
                                  DType* out_grad,
                                  IType* index,
                                  DType* in_grad_1,
                                  DType* in_grad_2) {
    // Scan the index vector for the first row that contains element i.
    for (int row = 0; row < index_size; ++row) {
      const int target = static_cast<int>(index[row]);
      const int first = target * dim;
      const bool hit = i >= first && i < (target + 1) * dim;
      if (!hit) continue;
      // Element i belongs to an indexed row: route it to in_grad_2.
      KERNEL_ASSIGN(in_grad_2[row * dim + (i - first)], req2, out_grad[i]);
      return;
    }
    // No index entry covers element i: route it to in_grad_1.
    KERNEL_ASSIGN(in_grad_1[i], req1, out_grad[i]);
  }
};
const std::vector<TBlob>& outputs);

/*!
 * \brief Backward pass of index_copy. Launches index_copy_backward over every
 *        element of the incoming gradient and writes outputs[0] and outputs[2].
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the device stream
 * \param inputs  four blobs; inputs[0] is the output gradient, inputs[2] the
 *                index vector, and inputs[3] has dim elements per index entry
 * \param req     request types, one per output; entries 0 and 2 are used
 * \param outputs three blobs; outputs[0] and outputs[2] receive gradients
 */
template<typename xpu>
void IndexCopyBackward(const nnvm::NodeAttrs& attrs,
                       const OpContext& ctx,
                       const std::vector<TBlob>& inputs,
                       const std::vector<OpReqType>& req,
                       const std::vector<TBlob>& outputs) {
  CHECK_EQ(inputs.size(), 4U);
  CHECK_EQ(outputs.size(), 3U);
  // req[0] and req[2] are read below, so validate its length up front
  CHECK_EQ(req.size(), 3U);
  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
  const TBlob& out_grad = inputs[0];
  const TBlob& index = inputs[2];
  const TBlob& in_grad_1 = outputs[0];
  const TBlob& in_grad_2 = outputs[2];
  int index_size = index.Size();
  // Row length; left at 0 for an empty index vector so we never divide by zero.
  int dim = 0;
  if (index_size > 0) {
    dim = inputs[3].Size() / index.Size();
  }
  Fill<false>(s, outputs[0], req[0], 0);
  Fill<false>(s, outputs[2], req[2], 0);
  // index_copy_backward
  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
      mxnet_op::Kernel<index_copy_backward, xpu>::Launch(s,
                                      out_grad.Size(),
                                      dim, index_size,
                                      req[0], req[2],
                                      out_grad.dptr<DType>(),
                                      index.dptr<IType>(),
                                      in_grad_1.dptr<DType>(),
                                      in_grad_2.dptr<DType>());
    });
  });
}
const std::vector<TBlob>& outputs);

inline bool IndexCopyShape(const nnvm::NodeAttrs& attrs,
mxnet::ShapeVector *in_attrs,
Expand Down
120 changes: 120 additions & 0 deletions src/operator/contrib/index_copy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,122 @@
namespace mxnet {
namespace op {

/*!
 * \brief CPU forward kernel: copies row i of new_tensor over row idx[i] of
 *        out_tensor with a single memcpy.
 */
struct index_copy_fwd_cpu {
  /*!
   * \param i          position in the index vector and row number in new_tensor
   * \param new_tensor source rows, dim_size elements each
   * \param idx        index vector; idx[i] is the destination row
   * \param out_tensor destination tensor
   * \param dim_size   number of elements in one row
   */
  template<typename DType, typename IType>
  static void Map(int i,
                  const DType* new_tensor,
                  const IType* idx,
                  DType* out_tensor,
                  int dim_size) {
    const int row = static_cast<int>(idx[i]);
    // Element offsets are formed in 64 bits: row * dim_size computed in int
    // overflows once the product exceeds INT_MAX.
    const long long dst_offset = static_cast<long long>(row) * dim_size;
    const long long src_offset = static_cast<long long>(i) * dim_size;
    std::memcpy(out_tensor + dst_offset, new_tensor + src_offset,
                sizeof(DType) * dim_size);
  }
};

/*!
 * \brief CPU forward pass of index_copy: copies the original tensor to the
 *        output, then overwrites the rows named by the index vector with the
 *        rows of the copied tensor.
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the CPU stream
 * \param inputs  {original tensor, index vector, tensor to copy from}
 * \param req     request type for the output; kAddTo is rejected and kNullOp
 *                makes the call a no-op
 * \param outputs {result tensor}
 */
template<>
void IndexCopyForward<cpu>(const nnvm::NodeAttrs& attrs,
                           const OpContext& ctx,
                           const std::vector<TBlob>& inputs,
                           const std::vector<OpReqType>& req,
                           const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 3U);
  CHECK_EQ(outputs.size(), 1U);
  CHECK_EQ(req.size(), 1U);
  CHECK(req[0] != kAddTo);
  if (req[0] == kNullOp) return;
  mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
  const TBlob& out = outputs[0];
  const TBlob& original_tensor = inputs[0];
  const TBlob& idx_vector = inputs[1];
  const TBlob& copied_tensor = inputs[2];
  // copy original tensor to output
  copy(s, out, original_tensor);
  // An empty index vector selects no rows. Stop here so the row-length
  // computation below never divides by zero.
  if (idx_vector.Size() == 0) return;
  // number of elements in each copied row
  int dim_size = copied_tensor.Size() / idx_vector.Size();
  // index copy
  MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
      Kernel<index_copy_fwd_cpu, cpu>::Launch(
        s, idx_vector.Size(), copied_tensor.dptr<DType>(),
        idx_vector.dptr<IType>(), out.dptr<DType>(), dim_size);
    });
  });
}

/*!
 * \brief CPU backward kernel, one invocation per index entry.
 *        Row idx[i] of the output gradient is assigned to row i of
 *        new_tensor_grad; the same row of orig_tensor_grad is then reduced by
 *        that gradient (kAddTo), left alone (kNullOp) or zeroed (otherwise).
 */
struct index_copy_bwd_cpu {
  /*!
   * \param i                position in the index vector
   * \param out_tensor_grad  gradient of the forward output
   * \param orig_tensor_grad gradient buffer of the original tensor
   * \param new_tensor_grad  gradient buffer of the copied tensor
   * \param idx              index vector
   * \param dim_size         number of elements in one row
   * \param idx_size         length of the index vector (not read here)
   * \param orig_req         request type for orig_tensor_grad
   * \param new_req          request type for new_tensor_grad
   */
  template<typename DType, typename IType>
  static void Map(int i,
                  const DType* out_tensor_grad,
                  DType* orig_tensor_grad,
                  DType* new_tensor_grad,
                  const IType* idx,
                  int dim_size,
                  int idx_size,
                  OpReqType orig_req,
                  OpReqType new_req) {
    const int row = idx[i];
    const DType* upstream = out_tensor_grad + row * dim_size;
    DType* orig_row = orig_tensor_grad + row * dim_size;
    DType* new_row = new_tensor_grad + i * dim_size;
    // First pass: hand the upstream row to the copied tensor's gradient.
    for (int k = 0; k < dim_size; ++k) {
      KERNEL_ASSIGN(new_row[k], new_req, upstream[k]);
    }
    // Second pass: update the matching row of the original tensor's gradient.
    switch (orig_req) {
      case kAddTo:
        for (int k = 0; k < dim_size; ++k) {
          orig_row[k] -= upstream[k];
        }
        break;
      case kNullOp:
        break;
      default:
        std::memset(orig_row, 0, sizeof(DType) * dim_size);
        break;
    }
  }
};

/*!
 * \brief CPU backward pass of index_copy.
 *        Step 1 passes the whole output gradient to outputs[0] according to
 *        req[0]. Step 2 runs index_copy_bwd_cpu once per index entry, which
 *        fills outputs[2] and corrects the indexed rows of outputs[0].
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the CPU stream
 * \param inputs  four blobs; inputs[0] is the output gradient, inputs[2] the
 *                index vector, and inputs[3] has dim_size elements per index entry
 * \param req     request types, one per output; entries 0 and 2 are used
 * \param outputs three blobs; outputs[0] and outputs[2] receive gradients
 */
template<>
void IndexCopyBackward<cpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx,
                            const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 4U);
  CHECK_EQ(outputs.size(), 3U);
  // req[0] and req[2] are read below, so validate its length up front
  CHECK_EQ(req.size(), 3U);
  Stream<cpu> *s = ctx.get_stream<cpu>();
  const TBlob& out_grad = inputs[0];
  const TBlob& index = inputs[2];
  const TBlob& in_grad_1 = outputs[0];
  const TBlob& in_grad_2 = outputs[2];
  int index_size = index.Size();
  // Row length; left at 0 for an empty index vector so we never divide by zero.
  int dim_size = 0;
  if (index_size > 0) {
    dim_size = inputs[3].Size() / index.Size();
  }
  OpReqType orig_req = req[0];
  OpReqType new_req = req[2];
  // index_copy_backward
  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
      switch (orig_req) {
        case kNullOp:
          break;
        case kWriteTo:
        case kWriteInplace:
          copy(s, in_grad_1, out_grad);
          break;
        case kAddTo:
          // in_grad_1 = out_grad + in_grad_1
          Kernel<op_with_req<op::mshadow_op::plus, kWriteInplace>, cpu>::Launch(
            s, out_grad.Size(), in_grad_1.dptr<DType>(),
            out_grad.dptr<DType>(), in_grad_1.dptr<DType>());
          break;
      }
      // The per-index kernel has nothing to do for an empty index vector.
      if (index_size > 0) {
        Kernel<index_copy_bwd_cpu, cpu>::Launch(
          s, index_size, out_grad.dptr<DType>(),
          in_grad_1.dptr<DType>(), in_grad_2.dptr<DType>(),
          index.dptr<IType>(), dim_size, index_size, orig_req, new_req);
      }
    });
  });
}

static bool IndexCopyType(const nnvm::NodeAttrs& attrs,
std::vector<int> *in_attrs,
std::vector<int> *out_attrs) {
Expand Down Expand Up @@ -71,6 +187,10 @@ Examples::
.set_attr<nnvm::FInferType>("FInferType", IndexCopyType)
.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_contrib_backward_index_copy"})
.set_attr<FCompute>("FCompute<cpu>", IndexCopyForward<cpu>)
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"old_tensor", "index_vector", "new_tensor"};
})
.add_argument("old_tensor", "NDArray-or-Symbol", "Old tensor")
.add_argument("index_vector", "NDArray-or-Symbol", "Index vector")
.add_argument("new_tensor", "NDArray-or-Symbol", "New tensor to be copied");
Expand Down
110 changes: 109 additions & 1 deletion src/operator/contrib/index_copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,122 @@
*/

/*!
* \file index_copy.cc
* \file index_copy.cu
* \brief GPU implementation of the _contrib_index_copy operator
*/
#include "./index_copy-inl.h"

namespace mxnet {
namespace op {

/*!
 * \brief GPU forward kernel, one invocation per element of new_tensor.
 *        Element i is written to the same column of the output row selected
 *        by the index vector.
 */
struct index_copy_fwd_gpu {
  /*!
   * \param i          flat element position in new_tensor
   * \param new_tensor source tensor
   * \param idx        index vector; entry i / dim_size is the destination row
   * \param out_tensor destination tensor
   * \param dim_size   number of elements in one row
   */
  template<typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i,
                                  const DType* new_tensor,
                                  const IType* idx,
                                  DType* out_tensor,
                                  int dim_size) {
    const int src_row = i / dim_size;
    const int column = i % dim_size;
    const int dst_row = static_cast<int>(idx[src_row]);
    out_tensor[dst_row * dim_size + column] = new_tensor[i];
  }
};

/*!
 * \brief GPU forward pass of index_copy: copies the original tensor to the
 *        output, then overwrites the rows named by the index vector with the
 *        rows of the copied tensor (one kernel invocation per copied element).
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the GPU stream
 * \param inputs  {original tensor, index vector, tensor to copy from}
 * \param req     request type for the output; kAddTo is rejected and kNullOp
 *                makes the call a no-op
 * \param outputs {result tensor}
 */
template<>
void IndexCopyForward<gpu>(const nnvm::NodeAttrs& attrs,
                           const OpContext& ctx,
                           const std::vector<TBlob>& inputs,
                           const std::vector<OpReqType>& req,
                           const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 3U);
  CHECK_EQ(outputs.size(), 1U);
  CHECK_EQ(req.size(), 1U);
  CHECK(req[0] != kAddTo);
  if (req[0] == kNullOp) return;
  mshadow::Stream<gpu> *s = ctx.get_stream<gpu>();
  const TBlob& out = outputs[0];
  const TBlob& original_tensor = inputs[0];
  const TBlob& idx_vector = inputs[1];
  const TBlob& copied_tensor = inputs[2];
  // copy original tensor to output
  copy(s, out, original_tensor);
  // An empty index vector selects no rows. Stop here so the row-length
  // computation below never divides by zero.
  if (idx_vector.Size() == 0) return;
  // number of elements in each copied row
  int dim_size = copied_tensor.Size() / idx_vector.Size();
  // index copy
  MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(idx_vector.type_flag_, IType, {
      Kernel<index_copy_fwd_gpu, gpu>::Launch(
        s, copied_tensor.Size(), copied_tensor.dptr<DType>(),
        idx_vector.dptr<IType>(), out.dptr<DType>(), dim_size);
    });
  });
}

/*!
 * \brief GPU backward kernel, one invocation per element of new_grad.
 *        The matching element of the output gradient is assigned to
 *        new_grad[i]; the same element of orig_grad is then reduced by that
 *        gradient (kAddTo), left alone (kNullOp) or zeroed (otherwise).
 */
struct index_copy_bwd_gpu {
  /*!
   * \param i         flat element position in new_grad
   * \param out_grad  gradient of the forward output
   * \param orig_grad gradient buffer of the original tensor
   * \param new_grad  gradient buffer of the copied tensor
   * \param idx       index vector; entry i / dim_size is the row in out_grad
   * \param dim_size  number of elements in one row
   * \param idx_size  length of the index vector (not read here)
   * \param orig_req  request type for orig_grad
   * \param new_req   request type for new_grad
   */
  template<typename DType, typename IType>
  MSHADOW_XINLINE static void Map(int i,
                                  const DType* out_grad,
                                  DType* orig_grad,
                                  DType* new_grad,
                                  const IType* idx,
                                  int dim_size,
                                  int idx_size,
                                  OpReqType orig_req,
                                  OpReqType new_req) {
    const int index = static_cast<int>(idx[i / dim_size]);
    const int offset = index * dim_size + i % dim_size;
    // Read the upstream gradient once; both outputs are derived from it.
    const DType grad = out_grad[offset];
    KERNEL_ASSIGN(new_grad[i], new_req, grad);
    if (orig_req == kAddTo) {
      // Subtract the upstream gradient itself, as the CPU kernel does.
      // new_grad[i] only equals it when new_req is a plain write, so using
      // new_grad[i] here is wrong for kAddTo / kNullOp.
      orig_grad[offset] -= grad;
    } else if (orig_req == kNullOp) {
      return;
    } else {
      orig_grad[offset] = 0;
    }
  }
};

/*!
 * \brief GPU backward pass of index_copy.
 *        Step 1 passes the whole output gradient to outputs[0] according to
 *        req[0]. Step 2 runs index_copy_bwd_gpu once per element of
 *        outputs[2], which fills outputs[2] and corrects the indexed rows of
 *        outputs[0].
 * \param attrs   node attributes (not used here)
 * \param ctx     operator context, supplies the GPU stream
 * \param inputs  four blobs; inputs[0] is the output gradient, inputs[2] the
 *                index vector, and inputs[3] has dim_size elements per index entry
 * \param req     request types, one per output; entries 0 and 2 are used
 * \param outputs three blobs; outputs[0] and outputs[2] receive gradients
 */
template<>
void IndexCopyBackward<gpu>(const nnvm::NodeAttrs& attrs,
                            const OpContext& ctx,
                            const std::vector<TBlob>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<TBlob>& outputs) {
  using namespace mshadow;
  using namespace mxnet_op;
  CHECK_EQ(inputs.size(), 4U);
  CHECK_EQ(outputs.size(), 3U);
  // req[0] and req[2] are read below, so validate its length up front
  CHECK_EQ(req.size(), 3U);
  Stream<gpu> *s = ctx.get_stream<gpu>();
  const TBlob& out_grad = inputs[0];
  const TBlob& index = inputs[2];
  const TBlob& in_grad_1 = outputs[0];
  const TBlob& in_grad_2 = outputs[2];
  int index_size = index.Size();
  // Row length; left at 0 for an empty index vector so we never divide by zero.
  int dim_size = 0;
  if (index_size > 0) {
    dim_size = inputs[3].Size() / index.Size();
  }
  OpReqType orig_req = req[0];
  OpReqType new_req = req[2];
  // index_copy_backward
  MSHADOW_TYPE_SWITCH(out_grad.type_flag_, DType, {
    MSHADOW_TYPE_SWITCH(index.type_flag_, IType, {
      switch (orig_req) {
        case kNullOp:
          break;
        case kWriteTo:
        case kWriteInplace:
          copy(s, in_grad_1, out_grad);
          break;
        case kAddTo:
          // in_grad_1 = out_grad + in_grad_1
          Kernel<op_with_req<op::mshadow_op::plus, kWriteInplace>, gpu>::Launch(
            s, out_grad.Size(), in_grad_1.dptr<DType>(),
            out_grad.dptr<DType>(), in_grad_1.dptr<DType>());
          break;
      }
      // Skip the per-element kernel when the index vector is empty, so no
      // launch is issued with zero work items.
      if (index_size > 0) {
        Kernel<index_copy_bwd_gpu, gpu>::Launch(
          s, in_grad_2.Size(), out_grad.dptr<DType>(),
          in_grad_1.dptr<DType>(), in_grad_2.dptr<DType>(),
          index.dptr<IType>(), dim_size, index_size, orig_req, new_req);
      }
    });
  });
}

// Attach IndexCopyForward<gpu> to the _contrib_index_copy operator as its
// "FCompute<gpu>" attribute.
NNVM_REGISTER_OP(_contrib_index_copy)
.set_attr<FCompute>("FCompute<gpu>", IndexCopyForward<gpu>);

Expand Down

0 comments on commit 764dbb1

Please sign in to comment.