Skip to content

Commit

Permalink
Do not use bit-mask when MKL dropout is used.
Browse the repository at this point in the history
  • Loading branch information
apeforest committed Feb 19, 2020
1 parent 2183a23 commit 746a8f0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 60 deletions.
88 changes: 28 additions & 60 deletions src/operator/nn/dropout-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ class DropoutOp {
}
}
static inline bool MKLAvailable() {
// MKL's BernoulliGenerate fills the mask buffer with one int per element.
// When DType is narrower than int (e.g. fp16/uint8), the DType-typed mask
// tensor cannot hold the int mask in place, so report MKL as unavailable
// and let the caller fall back to the generic dropout path.
// (The flattened diff had left a stale `return true;` here, which made this
// check unreachable and wrongly enabled MKL for narrow types.)
return sizeof(DType) >= sizeof(int);
}

// MKL forward pass
Expand All @@ -123,56 +124,25 @@ class DropoutOp {
Stream<xpu> *s = ctx.get_stream<xpu>();
RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
CHECK_NOTNULL(pgen);
Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu, uint8_t>(s);
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu, DType>(s);
DType *outptr = out.dptr_;
DType *dataptr = data.dptr_;

index_t count = data.shape_[0] * data.shape_[1];
// allocating buffer for MKL routine to calculate int32 based maskptr
Tensor<xpu, 1, int> temp_space =
ctx.requested[1].get_space_typed<xpu, 1, int>(Shape1(count), s);
auto mkl_mask = temp_space.dptr_;

BernoulliGenerate(*pgen, count, this->pkeep_, mkl_mask);
const float pk_1 = 1.0f / this->pkeep_;
const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
const int blk_size = 64;
const int nblk = count / blk_size;

#pragma omp parallel num_threads(nthr)
{
#pragma omp for
for (index_t b = 0; b < nblk; ++b) {
for (index_t k = 0; k < blk_size; ++k) {
const index_t i = b * blk_size + k;
outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
auto mask_idx = i >> 3; // div 8
uint8_t mask_offset = i & 7; // mod 8
if (mkl_mask[i]) {
// set bit
mask.dptr_[mask_idx] |= 1U << mask_offset;
} else {
// clear bit
mask.dptr_[mask_idx] &= ~(1U << mask_offset);
}
}
}
auto maskptr = reinterpret_cast<int *>(mask.dptr_);
index_t count = mask.shape_[0] * mask.shape_[1];
if (sizeof(DType) > sizeof(int)) {
// allocating new buffer to avoiding memory overlapping between `mask.dptr_` and `maskptr`
Tensor<xpu, 1, int> temp = ctx.requested[1].get_space_typed<xpu, 1, int>(Shape1(count), s);
maskptr = temp.dptr_;
}

// tail
for (index_t i = nblk * blk_size; i < count; ++i) {
outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
auto mask_idx = i >> 3; // div 8
uint8_t mask_offset = i & 7; // mod 8
if (mkl_mask[i]) {
// set bit
mask.dptr_[mask_idx] |= 1U << mask_offset;
} else {
// clear bit
mask.dptr_[mask_idx] &= ~(1U << mask_offset);
}
BernoulliGenerate(*pgen, count, this->pkeep_, maskptr);
const float pk_1 = 1.0f / this->pkeep_;
#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (int i = 0; i < count; ++i) {
const DType maskVal = static_cast<DType>(maskptr[i]) * pk_1;
outptr[i] = dataptr[i] * maskVal;
mask.dptr_[i] = maskVal;
}
}

Expand All @@ -183,21 +153,15 @@ class DropoutOp {
const std::vector<TBlob> &out_grad) {
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu, uint8_t>(s);
Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu, DType>(s);
DType *ingradptr = gdata.dptr_;
const DType *outgradptr = grad.dptr_;
const uint8_t *maskptr = mask.dptr_;
const index_t count = grad.shape_[0] * grad.shape_[1];
const float pk_1 = 1.0f / this->pkeep_;
const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();

#pragma omp parallel for num_threads(nthr)
const DType *maskptr = mask.dptr_;
const index_t count = mask.shape_[0] * mask.shape_[1];
#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
for (index_t i = 0; i < count; ++i) {
auto mask_idx = i >> 3; // div 8;
uint8_t mask_offset = i & 7; // mod 8
bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
ingradptr[i] = outgradptr[i] * mask_val * pk_1;
ingradptr[i] = outgradptr[i] * maskptr[i];
}
}
#endif // #if MXNET_USE_MKL_DROPOUT
Expand Down Expand Up @@ -456,18 +420,18 @@ class DropoutOp {
const TBlob &in = in_data[dropout::kData];
const TBlob &out = out_data[dropout::kOut];
const TBlob &mask = out_data[dropout::kMask];
CHECK_EQ(mask.type_flag_, mshadow::kUint8);

if (this->pkeep_ < 1 && (ctx.is_train || this->mode_ == dropout::kAlways)) {
this->dropout_passthrough_ = false;
if (this->axes_.ndim() == 0) {
CHECK_EQ((out.Size() + 7) / 8, mask.Size());
#if MXNET_USE_MKL_DROPOUT
if (MKLAvailable()) {
MKLForward(ctx, in_data, out_data);
return;
}
#endif // MXNET_USE_MKL_DROPOUT
CHECK_EQ((out.Size() + 7) / 8, mask.Size());
CHECK_EQ(mask.type_flag_, mshadow::kUint8);
#if MXNET_USE_CUDNN_DROPOUT && defined(__CUDACC__)
if (CuDNNAvailable()) {
CuDNNForward(ctx, in, mask, out);
Expand Down Expand Up @@ -548,8 +512,12 @@ class DropoutOp {
const TBlob &gdata = in_grad[dropout::kData];
const TBlob &grad = out_grad[dropout::kOut];
const TBlob &mask = out_data[dropout::kMask];
#if MXNET_USE_MKL_DROPOUT
CHECK_EQ(grad.Size(), mask.Size());
#else
CHECK_EQ(mask.type_flag_, mshadow::kUint8);
CHECK_EQ((grad.Size() + 7) / 8, mask.Size());
#endif

if (this->axes_.ndim() == 0) {
#if MXNET_USE_MKL_DROPOUT
Expand Down
19 changes: 19 additions & 0 deletions src/operator/nn/dropout.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ Example::
if (!mxnet::ndim_is_known(dshape)) return false;
out_shape->clear();
out_shape->push_back(dshape);

#if MXNET_USE_MKL_DROPOUT
// TODO(TaoLv): Do not use bit-mask when MKL dropout is enabled
// Need to enhance MKLDNN engine for more efficient memory usage
if (param.axes.ndim() == 0) {
out_shape->push_back(dshape);
return true;
}
#endif
if (param.axes.ndim() > 0) {
for (int i = 0; i < param.axes.ndim(); ++i) {
dshape[param.axes[i]] = 1;
Expand Down Expand Up @@ -147,6 +156,16 @@ Example::

out_type->clear();
out_type->push_back(dtype); // data type for output

#if MXNET_USE_MKL_DROPOUT
// TODO(TaoLv): Do not use bit-mask when MKL dropout is enabled
// Need to enhance MKLDNN engine for more efficient memory usage
const DropoutParam& param = nnvm::get<DropoutParam>(attrs.parsed);
if (param.axes.ndim() == 0) {
out_type->push_back(dtype);
return true;
}
#endif
out_type->push_back(mshadow::kUint8); // data type for mask
return true;
})
Expand Down

0 comments on commit 746a8f0

Please sign in to comment.