Performance improving for MKL-DNN Quantized FullyConnected (apache#14528)

* Cached bias for Quantized FullyConnected based on Subgraph to improve performance

* retrigger CI

* retrigger CI
ciyongch authored and haohuw committed Jun 23, 2019
1 parent ff53e29 commit 7f0bae4
Showing 4 changed files with 52 additions and 38 deletions.
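The change follows an initialize-once pattern: the first forward pass of the subgraph FullyConnected op quantizes the bias into the int32 accumulator domain and records the calibration ranges, and later passes reuse those cached values instead of redoing the work each iteration. The sketch below illustrates that pattern in isolation; the class name, the simplified int8 bias input, and the bare-bones flow are assumptions for illustration, not the actual MXNet implementation.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative cache-on-first-forward pattern (hypothetical class, not MXNet code):
// the int8 bias is rescaled into the int32 data*weight domain once, then reused.
class CachedQuantizedFC {
 public:
  void Forward(const std::vector<int8_t> &int8_bias,
               float min_bias, float max_bias,
               float data_scale, float weight_scale) {
    if (!initialized_) {
      // Same formula as the diff: scale that maps int8 bias into int32 accumulators.
      const float bias_int32_rescale = data_scale * weight_scale *
          std::fmax(std::fabs(min_bias), std::fabs(max_bias)) / 127.0f;
      cached_bias_.resize(int8_bias.size());
      for (std::size_t i = 0; i < int8_bias.size(); ++i) {
        cached_bias_[i] = static_cast<int32_t>(std::round(int8_bias[i] * bias_int32_rescale));
      }
      cached_min_bias_ = min_bias;
      cached_max_bias_ = max_bias;
      initialized_ = true;
    }
    // ... run the FC primitive with cached_bias_ ...
  }

 private:
  bool initialized_{false};
  std::vector<int32_t> cached_bias_;
  float cached_min_bias_{0.0f};
  float cached_max_bias_{0.0f};
};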
6 changes: 6 additions & 0 deletions src/operator/nn/fully_connected-inl.h
@@ -48,6 +48,12 @@ enum FullyConnectedOpResource {kTempSpace};
 enum FullyConnectedOpOutputs {kOut};
 }  // fullc
 
+namespace quantized_fullc {
+enum QuantizedFCInputMinMax {kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax};
+enum QuantizedFCOutputs {kOut, kOutMin, kOutMax};
+}  // quantized_fullc
+
+
 struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   int num_hidden;
   bool no_bias;
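With the enums now declared once in fully_connected-inl.h, both the MKL-DNN path and the reference CPU path index the trailing min/max scalars the same way: the base FC inputs (data, weight, and optionally bias) come first, followed by one min/max float pair per base input. A small standalone illustration of how those offsets resolve; the three-input layout with bias is an example chosen for this sketch.

// Hypothetical illustration of the quantized FC input layout:
// base tensors first, then one min/max pair per base input.
enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax };

int main() {
  const int num_inputs = 3;  // data, weight, bias
  // in_data layout: [data, weight, bias,
  //                  data_min, data_max, weight_min, weight_max, bias_min, bias_max]
  const int data_min_idx = num_inputs + kDataMin;  // == 3
  const int bias_max_idx = num_inputs + kBiasMax;  // == 8
  return (data_min_idx == 3 && bias_max_idx == 8) ? 0 : 1;
}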
21 changes: 8 additions & 13 deletions src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -31,11 +31,6 @@
 namespace mxnet {
 namespace op {
 
-namespace quantized_fc_enum {
-enum QuantizedFCInputMinMax { kDataMin, kDataMax, kWeightMin, kWeightMax, kBiasMin, kBiasMax };
-enum QuantizedFCOutputs { kOut, kOutMin, kOutMax };
-}
-
 void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
                                           const OpContext &ctx,
                                           const std::vector<NDArray> &in_data,
@@ -52,15 +47,15 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
   NDArray weight = in_data[fullc::kWeight];
 
   const float min_data =
-      in_data[num_inputs + quantized_fc_enum::kDataMin].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
   const float max_data =
-      in_data[num_inputs + quantized_fc_enum::kDataMax].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
   const float min_weight =
-      in_data[num_inputs + quantized_fc_enum::kWeightMin].data().dptr<float>()[0];
+      in_data[num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
   const float max_weight =
-      in_data[num_inputs + quantized_fc_enum::kWeightMax].data().dptr<float>()[0];
-  float *min_output_ptr = out_data[quantized_fc_enum::kOutMin].data().dptr<float>();
-  float *max_output_ptr = out_data[quantized_fc_enum::kOutMax].data().dptr<float>();
+      in_data[num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
+  float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+  float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
 
   auto data_range = (data.dtype() == mshadow::kInt8) ? kInt8Range : kUint8Range;
   float data_scale = data_range / MaxAbs(min_data, max_data);
@@ -69,8 +64,8 @@ void MKLDNNQuantizedFullyConnectedForward(const nnvm::NodeAttrs &attrs,
   NDArray quantized_bias;
   if (!param.no_bias) {
     NDArray bias = in_data[fullc::kBias];
-    float min_bias = in_data[num_inputs + quantized_fc_enum::kBiasMin].data().dptr<float>()[0];
-    float max_bias = in_data[num_inputs + quantized_fc_enum::kBiasMax].data().dptr<float>()[0];
+    float min_bias = in_data[num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+    float max_bias = in_data[num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
     float bias_int32_rescale = data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range;
 
     quantized_bias = NDArray(bias.storage_type(), bias.shape(),
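The bias branch above converts the int8 bias into the int32 accumulator domain of data * weight so MKL-DNN can add it directly to the int32 GEMM result. Below is a standalone sketch of that arithmetic using the same formula as the diff; the concrete calibration ranges and the value 127 for kInt8Range are assumptions of this example.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the int8 -> int32 bias rescaling used above. The variable names mirror
// the diff; the ranges and the 127 constant are assumptions of this example.
int main() {
  const float kInt8Range = 127.0f;
  const float min_data = -1.0f, max_data = 1.0f;      // example calibration ranges
  const float min_weight = -0.5f, max_weight = 0.5f;
  const float min_bias = -2.0f, max_bias = 2.0f;

  auto MaxAbs = [](float a, float b) { return std::fmax(std::fabs(a), std::fabs(b)); };
  const float data_scale = kInt8Range / MaxAbs(min_data, max_data);      // uint8 data would use 255
  const float weight_scale = kInt8Range / MaxAbs(min_weight, max_weight);
  const float bias_int32_rescale =
      data_scale * weight_scale * MaxAbs(min_bias, max_bias) / kInt8Range;

  const int8_t bias_int8 = 64;  // roughly 1.0 in fp32 given the bias range above
  const int32_t bias_int32 = static_cast<int32_t>(std::round(bias_int8 * bias_int32_rescale));
  std::printf("rescale=%f bias_int32=%d\n", bias_int32_rescale, static_cast<int>(bias_int32));
  return 0;
}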
22 changes: 14 additions & 8 deletions src/operator/quantization/quantized_fully_connected.cc
@@ -222,20 +222,26 @@ void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs,
     shiftdata.dptr_[i] = data_temp[i] + shift;
   }
 
-  Tensor<cpu, 1, float> min_output = out_data[1].get<cpu, 1, float>(s);
-  Tensor<cpu, 1, float> max_output = out_data[2].get<cpu, 1, float>(s);
-  Tensor<cpu, 1, float> min_data = in_data[num_inputs].get<cpu, 1, float>(s);
-  Tensor<cpu, 1, float> max_data = in_data[num_inputs + 1].get<cpu, 1, float>(s);
-  Tensor<cpu, 1, float> min_weight = in_data[num_inputs + 2].get<cpu, 1, float>(s);
-  Tensor<cpu, 1, float> max_weight = in_data[num_inputs + 3].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> min_output = out_data[quantized_fullc::kOutMin].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> max_output = out_data[quantized_fullc::kOutMax].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> min_data =
+      in_data[num_inputs + quantized_fullc::kDataMin].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> max_data =
+      in_data[num_inputs + quantized_fullc::kDataMax].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> min_weight =
+      in_data[num_inputs + quantized_fullc::kWeightMin].get<cpu, 1, float>(s);
+  Tensor<cpu, 1, float> max_weight =
+      in_data[num_inputs + quantized_fullc::kWeightMax].get<cpu, 1, float>(s);
 
   Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1, min_output.dptr_,
     max_output.dptr_, min_data.dptr_, max_data.dptr_, min_weight.dptr_, max_weight.dptr_);
   if (!param.no_bias) {
     Tensor<cpu, 1, int8_t> bias = in_data[fullc::kBias].get_with_shape<cpu, 1, int8_t>(
         Shape1(wshape[0]), s);
-    Tensor<cpu, 1, float> min_bias = in_data[num_inputs + 4].get<cpu, 1, float>(s);
-    Tensor<cpu, 1, float> max_bias = in_data[num_inputs + 5].get<cpu, 1, float>(s);
+    Tensor<cpu, 1, float> min_bias =
+        in_data[num_inputs + quantized_fullc::kBiasMin].get<cpu, 1, float>(s);
+    Tensor<cpu, 1, float> max_bias =
+        in_data[num_inputs + quantized_fullc::kBiasMax].get<cpu, 1, float>(s);
 
     Kernel<QuantizedSumInitKernelWithBias, cpu>::Launch(s, n, out.dptr_,
       bias.dptr_, min_output.dptr_, max_output.dptr_, min_bias.dptr_, max_bias.dptr_);
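The QuantizationRangeForMultiplicationStruct launch above derives the float range represented by the int32 output from the data and weight ranges. The sketch below is a rough reconstruction of that idea under the usual convention that an int8 operand maps its max-abs value to 127; it is not the kernel's actual implementation.

#include <cmath>
#include <cstdio>

// Rough sketch (assumption, not the MXNet kernel): the float value carried by one
// int32 step of the product is the product of the per-operand step sizes, so the
// output range is that step size times the int32 extremes.
int main() {
  const float kInt8Range = 127.0f;
  const float kInt32Range = 2147483647.0f;
  const float min_data = -1.0f, max_data = 1.0f;      // example calibration ranges
  const float min_weight = -0.5f, max_weight = 0.5f;

  auto MaxAbs = [](float a, float b) { return std::fmax(std::fabs(a), std::fabs(b)); };
  const float data_step = MaxAbs(min_data, max_data) / kInt8Range;      // float per int8 step
  const float weight_step = MaxAbs(min_weight, max_weight) / kInt8Range;
  const float out_step = data_step * weight_step;                        // float per int32 step

  const float max_output = out_step * kInt32Range;
  const float min_output = -max_output;
  std::printf("output range: [%f, %f]\n", min_output, max_output);
  return 0;
}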
41 changes: 24 additions & 17 deletions src/operator/subgraph/mkldnn/mkldnn_fc.cc
@@ -63,14 +63,15 @@ class SgMKLDNNFCOp {
   nnvm::Symbol subgraph_sym_;
   MKLDNNFCFullParam full_param_;
   std::shared_ptr<MKLDNNFullyConnectedForward> fwd_;
   NDArray cached_weight_;
   NDArray cached_bias_;
   float cached_min_data_;
   float cached_max_data_;
   float cached_min_weight_;
   float cached_max_weight_;
   float cached_min_bias_;
   float cached_max_bias_;
+  float cached_min_output_;
+  float cached_max_output_;
 };

@@ -91,23 +92,19 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
   float max_weight = 0.0;
   float min_bias = 0.0;
   float max_bias = 0.0;
-  float *min_output_ptr = nullptr;
-  float *max_output_ptr = nullptr;
 
   if (mkldnn_param.quantized) {
     total_num_inputs = base_num_inputs * 3;
-    min_data = in_data[base_num_inputs].data().dptr<float>()[0];
-    max_data = in_data[base_num_inputs + 1].data().dptr<float>()[0];
-    min_weight = in_data[base_num_inputs + 2].data().dptr<float>()[0];
-    max_weight = in_data[base_num_inputs + 3].data().dptr<float>()[0];
+    min_data = in_data[base_num_inputs + quantized_fullc::kDataMin].data().dptr<float>()[0];
+    max_data = in_data[base_num_inputs + quantized_fullc::kDataMax].data().dptr<float>()[0];
+    min_weight = in_data[base_num_inputs + quantized_fullc::kWeightMin].data().dptr<float>()[0];
+    max_weight = in_data[base_num_inputs + quantized_fullc::kWeightMax].data().dptr<float>()[0];
     if (has_bias) {
-      min_bias = in_data[base_num_inputs + 4].data().dptr<float>()[0];
-      max_bias = in_data[base_num_inputs + 5].data().dptr<float>()[0];
+      min_bias = in_data[base_num_inputs + quantized_fullc::kBiasMin].data().dptr<float>()[0];
+      max_bias = in_data[base_num_inputs + quantized_fullc::kBiasMax].data().dptr<float>()[0];
     }
     if (!mkldnn_param.enable_float_output) {
       total_num_outputs = base_num_outputs * 3;
-      min_output_ptr = out_data[1].data().dptr<float>();
-      max_output_ptr = out_data[2].data().dptr<float>();
     }
   }
   CHECK_EQ(in_data.size(), total_num_inputs);
@@ -135,6 +132,8 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     cached_max_weight_ = max_weight;
     if (has_bias) {
       cached_bias_ = in_data[fullc::kBias];
+      cached_min_bias_ = min_bias;
+      cached_max_bias_ = max_bias;
     } else {
       cached_bias_ = NDArray();
     }
@@ -149,7 +148,7 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     if (has_bias) {
       NDArray bias = in_data[fullc::kBias];
       float bias_int32_rescale = data_scale * weight_scale *
-          MaxAbs(min_bias, max_bias) / kInt8Range;
+          MaxAbs(cached_min_bias_, cached_max_bias_) / kInt8Range;
 
       cached_bias_ = NDArray(bias.storage_type(), bias.shape(),
                              bias.ctx(), true, mshadow::kInt32);
@@ -168,15 +167,16 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
     } else if (mkldnn_param.min_calib_range.has_value() &&
                mkldnn_param.max_calib_range.has_value()) {
       full_param_.output_scales.resize(0);
-      *min_output_ptr = mkldnn_param.min_calib_range.value();
-      *max_output_ptr = mkldnn_param.max_calib_range.value();
+      cached_min_output_ = mkldnn_param.min_calib_range.value();
+      cached_max_output_ = mkldnn_param.max_calib_range.value();
 
       full_param_.requantize_scales[0] = quantized_out_range /
-          MaxAbs(*min_output_ptr, *max_output_ptr) / data_scale / weight_scale;
+          MaxAbs(cached_min_output_, cached_max_output_) / data_scale / weight_scale;
     } else {
       Stream<cpu> *s = ctx.get_stream<cpu>();
-      mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(s, 1,
-          min_output_ptr, max_output_ptr, &min_data, &max_data, &min_weight, &max_weight);
+      mxnet_op::Kernel<QuantizationRangeForMultiplicationStruct, cpu>::Launch(
+          s, 1, &cached_min_output_, &cached_max_output_,
+          &min_data, &max_data, &min_weight, &max_weight);
     }
   }

@@ -195,6 +195,13 @@ void SgMKLDNNFCOp::Forward(const OpContext &ctx,
   }
 
   MKLDNNFCForwardFullFeature(full_param_, ctx, fwd_.get(), new_inputs, new_req, out_data);
+
+  if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) {
+    float *min_output_ptr = out_data[quantized_fullc::kOutMin].data().dptr<float>();
+    float *max_output_ptr = out_data[quantized_fullc::kOutMax].data().dptr<float>();
+    *min_output_ptr = cached_min_output_;
+    *max_output_ptr = cached_max_output_;
+  }
 }
 
 static void SgMKLDNNFCParamParser(nnvm::NodeAttrs *attrs) {
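Because the calibrated output range is now cached, the requantize scale in the min/max_calib_range branch is computed once, and each forward pass only copies cached_min_output_/cached_max_output_ into the output min/max tensors after the primitive runs. The sketch below reproduces that scale arithmetic in isolation; the int8 output range of 127 and the example calibration values are assumptions, not values from the diff.

#include <cmath>
#include <cstdio>

// Sketch of the requantize-scale computation in the calibrated branch above.
// Assumption: the op outputs int8, so quantized_out_range is taken as 127.
int main() {
  const float quantized_out_range = 127.0f;
  const float data_scale = 127.0f / 1.0f;      // example: data calibrated to [-1, 1]
  const float weight_scale = 127.0f / 0.5f;    // example: weights calibrated to [-0.5, 0.5]
  const float cached_min_output = -10.0f;      // example calibration values
  const float cached_max_output = 10.0f;

  auto MaxAbs = [](float a, float b) { return std::fmax(std::fabs(a), std::fabs(b)); };
  // int32 accumulator value * requantize_scale -> int8 output value
  const float requantize_scale = quantized_out_range /
      MaxAbs(cached_min_output, cached_max_output) / data_scale / weight_scale;
  std::printf("requantize_scale = %g\n", requantize_scale);
  return 0;
}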
