From 624e3e52089e7577ada38e061d0f3f299b1fac6d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 20:41:46 +0800 Subject: [PATCH 01/10] add MKL Packed RecurrentLayer --- paddle/gserver/CMakeLists.txt | 10 + paddle/gserver/layers/MKLPackedGemm.h | 94 ++++++ .../layers/MKLPackedRecurrentLayer.cpp | 311 ++++++++++++++++++ .../gserver/layers/MKLPackedRecurrentLayer.h | 131 ++++++++ 4 files changed, 546 insertions(+) create mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.cpp create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.h diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 41ead3c5ecef24..3d6ced713f00bd 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -34,6 +34,16 @@ else() message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations") endif() +if(NOT WITH_MKLML) + file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h") + file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp") + list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER}) + list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES}) + message(STATUS "Skip compiling with MKLPackedLayers") +else() + message(STATUS "Compile with MKLPackedLayers") +endif() + if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvBaseLayer.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h new file mode 100644 index 00000000000000..3c4c62eeb87f6a --- /dev/null +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +class MKLPackedGemm { +protected: + real* weightPacked_; + real* weightTPacked_; + size_t weightHeight_; + size_t weightWidth_; + +public: + MKLPackedGemm(MatrixPtr weight) { + weightHeight_ = weight->getHeight(); + weightWidth_ = weight->getWidth(); + weightPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + weightTPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasNoTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightPacked_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightTPacked_); + } + void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { + if (transW) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightTPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } else { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } + } + ~MKLPackedGemm() { + cblas_sgemm_free(weightPacked_); + cblas_sgemm_free(weightTPacked_); + } +}; +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp new file mode 100644 index 00000000000000..6f455af91ed800 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
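
For reference, MKLPackedGemm above boils down to MKL's packed-GEMM extension: the weight (the B operand) is packed once with cblas_sgemm_alloc()/cblas_sgemm_pack() and then reused by every cblas_sgemm_compute() call, which is what makes it attractive for a recurrent weight that multiplies activations at every time step. A minimal sketch of that call sequence follows; the matrix sizes and the <mkl.h> include are illustrative assumptions, not taken from this patch.

    #include <mkl.h>
    #include <vector>

    void packed_gemm_sketch() {
      const int h = 4, w = 4;              // weight is h x w, row-major
      std::vector<float> W(h * w, 0.1f);   // plays the role of weight_->getData()
      std::vector<float> in(2 * h, 1.0f);  // two rows of input activations
      std::vector<float> out(2 * w, 0.0f); // two rows of output, accumulated into

      // Pack the B operand (the weight) once.
      float* Wp = cblas_sgemm_alloc(CblasBMatrix, 1, w, h);
      cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans,
                       1, w, h, 1.0f, W.data(), w, Wp);

      // Reuse the packed weight for any number of products:
      // out(2 x w) += in(2 x h) * W(h x w), i.e. beta == 1 as in compute() above.
      cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
                          2, w, h, in.data(), h, Wp, w, 1.0f, out.data(), w);

      cblas_sgemm_free(Wp);
    }
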
*/ + +#include "MKLPackedRecurrentLayer.h" + +namespace paddle { + +REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); + +bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); + weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize(), biasParameter_)); + } + reversed_ = config_.reversed(); + + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); + + return true; +} + +void MKLPackedRecurrentLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->zeroMem(); +} + +void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; + prevOutput_->copyFrom(*(state->value[0])); +} + +LayerStatePtr MKLPackedRecurrentLayer::getState() { + LayerStatePtr res = std::make_shared(); + res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + return res; +} + +void MKLPackedRecurrentLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); + Layer::forward(passType); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize(), input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + output_.value->assign(*input.value); + if (bias_) { + output_.value->addBias(*bias_->getW(), 1); + } + if (!FLAGS_rnn_use_batch) { + forwardSequence(batchSize, numSequences, starts); + } else { + forwardBatch(batchSize, numSequences, starts); + } +} + +void MKLPackedRecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); + + frameOutput_.reserve(batchSize); + for (int i = frameOutput_.size(); i < batchSize; ++i) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + frameOutput_.push_back(arg); + } + + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + forwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { + if (!reversed_) { + if (prevOutput_) { + frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + activation_->forward(frameOutput_[start]).check(); + + for (int i = 1; i < length; ++i) { + frameOutput_[start + i].value->mul( + *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + if (prevOutput_) { + prevOutput_->assign(*frameOutput_[start + length - 1].value); + } + } else { + activation_->forward(frameOutput_[start + length - 1]).check(); + for (int i = length - 2; i >= 0; --i) { + 
frameOutput_[start + i].value->mul( + *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + } +} + +void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + if (!FLAGS_rnn_use_batch) { + backwardSequence(batchSize, numSequences, starts); + } else { + backwardBatch(batchSize, numSequences, starts); + } + + if (input.grad) { + input.grad->add(*output_.grad); + } + + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*output_.grad, 1); + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); +} + +void MKLPackedRecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + backwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::backwardOneSequence(int start, int length) { + MatrixPtr weightT = weight_->getW()->getTranspose(); + if (!reversed_) { + for (int i = length - 1; i > 0; --i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i - 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } + } else { + for (int i = 0; i < length - 1; ++i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i + 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start + length - 1]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *output_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } +} + +void MKLPackedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->copyFromSeq(*output_.value); + + { + REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); + /* forward one batch */ + for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { + MatrixPtr batch2 = batchValue_->getBatchValue(n); + + if (n != 0) { + MatrixPtr batch1 = + batchValue_->getBatchValue(n - 1, batch2->getHeight()); + + // batch2->mul(*batch1, *weight_->getW(), 1, 1); + sgemm_packed_->compute(batch2, batch1); + } + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < batch2->getHeight(); i++) { + for (size_t j = 0; j < batch2->getWidth(); j++) { + *(batch2->getData() + i * batch2->getWidth() + j) = + *(batch2->getData() + i * batch2->getWidth() + j) > 0 + ? 
*(batch2->getData() + i * batch2->getWidth() + j) + : 0; + } + } + } + } + + batchValue_->copyBackSeq(*output_.value); +} + +void MKLPackedRecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + size_t numBatch = batchGrad_->getNumBatch(); + bool backwardByBatch = numBatch < numSequences; + + batchGrad_->copyFromSeq(*output_.grad); + { + REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); + /* backward one batch */ + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr batch2 = batchGrad_->getBatchValue(n); + MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + + Argument arg; + arg.value = batch1; + arg.grad = batch2; + activation_->backward(arg).check(); + + if (n != 0) { + batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); + // batch1->mul(*batch2, *weightT, 1, 1); + sgemm_packed_->compute(batch1, batch2, true); + } + + if (backwardByBatch && weight_->getWGrad()) { + if (n != 0) { + /* backward weight */ + batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); + weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + } + } + } + } + + batchGrad_->copyBackSeq(*output_.grad); + + if (!backwardByBatch && weight_->getWGrad()) { + REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); + for (size_t seq = 0; seq < numSequences; ++seq) { + int len = starts[seq + 1] - starts[seq]; + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); + } else { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h new file mode 100644 index 00000000000000..719137f2db74f8 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "Layer.h" +#include "MKLPackedGemm.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +DECLARE_bool(rnn_use_batch); + +namespace paddle { + +/** + * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the + * same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. 
One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class MKLPackedRecurrentLayer : public Layer { +public: + explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardBatch(int batchSize, size_t numSequences, const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardBatch(int batchSize, size_t numSequences, const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. 
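
The recurrence described in the doxygen block at the top of this class can be written out directly. A plain reference loop for the non-reversed case, using ReLU as the activation (the activation configured by the unit test added later in this series) and illustrative raw arrays rather than Paddle matrices:

    #include <algorithm>

    // out[i] = act(in[i] + out[i-1] * W) for one sequence of `length` frames of
    // width `size`; W is size x size, row-major. Frame 0 has no recurrent term,
    // matching out_start = act(in_start) in the comment above.
    void referenceForwardOneSequence(const float* in, float* out,
                                     const float* W, int length, int size) {
      for (int i = 0; i < length; ++i) {
        for (int c = 0; c < size; ++c) {
          float v = in[i * size + c];
          if (i > 0) {
            for (int k = 0; k < size; ++k) {
              v += out[(i - 1) * size + k] * W[k * size + c];
            }
          }
          out[i * size + c] = std::max(v, 0.0f);  // ReLU
        }
      }
    }
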
+ std::unique_ptr batchGrad_; + + std::unique_ptr sgemm_packed_; +}; +} From 2e101df7c656d524c92b6c31711a8bbcaf7ce09f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 21:40:34 +0800 Subject: [PATCH 02/10] enable gtest for MKLPackedRecurrentLayer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 165 ++++++++++++++++++- 1 file changed, 159 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 16ab0e6aecb6a8..1f31158579a2c9 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -420,12 +420,165 @@ TEST(Layer, LstmLayer) { } } +#ifdef PADDLE_WITH_MKLML + +LayerPtr initMKLPackedLayer(LayerConfig layerConfig, + bool reversed, + int layerSize, + LayerPtr dataLayer, + ParameterPtr para, + ParameterPtr bias = nullptr) { + LayerMap layerMap; + ParameterMap parameterMap; + layerMap[dataLayer->getName()] = dataLayer; + parameterMap[para->getName()] = para; + if (bias) { + parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + outputGrad.randomizeUniform(); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = true; + + testLayer1->forward(PASS_GC); + + testLayer1->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer1->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); + + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } + + for (int i = 0; i < 2; i++) { + CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), + testLayer2->getOutputValue()->getWidth()); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + outputValue.copyFrom(*testLayer2->getOutputValue()); + + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = false; + + testLayer2->getOutputValue()->zero(); + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + 
weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(outputValue, *testLayer2->getOutputValue()); + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } +} + +TEST(MKLPackedLayer, RecurrentLayer) { + LayerConfig layerConfig1; + LayerConfig layerConfig2; + + layerConfig1.set_name("paddle-rnn"); + layerConfig1.set_type("recurrent"); + layerConfig1.set_active_type("relu"); + + layerConfig2.set_name("mkl-packed-rnn"); + layerConfig2.set_type("mkl_packed_recurrent"); + layerConfig2.set_active_type("relu"); + + for (auto layerSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " reversed=" << reversed; + + LayerPtr dataLayer = + creatDataLayer("layer_0", batchSize, layerSize, false); + ParameterPtr para = + creatParameter("para_0", 0, layerSize * layerSize, false); + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para); + + checkMKLPackedLayer(testLayer1, testLayer2); + } + } + } +} +#endif + int main(int argc, char** argv) { - if (version::isWithGpu()) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - return RUN_ALL_TESTS(); - } else { - return 0; + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + if (!version::isWithGpu()) { + testing::GTEST_FLAG(filter) = "-Layer.*"; } + return RUN_ALL_TESTS(); } From 0f8aad2934f88540249da6d7e5c8e8ceeafd60ec Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 23:06:54 +0800 Subject: [PATCH 03/10] fix compile error --- paddle/gserver/layers/MKLPackedGemm.h | 3 ++- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h index 3c4c62eeb87f6a..91e2515e323217 100644 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -27,7 +27,7 @@ class MKLPackedGemm { size_t weightWidth_; public: - MKLPackedGemm(MatrixPtr weight) { + explicit MKLPackedGemm(MatrixPtr weight) { weightHeight_ = weight->getHeight(); weightWidth_ = weight->getWidth(); weightPacked_ = @@ -91,4 +91,5 @@ class MKLPackedGemm { cblas_sgemm_free(weightTPacked_); } }; + } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 719137f2db74f8..b8727e0ff30994 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + #include #include "Layer.h" #include "MKLPackedGemm.h" @@ -128,4 +130,5 @@ class MKLPackedRecurrentLayer : public Layer { std::unique_ptr sgemm_packed_; }; -} + +} // namespace paddle From b95834dc0c43cedd27124e18a13345712dcf9d47 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Dec 2017 09:05:41 +0800 Subject: [PATCH 04/10] disable use_gpu when test mkl recurrent layer comparing with cpu --- paddle/gserver/tests/test_RecurrentLayer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 1f31158579a2c9..44d84dd8be2f7c 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -551,6 +551,8 @@ TEST(MKLPackedLayer, RecurrentLayer) { layerConfig2.set_type("mkl_packed_recurrent"); layerConfig2.set_active_type("relu"); + FLAGS_use_gpu = false; + for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { From 0b080a42da85d67d7a900a9b23bdcea6cfcbc01c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:36:59 +0800 Subject: [PATCH 05/10] add recurrent layer header --- paddle/gserver/layers/RecurrentLayer.cpp | 106 +----------------- paddle/gserver/layers/RecurrentLayer.h | 130 +++++++++++++++++++++++ 2 files changed, 131 insertions(+), 105 deletions(-) create mode 100644 paddle/gserver/layers/RecurrentLayer.h diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index e4c2b483d2fa40..285b11b5a027cd 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "RecurrentLayer.h" #include #include "Layer.h" #include "SequenceToBatch.h" @@ -21,110 +22,6 @@ DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); namespace paddle { -/** - * @brief RecurrentLayer takes 1 input layer. The output size is the same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. - */ - -class RecurrentLayer : public Layer { -public: - explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - -protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. 
- * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); - -protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; -}; - REGISTER_LAYER(recurrent, RecurrentLayer); bool RecurrentLayer::init(const LayerMap& layerMap, @@ -260,7 +157,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) { bias_->getWGrad()->collectBias(*output_.grad, 1); bias_->getParameterPtr()->incUpdate(callback); } - weight_->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h new file mode 100644 index 00000000000000..f40dbe150fa93b --- /dev/null +++ b/paddle/gserver/layers/RecurrentLayer.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "Layer.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief RecurrentLayer takes 1 input layer. The output size is the same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class RecurrentLayer : public Layer { +public: + explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void forwardBatch(int batchSize, + size_t numSequences, + const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. 
+ */ + virtual void backwardBatch(int batchSize, + size_t numSequences, + const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. + std::unique_ptr batchGrad_; +}; + +} // namespace paddle From 82091035514c0ddeae2c18ff5f523a2647d59948 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:43:25 +0800 Subject: [PATCH 06/10] follow comments and refine code --- paddle/gserver/layers/MKLPackedGemm.h | 95 --------- .../layers/MKLPackedRecurrentLayer.cpp | 191 ++---------------- .../gserver/layers/MKLPackedRecurrentLayer.h | 87 ++------ paddle/gserver/layers/MKLPackedWeight.h | 100 +++++++++ 4 files changed, 125 insertions(+), 348 deletions(-) delete mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedWeight.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h deleted file mode 100644 index 91e2515e323217..00000000000000 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/math/MathFunctions.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -class MKLPackedGemm { -protected: - real* weightPacked_; - real* weightTPacked_; - size_t weightHeight_; - size_t weightWidth_; - -public: - explicit MKLPackedGemm(MatrixPtr weight) { - weightHeight_ = weight->getHeight(); - weightWidth_ = weight->getWidth(); - weightPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - weightTPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasNoTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightPacked_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightTPacked_); - } - void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { - if (transW) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightTPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } else { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } - } - ~MKLPackedGemm() { - cblas_sgemm_free(weightPacked_); - cblas_sgemm_free(weightTPacked_); - } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index 6f455af91ed800..bd3c4ceb5e0956 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -20,188 +20,21 @@ REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); - weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize(), biasParameter_)); + if (!RecurrentLayer::init(layerMap, parameterMap)) return false; + packed_weight_.reset(new MKLPackedWeight(weight_->getW())); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true)); + packed_weightT_->pack(); } - reversed_ = config_.reversed(); - - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); - return true; } -void MKLPackedRecurrentLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->zeroMem(); -} - -void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; - prevOutput_->copyFrom(*(state->value[0])); -} - -LayerStatePtr MKLPackedRecurrentLayer::getState() { - LayerStatePtr res = std::make_shared(); - res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - return res; -} - -void MKLPackedRecurrentLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); 
- Layer::forward(passType); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize(), input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - output_.value->assign(*input.value); - if (bias_) { - output_.value->addBias(*bias_->getW(), 1); - } - if (!FLAGS_rnn_use_batch) { - forwardSequence(batchSize, numSequences, starts); - } else { - forwardBatch(batchSize, numSequences, starts); - } -} - -void MKLPackedRecurrentLayer::forwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); - - frameOutput_.reserve(batchSize); - for (int i = frameOutput_.size(); i < batchSize; ++i) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - frameOutput_.push_back(arg); - } - - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - forwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { - if (!reversed_) { - if (prevOutput_) { - frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); - } - activation_->forward(frameOutput_[start]).check(); - - for (int i = 1; i < length; ++i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - if (prevOutput_) { - prevOutput_->assign(*frameOutput_[start + length - 1].value); - } - } else { - activation_->forward(frameOutput_[start + length - 1]).check(); - for (int i = length - 2; i >= 0; --i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - } -} - void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - if (!FLAGS_rnn_use_batch) { - backwardSequence(batchSize, numSequences, starts); - } else { - backwardBatch(batchSize, numSequences, starts); - } - - if (input.grad) { - input.grad->add(*output_.grad); - } - - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*output_.grad, 1); - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); -} - -void MKLPackedRecurrentLayer::backwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - backwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::backwardOneSequence(int start, int 
length) { - MatrixPtr weightT = weight_->getW()->getTranspose(); - if (!reversed_) { - for (int i = length - 1; i > 0; --i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i - 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start, length - 1)->getTranspose(), - *output_.grad->subMatrix(start + 1, length - 1), - 1, - 1); - } - } else { - for (int i = 0; i < length - 1; ++i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i + 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start + length - 1]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - *output_.grad->subMatrix(start, length - 1), - 1, - 1); - } + RecurrentLayer::backward(callback); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_->pack(); } } @@ -227,7 +60,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, batchValue_->getBatchValue(n - 1, batch2->getHeight()); // batch2->mul(*batch1, *weight_->getW(), 1, 1); - sgemm_packed_->compute(batch2, batch1); + packed_weight_->compute(batch2, batch1); } #pragma omp parallel for collapse(2) @@ -272,7 +105,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); // batch1->mul(*batch2, *weightT, 1, 1); - sgemm_packed_->compute(batch1, batch2, true); + packed_weightT_->compute(batch1, batch2); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index b8727e0ff30994..ba6487b11e0e90 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -16,7 +16,8 @@ limitations under the License. */ #include #include "Layer.h" -#include "MKLPackedGemm.h" +#include "MKLPackedWeight.h" +#include "RecurrentLayer.h" #include "SequenceToBatch.h" #include "paddle/utils/Stat.h" @@ -45,90 +46,28 @@ namespace paddle { * them by rnn_use_batch flag. */ -class MKLPackedRecurrentLayer : public Layer { +class MKLPackedRecurrentLayer : public RecurrentLayer { public: - explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + explicit MKLPackedRecurrentLayer(const LayerConfig& config) + : RecurrentLayer(config) {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. 
- */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); + void backwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; - - std::unique_ptr sgemm_packed_; + std::unique_ptr packed_weight_; + std::unique_ptr packed_weightT_; }; } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h new file mode 100644 index 00000000000000..a8dcfd561b6550 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/parameter/Parameter.h" +#include "paddle/parameter/Weight.h" + +namespace paddle { + +class MKLPackedWeight { +protected: + real *weight_; + real *packedWeight_; + size_t height_; + size_t width_; + bool transW_; + +public: + MKLPackedWeight(MatrixPtr weight, bool transW = false) { + packedWeight_ = nullptr; + weight_ = weight->getData(); + height_ = weight->getHeight(); + width_ = weight->getWidth(); + transW_ = transW; + } + + ~MKLPackedWeight() { free_(); } + + void pack() { pack_(weight_); } + + void compute(MatrixPtr dst, MatrixPtr src) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src->getHeight(), + transW_ ? height_ : width_, + transW_ ? width_ : height_, + src->getData(), + src->getWidth(), + packedWeight_, + width_, + 1.0, + dst->getData(), + dst->getWidth()); + } + + void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + M, + width_, + height_, + A, + lda, + packedWeight_, + width_, + 1.0, + C, + ldc); + } + +protected: + void pack_(real *src) { + if (!packedWeight_) { + packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + transW_ ? CblasTrans : CblasNoTrans, + 1, + transW_ ? height_ : width_, + transW_ ? width_ : height_, + 1.0, + src, + width_, + packedWeight_); + } + + void free_() { + if (packedWeight_) { + cblas_sgemm_free(packedWeight_); + } + } +}; + +} // namespace paddle From 0596cd8826ddf94c53fd2d834a189be2b829a595 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 14:45:07 +0800 Subject: [PATCH 07/10] refine test recurrent layer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 119 ++++++++----------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 44d84dd8be2f7c..0e130843339a10 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/GatedRecurrentLayer.h" #include "paddle/gserver/layers/LstmLayer.h" +#include "paddle/gserver/layers/RecurrentLayer.h" template class TestRecurrentLayer { public: @@ -422,6 +423,8 @@ TEST(Layer, LstmLayer) { #ifdef PADDLE_WITH_MKLML +#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" + LayerPtr initMKLPackedLayer(LayerConfig layerConfig, bool reversed, int layerSize, @@ -453,7 +456,31 @@ LayerPtr initMKLPackedLayer(LayerConfig layerConfig, return testLayer; } -void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, 
dataLayer, para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + const VectorPtr& weightGrad = (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); @@ -462,78 +489,34 @@ void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - outputGrad.randomizeUniform(); - for (int i = 0; i < 2; i++) { - FLAGS_rnn_use_batch = true; + FLAGS_rnn_use_batch = useBatch1; testLayer1->forward(PASS_GC); - testLayer1->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer1->backward(nullptr); - - wgt_grad1.copyFrom(*weightGrad); - input_grad1.copyFrom(*inputGrad); - - FLAGS_rnn_use_batch = true; - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer2->backward(nullptr); - - wgt_grad2.copyFrom(*weightGrad); - input_grad2.copyFrom(*inputGrad); - - checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); - - checkError(wgt_grad1, wgt_grad2); - checkError(input_grad1, input_grad2); - } - - for (int i = 0; i < 2; i++) { - CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), - testLayer2->getOutputValue()->getWidth()); - - FLAGS_rnn_use_batch = true; - + FLAGS_rnn_use_batch = useBatch2; testLayer2->forward(PASS_GC); - outputValue.copyFrom(*testLayer2->getOutputValue()); - testLayer2->getOutputGrad()->copyFrom(outputGrad); + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); weightGrad->zero(); inputGrad->zero(); - - testLayer2->backward(nullptr); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); wgt_grad1.copyFrom(*weightGrad); input_grad1.copyFrom(*inputGrad); - FLAGS_rnn_use_batch = false; - - testLayer2->getOutputValue()->zero(); - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - weightGrad->zero(); inputGrad->zero(); - + FLAGS_rnn_use_batch = useBatch2; testLayer2->backward(nullptr); wgt_grad2.copyFrom(*weightGrad); input_grad2.copyFrom(*inputGrad); - checkError(outputValue, *testLayer2->getOutputValue()); + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); checkError(wgt_grad1, wgt_grad2); checkError(input_grad1, input_grad2); } @@ -556,20 +539,22 @@ TEST(MKLPackedLayer, RecurrentLayer) { for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " reversed=" << reversed; - - LayerPtr dataLayer = - creatDataLayer("layer_0", batchSize, layerSize, false); - ParameterPtr para = - creatParameter("para_0", 0, layerSize * layerSize, false); - - LayerPtr testLayer1 = initMKLPackedLayer( - layerConfig1, reversed, layerSize, dataLayer, para); - LayerPtr testLayer2 = initMKLPackedLayer( - layerConfig2, reversed, layerSize, dataLayer, para); - - checkMKLPackedLayer(testLayer1, testLayer2); + for (auto paddle_use_batch : {true, false}) { + for (auto MKLPacked_use_batch : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize + << " batchSize=" << batchSize << " reversed=" << reversed + << " 
paddle_use_batch=" << paddle_use_batch + << " MKLPacked_use_batch=" << MKLPacked_use_batch; + + checkMKLPackedLayer(layerConfig1, + layerConfig2, + reversed, + layerSize, + batchSize, + paddle_use_batch, + MKLPacked_use_batch); + } + } } } } From 4360615850a01a32747b7a4e4d8f99f0ff8c6252 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 16:34:24 +0800 Subject: [PATCH 08/10] fix compile error --- paddle/gserver/layers/MKLPackedWeight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index a8dcfd561b6550..cc8a336154970c 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -29,7 +29,7 @@ class MKLPackedWeight { bool transW_; public: - MKLPackedWeight(MatrixPtr weight, bool transW = false) { + explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { packedWeight_ = nullptr; weight_ = weight->getData(); height_ = weight->getHeight(); From df2b054b13d19d467afa51aafdf1871569c6fa56 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 11:37:55 +0800 Subject: [PATCH 09/10] follow comments refine code --- .../layers/MKLPackedRecurrentLayer.cpp | 64 ++++++++----------- .../gserver/layers/MKLPackedRecurrentLayer.h | 29 ++------- paddle/gserver/layers/MKLPackedWeight.h | 20 +----- paddle/gserver/layers/RecurrentLayer.cpp | 4 -- 4 files changed, 36 insertions(+), 81 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index bd3c4ceb5e0956..b4a6413048327c 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -53,28 +53,19 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); /* forward one batch */ for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { - MatrixPtr batch2 = batchValue_->getBatchValue(n); + MatrixPtr batchValue = batchValue_->getBatchValue(n); if (n != 0) { - MatrixPtr batch1 = - batchValue_->getBatchValue(n - 1, batch2->getHeight()); + MatrixPtr preBatchValue = + batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - // batch2->mul(*batch1, *weight_->getW(), 1, 1); - packed_weight_->compute(batch2, batch1); - } - -#pragma omp parallel for collapse(2) - for (size_t i = 0; i < batch2->getHeight(); i++) { - for (size_t j = 0; j < batch2->getWidth(); j++) { - *(batch2->getData() + i * batch2->getWidth() + j) = - *(batch2->getData() + i * batch2->getWidth() + j) > 0 - ? 
*(batch2->getData() + i * batch2->getWidth() + j) - : 0; - } + packed_weight_->compute(batchValue, preBatchValue); } + Argument arg; + arg.value = batchValue; + activation_->forward(arg).check(); } } - batchValue_->copyBackSeq(*output_.value); } @@ -94,25 +85,27 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); /* backward one batch */ for (int n = (int)numBatch - 1; n >= 0; n--) { - MatrixPtr batch2 = batchGrad_->getBatchValue(n); - MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + MatrixPtr batchGrad = batchGrad_->getBatchValue(n); + MatrixPtr batchValue = + batchValue_->getBatchValue(n, batchGrad->getHeight()); Argument arg; - arg.value = batch1; - arg.grad = batch2; + arg.value = batchValue; + arg.grad = batchGrad; activation_->backward(arg).check(); if (n != 0) { - batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); - // batch1->mul(*batch2, *weightT, 1, 1); - packed_weightT_->compute(batch1, batch2); + batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); + packed_weightT_->compute(batchValue, batchGrad); } if (backwardByBatch && weight_->getWGrad()) { if (n != 0) { /* backward weight */ - batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); - weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + batchValue = + batchValue_->getBatchValue(n - 1, batchGrad->getHeight()); + weight_->getWGrad()->mul( + *batchValue->getTranspose(), *batchGrad, 1, 1); } } } @@ -124,19 +117,14 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); for (size_t seq = 0; seq < numSequences; ++seq) { int len = starts[seq + 1] - starts[seq]; - if (!reversed_) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq] + 1, len - 1), - 1, - 1); - } else { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq], len - 1), - 1, - 1); - } + weight_->getWGrad()->mul( + *output_.value + ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1) + ->getTranspose(), + *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1, + len - 1), + 1, + 1); } } } diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index ba6487b11e0e90..19874d538e919c 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -14,36 +14,18 @@ limitations under the License. */ #pragma once -#include -#include "Layer.h" #include "MKLPackedWeight.h" #include "RecurrentLayer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the - * same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. 
The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. + * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized + * with MKL cblas packed gemm. + * More details: + * /~https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ class MKLPackedRecurrentLayer : public RecurrentLayer { @@ -66,7 +48,10 @@ class MKLPackedRecurrentLayer : public RecurrentLayer { const int* starts) override; protected: + /// packed_weight_ is contains same data with + /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; + /// packed_weightT_ is the transposition matrix of packed_weight_ std::unique_ptr packed_weightT_; }; diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index cc8a336154970c..f77aa4dbbf5ddb 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,7 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: + /// The pointor of weight real *weight_; + /// The pointor of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -41,7 +43,7 @@ class MKLPackedWeight { void pack() { pack_(weight_); } - void compute(MatrixPtr dst, MatrixPtr src) { + void compute(MatrixPtr dst, const MatrixPtr src) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked, @@ -57,22 +59,6 @@ class MKLPackedWeight { dst->getWidth()); } - void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - M, - width_, - height_, - A, - lda, - packedWeight_, - width_, - 1.0, - C, - ldc); - } - protected: void pack_(real *src) { if (!packedWeight_) { diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 285b11b5a027cd..6bd42c06cadf75 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "RecurrentLayer.h" -#include -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); From 89cb3a249cc5ecbf3955ea37e67655ec431142e3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 14:02:01 +0800 Subject: [PATCH 10/10] follow comments, refine comment and function name --- paddle/gserver/layers/MKLPackedRecurrentLayer.cpp | 4 ++-- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 6 +++--- paddle/gserver/layers/MKLPackedWeight.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index b4a6413048327c..dd75555fae1346 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -59,7 +59,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, MatrixPtr preBatchValue = batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - packed_weight_->compute(batchValue, preBatchValue); + packed_weight_->gemm_compute(preBatchValue, batchValue); } Argument arg; arg.value = batchValue; @@ -96,7 +96,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); - packed_weightT_->compute(batchValue, batchGrad); + packed_weightT_->gemm_compute(batchGrad, batchValue); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 19874d538e919c..bded523a8fbd6f 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -22,8 +22,8 @@ DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized - * with MKL cblas packed gemm. + * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer + * but is optimized with MKL cblas packed gemm. * More details: * /~https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ @@ -48,7 +48,7 @@ class MKLPackedRecurrentLayer : public RecurrentLayer { const int* starts) override; protected: - /// packed_weight_ is contains same data with + /// packed_weight_ contains same data with /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; /// packed_weightT_ is the transposition matrix of packed_weight_ diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index f77aa4dbbf5ddb..15d5093beb43e2 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,9 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: - /// The pointor of weight + /// The pointer of weight real *weight_; - /// The pointor of cblas packed gemm to weight + /// The pointer of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -43,7 +43,7 @@ class MKLPackedWeight { void pack() { pack_(weight_); } - void compute(MatrixPtr dst, const MatrixPtr src) { + void gemm_compute(const MatrixPtr src, MatrixPtr dst) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
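
The common thread through these patches is MKL's packed GEMM API: the recurrent weight is packed once with cblas_sgemm_pack() and the packed buffer is then reused by cblas_sgemm_compute() for every per-step batch, which is what MKLPackedWeight::pack() and gemm_compute() wrap. Below is a minimal standalone sketch of that call pattern, assuming the MKL headers and library are available; the toy sizes, buffer names, and the loop driver are illustrative only and are not taken from the patch.

#include <mkl_cblas.h>
#include <vector>

int main() {
  const int n = 4;     // layer size: the recurrent weight W is n x n
  const int rows = 3;  // height of one per-step batch (active sequences)

  std::vector<float> W(n * n, 0.1f);      // row-major weight
  std::vector<float> in(rows * n, 1.0f);  // current batch value  (A)
  std::vector<float> out(rows * n, 0.0f); // accumulated output   (C)

  // Pack W once as the "B" operand; the patch likewise passes 1 for m
  // when allocating and packing the B matrix.
  float* packedW = cblas_sgemm_alloc(CblasBMatrix, 1, n, n);
  cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans,
                   1, n, n, 1.0f, W.data(), n, packedW);

  // Every recurrent step reuses the packed weight: out += in * W.
  // beta = 1.0 accumulates into C, matching the beta of 1 used by
  // MKLPackedWeight::gemm_compute().
  for (int step = 0; step < 10; ++step) {
    cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
                        rows, n, n,
                        in.data(), n,
                        packedW, n,
                        1.0f,
                        out.data(), n);
  }

  cblas_sgemm_free(packedW);
  return 0;
}

Packing pays off in this layer because the same weight takes part in one small GEMM per batch of time steps, so the one-time pack cost is amortized over the whole sequence, whereas a plain cblas_sgemm call would redo that internal packing on every step.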