From 624e3e52089e7577ada38e061d0f3f299b1fac6d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 20:41:46 +0800 Subject: [PATCH 01/10] add MKL Packed RecurrentLayer --- paddle/gserver/CMakeLists.txt | 10 + paddle/gserver/layers/MKLPackedGemm.h | 94 ++++++ .../layers/MKLPackedRecurrentLayer.cpp | 311 ++++++++++++++++++ .../gserver/layers/MKLPackedRecurrentLayer.h | 131 ++++++++ 4 files changed, 546 insertions(+) create mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.cpp create mode 100644 paddle/gserver/layers/MKLPackedRecurrentLayer.h diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 41ead3c5ecef24..3d6ced713f00bd 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -34,6 +34,16 @@ else() message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations") endif() +if(NOT WITH_MKLML) + file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h") + file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp") + list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER}) + list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES}) + message(STATUS "Skip compiling with MKLPackedLayers") +else() + message(STATUS "Compile with MKLPackedLayers") +endif() + if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER layers/CudnnConvBaseLayer.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h new file mode 100644 index 00000000000000..3c4c62eeb87f6a --- /dev/null +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +class MKLPackedGemm { +protected: + real* weightPacked_; + real* weightTPacked_; + size_t weightHeight_; + size_t weightWidth_; + +public: + MKLPackedGemm(MatrixPtr weight) { + weightHeight_ = weight->getHeight(); + weightWidth_ = weight->getWidth(); + weightPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + weightTPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasNoTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightPacked_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightTPacked_); + } + void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { + if (transW) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightTPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } else { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } + } + ~MKLPackedGemm() { + cblas_sgemm_free(weightPacked_); + cblas_sgemm_free(weightTPacked_); + } +}; +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp new file mode 100644 index 00000000000000..6f455af91ed800 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
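
For reference, MKLPackedGemm above boils down to MKL's packed-GEMM extension: the weight (the B operand) is packed once with cblas_sgemm_alloc()/cblas_sgemm_pack() and then reused by every cblas_sgemm_compute() call, which is what makes it attractive for a recurrent weight that multiplies activations at every time step. A minimal sketch of that call sequence follows; the matrix sizes and the <mkl.h> include are illustrative assumptions, not taken from this patch.

    #include <mkl.h>
    #include <vector>

    void packed_gemm_sketch() {
      const int h = 4, w = 4;              // weight is h x w, row-major
      std::vector<float> W(h * w, 0.1f);   // plays the role of weight_->getData()
      std::vector<float> in(2 * h, 1.0f);  // two rows of input activations
      std::vector<float> out(2 * w, 0.0f); // two rows of output, accumulated into

      // Pack the B operand (the weight) once.
      float* Wp = cblas_sgemm_alloc(CblasBMatrix, 1, w, h);
      cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans,
                       1, w, h, 1.0f, W.data(), w, Wp);

      // Reuse the packed weight for any number of products:
      // out(2 x w) += in(2 x h) * W(h x w), i.e. beta == 1 as in compute() above.
      cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
                          2, w, h, in.data(), h, Wp, w, 1.0f, out.data(), w);

      cblas_sgemm_free(Wp);
    }
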
*/ + +#include "MKLPackedRecurrentLayer.h" + +namespace paddle { + +REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); + +bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); + weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize(), biasParameter_)); + } + reversed_ = config_.reversed(); + + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); + + return true; +} + +void MKLPackedRecurrentLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->zeroMem(); +} + +void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; + prevOutput_->copyFrom(*(state->value[0])); +} + +LayerStatePtr MKLPackedRecurrentLayer::getState() { + LayerStatePtr res = std::make_shared(); + res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + return res; +} + +void MKLPackedRecurrentLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); + Layer::forward(passType); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize(), input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + output_.value->assign(*input.value); + if (bias_) { + output_.value->addBias(*bias_->getW(), 1); + } + if (!FLAGS_rnn_use_batch) { + forwardSequence(batchSize, numSequences, starts); + } else { + forwardBatch(batchSize, numSequences, starts); + } +} + +void MKLPackedRecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); + + frameOutput_.reserve(batchSize); + for (int i = frameOutput_.size(); i < batchSize; ++i) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + frameOutput_.push_back(arg); + } + + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + forwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { + if (!reversed_) { + if (prevOutput_) { + frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + activation_->forward(frameOutput_[start]).check(); + + for (int i = 1; i < length; ++i) { + frameOutput_[start + i].value->mul( + *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + if (prevOutput_) { + prevOutput_->assign(*frameOutput_[start + length - 1].value); + } + } else { + activation_->forward(frameOutput_[start + length - 1]).check(); + for (int i = length - 2; i >= 0; --i) { + 
frameOutput_[start + i].value->mul( + *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + } +} + +void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + if (!FLAGS_rnn_use_batch) { + backwardSequence(batchSize, numSequences, starts); + } else { + backwardBatch(batchSize, numSequences, starts); + } + + if (input.grad) { + input.grad->add(*output_.grad); + } + + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*output_.grad, 1); + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); +} + +void MKLPackedRecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + backwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::backwardOneSequence(int start, int length) { + MatrixPtr weightT = weight_->getW()->getTranspose(); + if (!reversed_) { + for (int i = length - 1; i > 0; --i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i - 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start, length - 1)->getTranspose(), + *output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } + } else { + for (int i = 0; i < length - 1; ++i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i + 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start + length - 1]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *output_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } +} + +void MKLPackedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->copyFromSeq(*output_.value); + + { + REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); + /* forward one batch */ + for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { + MatrixPtr batch2 = batchValue_->getBatchValue(n); + + if (n != 0) { + MatrixPtr batch1 = + batchValue_->getBatchValue(n - 1, batch2->getHeight()); + + // batch2->mul(*batch1, *weight_->getW(), 1, 1); + sgemm_packed_->compute(batch2, batch1); + } + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < batch2->getHeight(); i++) { + for (size_t j = 0; j < batch2->getWidth(); j++) { + *(batch2->getData() + i * batch2->getWidth() + j) = + *(batch2->getData() + i * batch2->getWidth() + j) > 0 + ? 
*(batch2->getData() + i * batch2->getWidth() + j) + : 0; + } + } + } + } + + batchValue_->copyBackSeq(*output_.value); +} + +void MKLPackedRecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + size_t numBatch = batchGrad_->getNumBatch(); + bool backwardByBatch = numBatch < numSequences; + + batchGrad_->copyFromSeq(*output_.grad); + { + REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); + /* backward one batch */ + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr batch2 = batchGrad_->getBatchValue(n); + MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + + Argument arg; + arg.value = batch1; + arg.grad = batch2; + activation_->backward(arg).check(); + + if (n != 0) { + batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); + // batch1->mul(*batch2, *weightT, 1, 1); + sgemm_packed_->compute(batch1, batch2, true); + } + + if (backwardByBatch && weight_->getWGrad()) { + if (n != 0) { + /* backward weight */ + batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); + weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + } + } + } + } + + batchGrad_->copyBackSeq(*output_.grad); + + if (!backwardByBatch && weight_->getWGrad()) { + REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); + for (size_t seq = 0; seq < numSequences; ++seq) { + int len = starts[seq + 1] - starts[seq]; + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); + } else { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h new file mode 100644 index 00000000000000..719137f2db74f8 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "Layer.h" +#include "MKLPackedGemm.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +DECLARE_bool(rnn_use_batch); + +namespace paddle { + +/** + * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the + * same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. 
One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class MKLPackedRecurrentLayer : public Layer { +public: + explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardBatch(int batchSize, size_t numSequences, const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardBatch(int batchSize, size_t numSequences, const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. 
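
The recurrence described in the doxygen block at the top of this class can be written out directly. A plain reference loop for the non-reversed case, using ReLU as the activation (the activation configured by the unit test added later in this series) and illustrative raw arrays rather than Paddle matrices:

    #include <algorithm>

    // out[i] = act(in[i] + out[i-1] * W) for one sequence of `length` frames of
    // width `size`; W is size x size, row-major. Frame 0 has no recurrent term,
    // matching out_start = act(in_start) in the comment above.
    void referenceForwardOneSequence(const float* in, float* out,
                                     const float* W, int length, int size) {
      for (int i = 0; i < length; ++i) {
        for (int c = 0; c < size; ++c) {
          float v = in[i * size + c];
          if (i > 0) {
            for (int k = 0; k < size; ++k) {
              v += out[(i - 1) * size + k] * W[k * size + c];
            }
          }
          out[i * size + c] = std::max(v, 0.0f);  // ReLU
        }
      }
    }
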
+ std::unique_ptr batchGrad_; + + std::unique_ptr sgemm_packed_; +}; +} From 2e101df7c656d524c92b6c31711a8bbcaf7ce09f Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 21:40:34 +0800 Subject: [PATCH 02/10] enable gtest for MKLPackedRecurrentLayer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 165 ++++++++++++++++++- 1 file changed, 159 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 16ab0e6aecb6a8..1f31158579a2c9 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -420,12 +420,165 @@ TEST(Layer, LstmLayer) { } } +#ifdef PADDLE_WITH_MKLML + +LayerPtr initMKLPackedLayer(LayerConfig layerConfig, + bool reversed, + int layerSize, + LayerPtr dataLayer, + ParameterPtr para, + ParameterPtr bias = nullptr) { + LayerMap layerMap; + ParameterMap parameterMap; + layerMap[dataLayer->getName()] = dataLayer; + parameterMap[para->getName()] = para; + if (bias) { + parameterMap[bias->getName()] = bias; + layerConfig.set_bias_parameter_name("bias_0"); + } + + layerConfig.set_size(layerSize); + layerConfig.set_reversed(reversed); + layerConfig.add_inputs(); + LayerInputConfig& input = *(layerConfig.mutable_inputs(0)); + input.set_input_layer_name("layer_0"); + input.set_input_parameter_name("para_0"); + + LayerPtr testLayer = Layer::create(layerConfig); + layerMap[testLayer->getName()] = testLayer; + + testLayer->init(layerMap, parameterMap); + testLayer->setNeedGradient(true); + + return testLayer; +} + +void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { + const VectorPtr& weightGrad = + (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); + const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); + CpuVector wgt_grad1(weightGrad->getSize()); + CpuVector wgt_grad2(weightGrad->getSize()); + CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); + CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); + + CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); + outputGrad.randomizeUniform(); + + for (int i = 0; i < 2; i++) { + FLAGS_rnn_use_batch = true; + + testLayer1->forward(PASS_GC); + + testLayer1->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer1->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); + + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } + + for (int i = 0; i < 2; i++) { + CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), + testLayer2->getOutputValue()->getWidth()); + + FLAGS_rnn_use_batch = true; + + testLayer2->forward(PASS_GC); + outputValue.copyFrom(*testLayer2->getOutputValue()); + + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad1.copyFrom(*weightGrad); + input_grad1.copyFrom(*inputGrad); + + FLAGS_rnn_use_batch = false; + + testLayer2->getOutputValue()->zero(); + + testLayer2->forward(PASS_GC); + testLayer2->getOutputGrad()->copyFrom(outputGrad); + + 
weightGrad->zero(); + inputGrad->zero(); + + testLayer2->backward(nullptr); + + wgt_grad2.copyFrom(*weightGrad); + input_grad2.copyFrom(*inputGrad); + + checkError(outputValue, *testLayer2->getOutputValue()); + checkError(wgt_grad1, wgt_grad2); + checkError(input_grad1, input_grad2); + } +} + +TEST(MKLPackedLayer, RecurrentLayer) { + LayerConfig layerConfig1; + LayerConfig layerConfig2; + + layerConfig1.set_name("paddle-rnn"); + layerConfig1.set_type("recurrent"); + layerConfig1.set_active_type("relu"); + + layerConfig2.set_name("mkl-packed-rnn"); + layerConfig2.set_type("mkl_packed_recurrent"); + layerConfig2.set_active_type("relu"); + + for (auto layerSize : {32, 64, 128, 256, 512}) { + for (auto batchSize : {1, 5, 100, 500}) { + for (auto reversed : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " reversed=" << reversed; + + LayerPtr dataLayer = + creatDataLayer("layer_0", batchSize, layerSize, false); + ParameterPtr para = + creatParameter("para_0", 0, layerSize * layerSize, false); + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, dataLayer, para); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para); + + checkMKLPackedLayer(testLayer1, testLayer2); + } + } + } +} +#endif + int main(int argc, char** argv) { - if (version::isWithGpu()) { - testing::InitGoogleTest(&argc, argv); - initMain(argc, argv); - return RUN_ALL_TESTS(); - } else { - return 0; + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + if (!version::isWithGpu()) { + testing::GTEST_FLAG(filter) = "-Layer.*"; } + return RUN_ALL_TESTS(); } From 0f8aad2934f88540249da6d7e5c8e8ceeafd60ec Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 19 Dec 2017 23:06:54 +0800 Subject: [PATCH 03/10] fix compile error --- paddle/gserver/layers/MKLPackedGemm.h | 3 ++- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h index 3c4c62eeb87f6a..91e2515e323217 100644 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ b/paddle/gserver/layers/MKLPackedGemm.h @@ -27,7 +27,7 @@ class MKLPackedGemm { size_t weightWidth_; public: - MKLPackedGemm(MatrixPtr weight) { + explicit MKLPackedGemm(MatrixPtr weight) { weightHeight_ = weight->getHeight(); weightWidth_ = weight->getWidth(); weightPacked_ = @@ -91,4 +91,5 @@ class MKLPackedGemm { cblas_sgemm_free(weightTPacked_); } }; + } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 719137f2db74f8..b8727e0ff30994 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + #include #include "Layer.h" #include "MKLPackedGemm.h" @@ -128,4 +130,5 @@ class MKLPackedRecurrentLayer : public Layer { std::unique_ptr sgemm_packed_; }; -} + +} // namespace paddle From b95834dc0c43cedd27124e18a13345712dcf9d47 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 20 Dec 2017 09:05:41 +0800 Subject: [PATCH 04/10] disable use_gpu when test mkl recurrent layer comparing with cpu --- paddle/gserver/tests/test_RecurrentLayer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 1f31158579a2c9..44d84dd8be2f7c 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -551,6 +551,8 @@ TEST(MKLPackedLayer, RecurrentLayer) { layerConfig2.set_type("mkl_packed_recurrent"); layerConfig2.set_active_type("relu"); + FLAGS_use_gpu = false; + for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { From 0b080a42da85d67d7a900a9b23bdcea6cfcbc01c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:36:59 +0800 Subject: [PATCH 05/10] add recurrent layer header --- paddle/gserver/layers/RecurrentLayer.cpp | 106 +----------------- paddle/gserver/layers/RecurrentLayer.h | 130 +++++++++++++++++++++++ 2 files changed, 131 insertions(+), 105 deletions(-) create mode 100644 paddle/gserver/layers/RecurrentLayer.h diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index e4c2b483d2fa40..285b11b5a027cd 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "RecurrentLayer.h" #include #include "Layer.h" #include "SequenceToBatch.h" @@ -21,110 +22,6 @@ DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); namespace paddle { -/** - * @brief RecurrentLayer takes 1 input layer. The output size is the same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. - */ - -class RecurrentLayer : public Layer { -public: - explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} - - bool init(const LayerMap& layerMap, - const ParameterMap& parameterMap) override; - - void forward(PassType passType) override; - - void backward(const UpdateCallback& callback) override; - - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - -protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. 
- * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); - -protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; -}; - REGISTER_LAYER(recurrent, RecurrentLayer); bool RecurrentLayer::init(const LayerMap& layerMap, @@ -260,7 +157,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) { bias_->getWGrad()->collectBias(*output_.grad, 1); bias_->getParameterPtr()->incUpdate(callback); } - weight_->getParameterPtr()->incUpdate(callback); } diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h new file mode 100644 index 00000000000000..f40dbe150fa93b --- /dev/null +++ b/paddle/gserver/layers/RecurrentLayer.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include "Layer.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +/** + * @brief RecurrentLayer takes 1 input layer. The output size is the same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. + */ + +class RecurrentLayer : public Layer { +public: + explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + virtual void forwardBatch(int batchSize, + size_t numSequences, + const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. 
+ */ + virtual void backwardBatch(int batchSize, + size_t numSequences, + const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. + std::unique_ptr batchGrad_; +}; + +} // namespace paddle From 82091035514c0ddeae2c18ff5f523a2647d59948 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 13:43:25 +0800 Subject: [PATCH 06/10] follow comments and refine code --- paddle/gserver/layers/MKLPackedGemm.h | 95 --------- .../layers/MKLPackedRecurrentLayer.cpp | 191 ++---------------- .../gserver/layers/MKLPackedRecurrentLayer.h | 87 ++------ paddle/gserver/layers/MKLPackedWeight.h | 100 +++++++++ 4 files changed, 125 insertions(+), 348 deletions(-) delete mode 100644 paddle/gserver/layers/MKLPackedGemm.h create mode 100644 paddle/gserver/layers/MKLPackedWeight.h diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h deleted file mode 100644 index 91e2515e323217..00000000000000 --- a/paddle/gserver/layers/MKLPackedGemm.h +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/math/MathFunctions.h" -#include "paddle/math/Matrix.h" - -namespace paddle { - -class MKLPackedGemm { -protected: - real* weightPacked_; - real* weightTPacked_; - size_t weightHeight_; - size_t weightWidth_; - -public: - explicit MKLPackedGemm(MatrixPtr weight) { - weightHeight_ = weight->getHeight(); - weightWidth_ = weight->getWidth(); - weightPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - weightTPacked_ = - cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasNoTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightPacked_); - cblas_sgemm_pack(CblasRowMajor, - CblasBMatrix, - CblasTrans, - 1, - weightWidth_, - weightHeight_, - 1.0, - weight->getData(), - weightWidth_, - weightTPacked_); - } - void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { - if (transW) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightTPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } else { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - batch2->getHeight(), - weightWidth_, - weightHeight_, - batch1->getData(), - weightHeight_, - weightPacked_, - weightWidth_, - 1, - batch2->getData(), - weightWidth_); - } - } - ~MKLPackedGemm() { - cblas_sgemm_free(weightPacked_); - cblas_sgemm_free(weightTPacked_); - } -}; - -} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index 6f455af91ed800..bd3c4ceb5e0956 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -20,188 +20,21 @@ REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - if (!Layer::init(layerMap, parameterMap)) return false; - CHECK_EQ(1U, inputLayers_.size()); - CHECK_EQ(1U, parameters_.size()); - CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); - weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); - if (biasParameter_.get() != NULL) { - bias_.reset(new Weight(1, getSize(), biasParameter_)); + if (!RecurrentLayer::init(layerMap, parameterMap)) return false; + packed_weight_.reset(new MKLPackedWeight(weight_->getW())); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true)); + packed_weightT_->pack(); } - reversed_ = config_.reversed(); - - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); - return true; } -void MKLPackedRecurrentLayer::resetState() { - CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; - Matrix::resizeOrCreate( - prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); - prevOutput_->zeroMem(); -} - -void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { - CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; - prevOutput_->copyFrom(*(state->value[0])); -} - -LayerStatePtr MKLPackedRecurrentLayer::getState() { - LayerStatePtr res = std::make_shared(); - res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); - res->value[0]->copyFrom(*prevOutput_); - return res; -} - -void MKLPackedRecurrentLayer::forward(PassType passType) { - REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str()); 
- Layer::forward(passType); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - size_t numSequences = input.getNumSequences(); - resetOutput(batchSize, getSize()); - CHECK_EQ(getSize(), input.value->getWidth()); - const int* starts = input.sequenceStartPositions->getData(false); - CHECK_EQ(starts[numSequences], batchSize); - - output_.value->assign(*input.value); - if (bias_) { - output_.value->addBias(*bias_->getW(), 1); - } - if (!FLAGS_rnn_use_batch) { - forwardSequence(batchSize, numSequences, starts); - } else { - forwardBatch(batchSize, numSequences, starts); - } -} - -void MKLPackedRecurrentLayer::forwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); - - frameOutput_.reserve(batchSize); - for (int i = frameOutput_.size(); i < batchSize; ++i) { - Argument arg; - arg.value = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - arg.grad = Matrix::create(nullptr, - /* height= */ 1, - getSize(), - /* trans= */ false, - useGpu_); - frameOutput_.push_back(arg); - } - - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - forwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { - if (!reversed_) { - if (prevOutput_) { - frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); - } - activation_->forward(frameOutput_[start]).check(); - - for (int i = 1; i < length; ++i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - if (prevOutput_) { - prevOutput_->assign(*frameOutput_[start + length - 1].value); - } - } else { - activation_->forward(frameOutput_[start + length - 1]).check(); - for (int i = length - 2; i >= 0; --i) { - frameOutput_[start + i].value->mul( - *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); - activation_->forward(frameOutput_[start + i]).check(); - } - } -} - void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { - REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); - const Argument& input = getInput(0); - CHECK(input.sequenceStartPositions); - int batchSize = input.getBatchSize(); - const int* starts = input.sequenceStartPositions->getData(false); - size_t numSequences = input.getNumSequences(); - - if (!FLAGS_rnn_use_batch) { - backwardSequence(batchSize, numSequences, starts); - } else { - backwardBatch(batchSize, numSequences, starts); - } - - if (input.grad) { - input.grad->add(*output_.grad); - } - - if (bias_ && bias_->getWGrad()) { - bias_->getWGrad()->collectBias(*output_.grad, 1); - bias_->getParameterPtr()->incUpdate(callback); - } - - weight_->getParameterPtr()->incUpdate(callback); - sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); -} - -void MKLPackedRecurrentLayer::backwardSequence(int batchSize, - size_t numSequences, - const int* starts) { - REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); - for (int i = 0; i < batchSize; ++i) { - frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); - } - - for (size_t i = 0; i < numSequences; ++i) { - backwardOneSequence(starts[i], starts[i + 1] - starts[i]); - } -} - -void MKLPackedRecurrentLayer::backwardOneSequence(int start, int 
length) { - MatrixPtr weightT = weight_->getW()->getTranspose(); - if (!reversed_) { - for (int i = length - 1; i > 0; --i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i - 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start, length - 1)->getTranspose(), - *output_.grad->subMatrix(start + 1, length - 1), - 1, - 1); - } - } else { - for (int i = 0; i < length - 1; ++i) { - activation_->backward(frameOutput_[start + i]).check(); - frameOutput_[start + i + 1].grad->mul( - *frameOutput_[start + i].grad, *weightT, 1, 1); - } - activation_->backward(frameOutput_[start + length - 1]).check(); - if (weight_->getWGrad()) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - *output_.grad->subMatrix(start, length - 1), - 1, - 1); - } + RecurrentLayer::backward(callback); + packed_weight_->pack(); + if (needGradient_) { + packed_weightT_->pack(); } } @@ -227,7 +60,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, batchValue_->getBatchValue(n - 1, batch2->getHeight()); // batch2->mul(*batch1, *weight_->getW(), 1, 1); - sgemm_packed_->compute(batch2, batch1); + packed_weight_->compute(batch2, batch1); } #pragma omp parallel for collapse(2) @@ -272,7 +105,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); // batch1->mul(*batch2, *weightT, 1, 1); - sgemm_packed_->compute(batch1, batch2, true); + packed_weightT_->compute(batch1, batch2); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index b8727e0ff30994..ba6487b11e0e90 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -16,7 +16,8 @@ limitations under the License. */ #include #include "Layer.h" -#include "MKLPackedGemm.h" +#include "MKLPackedWeight.h" +#include "RecurrentLayer.h" #include "SequenceToBatch.h" #include "paddle/utils/Stat.h" @@ -45,90 +46,28 @@ namespace paddle { * them by rnn_use_batch flag. */ -class MKLPackedRecurrentLayer : public Layer { +class MKLPackedRecurrentLayer : public RecurrentLayer { public: - explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + explicit MKLPackedRecurrentLayer(const LayerConfig& config) + : RecurrentLayer(config) {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - void resetState() override; - - void setState(LayerStatePtr state) override; - - LayerStatePtr getState() override; - protected: - /** - * @brief If user do not set --rnn_use_batch=true, it will - * compute rnn forward one sequence by one sequence in default. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn forward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. 
- */ - void forwardOneSequence(int start, int length); - /** - * @brief Compute rnn backward one sequence by onesequence. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardSequence(int batchSize, size_t numSequences, const int* starts); - /** - * @brief Compute rnn backward by one sequence. - * @param start The start position of this sequence (or sample). - * @param length The length of this sequence (or sample), namely the words - * number of this sequence. - */ - void backwardOneSequence(int start, int length); + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. It will convert batch shape to sequence after finishing forward. - * The batch info can refer to SequenceToBatch class. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void forwardBatch(int batchSize, size_t numSequences, const int* starts); - - /** - * @brief Reorganize input into batches and compute rnn forward batch - * by batch. - * @param batchSize Total words number of all samples in this batch. - * @param numSequences The sample number. - * @param starts Each start position of each samples. - */ - void backwardBatch(int batchSize, size_t numSequences, const int* starts); + void backwardBatch(int batchSize, + size_t numSequences, + const int* starts) override; protected: - std::unique_ptr weight_; - std::unique_ptr bias_; - - /// frameOutput_[i] is used to hold the i-th sample of output_ - std::vector frameOutput_; - MatrixPtr prevOutput_; - /// Whether compute rnn by reverse. - bool reversed_; - /// If compute batch by batch, batchValue_ will be used to save the - /// reorganized input value. - std::unique_ptr batchValue_; - /// If compute batch by batch, batchGrad_ will be used to save the - /// gradient with respect to reorganized input value. - std::unique_ptr batchGrad_; - - std::unique_ptr sgemm_packed_; + std::unique_ptr packed_weight_; + std::unique_ptr packed_weightT_; }; } // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h new file mode 100644 index 00000000000000..a8dcfd561b6550 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/parameter/Parameter.h" +#include "paddle/parameter/Weight.h" + +namespace paddle { + +class MKLPackedWeight { +protected: + real *weight_; + real *packedWeight_; + size_t height_; + size_t width_; + bool transW_; + +public: + MKLPackedWeight(MatrixPtr weight, bool transW = false) { + packedWeight_ = nullptr; + weight_ = weight->getData(); + height_ = weight->getHeight(); + width_ = weight->getWidth(); + transW_ = transW; + } + + ~MKLPackedWeight() { free_(); } + + void pack() { pack_(weight_); } + + void compute(MatrixPtr dst, MatrixPtr src) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + src->getHeight(), + transW_ ? height_ : width_, + transW_ ? width_ : height_, + src->getData(), + src->getWidth(), + packedWeight_, + width_, + 1.0, + dst->getData(), + dst->getWidth()); + } + + void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + M, + width_, + height_, + A, + lda, + packedWeight_, + width_, + 1.0, + C, + ldc); + } + +protected: + void pack_(real *src) { + if (!packedWeight_) { + packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_); + } + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + transW_ ? CblasTrans : CblasNoTrans, + 1, + transW_ ? height_ : width_, + transW_ ? width_ : height_, + 1.0, + src, + width_, + packedWeight_); + } + + void free_() { + if (packedWeight_) { + cblas_sgemm_free(packedWeight_); + } + } +}; + +} // namespace paddle From 0596cd8826ddf94c53fd2d834a189be2b829a595 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 14:45:07 +0800 Subject: [PATCH 07/10] refine test recurrent layer --- paddle/gserver/tests/test_RecurrentLayer.cpp | 119 ++++++++----------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 44d84dd8be2f7c..0e130843339a10 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/GatedRecurrentLayer.h" #include "paddle/gserver/layers/LstmLayer.h" +#include "paddle/gserver/layers/RecurrentLayer.h" template class TestRecurrentLayer { public: @@ -422,6 +423,8 @@ TEST(Layer, LstmLayer) { #ifdef PADDLE_WITH_MKLML +#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h" + LayerPtr initMKLPackedLayer(LayerConfig layerConfig, bool reversed, int layerSize, @@ -453,7 +456,31 @@ LayerPtr initMKLPackedLayer(LayerConfig layerConfig, return testLayer; } -void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { +void checkMKLPackedLayer(LayerConfig layerConfig1, + LayerConfig layerConfig2, + bool reversed, + int layerSize, + int batchSize, + bool useBatch1, + bool useBatch2) { + LayerPtr dataLayer; + ParameterPtr para, bias; + + if (layerConfig1.type() == "recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false); + para = creatParameter("para_0", 0, layerSize * layerSize, false); + bias = nullptr; + } else if (layerConfig1.type() == "gated_recurrent") { + dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false); + para = creatParameter("para_0", 0, layerSize * layerSize * 3, false); + bias = creatParameterBias("bias_0", 1, layerSize * 3, false); + } + + LayerPtr testLayer1 = initMKLPackedLayer( + layerConfig1, reversed, layerSize, 
dataLayer, para, bias); + LayerPtr testLayer2 = initMKLPackedLayer( + layerConfig2, reversed, layerSize, dataLayer, para, bias); + const VectorPtr& weightGrad = (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT); const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad(); @@ -462,78 +489,34 @@ void checkMKLPackedLayer(LayerPtr testLayer1, LayerPtr testLayer2) { CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth()); CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth()); - CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth()); - outputGrad.randomizeUniform(); - for (int i = 0; i < 2; i++) { - FLAGS_rnn_use_batch = true; + FLAGS_rnn_use_batch = useBatch1; testLayer1->forward(PASS_GC); - testLayer1->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer1->backward(nullptr); - - wgt_grad1.copyFrom(*weightGrad); - input_grad1.copyFrom(*inputGrad); - - FLAGS_rnn_use_batch = true; - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - - weightGrad->zero(); - inputGrad->zero(); - - testLayer2->backward(nullptr); - - wgt_grad2.copyFrom(*weightGrad); - input_grad2.copyFrom(*inputGrad); - - checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); - - checkError(wgt_grad1, wgt_grad2); - checkError(input_grad1, input_grad2); - } - - for (int i = 0; i < 2; i++) { - CpuMatrix outputValue(testLayer2->getOutputValue()->getHeight(), - testLayer2->getOutputValue()->getWidth()); - - FLAGS_rnn_use_batch = true; - + FLAGS_rnn_use_batch = useBatch2; testLayer2->forward(PASS_GC); - outputValue.copyFrom(*testLayer2->getOutputValue()); - testLayer2->getOutputGrad()->copyFrom(outputGrad); + testLayer1->getOutputGrad()->randomizeUniform(); + testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad()); weightGrad->zero(); inputGrad->zero(); - - testLayer2->backward(nullptr); + FLAGS_rnn_use_batch = useBatch1; + testLayer1->backward(nullptr); wgt_grad1.copyFrom(*weightGrad); input_grad1.copyFrom(*inputGrad); - FLAGS_rnn_use_batch = false; - - testLayer2->getOutputValue()->zero(); - - testLayer2->forward(PASS_GC); - testLayer2->getOutputGrad()->copyFrom(outputGrad); - weightGrad->zero(); inputGrad->zero(); - + FLAGS_rnn_use_batch = useBatch2; testLayer2->backward(nullptr); wgt_grad2.copyFrom(*weightGrad); input_grad2.copyFrom(*inputGrad); - checkError(outputValue, *testLayer2->getOutputValue()); + checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue()); checkError(wgt_grad1, wgt_grad2); checkError(input_grad1, input_grad2); } @@ -556,20 +539,22 @@ TEST(MKLPackedLayer, RecurrentLayer) { for (auto layerSize : {32, 64, 128, 256, 512}) { for (auto batchSize : {1, 5, 100, 500}) { for (auto reversed : {true, false}) { - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " reversed=" << reversed; - - LayerPtr dataLayer = - creatDataLayer("layer_0", batchSize, layerSize, false); - ParameterPtr para = - creatParameter("para_0", 0, layerSize * layerSize, false); - - LayerPtr testLayer1 = initMKLPackedLayer( - layerConfig1, reversed, layerSize, dataLayer, para); - LayerPtr testLayer2 = initMKLPackedLayer( - layerConfig2, reversed, layerSize, dataLayer, para); - - checkMKLPackedLayer(testLayer1, testLayer2); + for (auto paddle_use_batch : {true, false}) { + for (auto MKLPacked_use_batch : {true, false}) { + LOG(INFO) << " layerSize=" << layerSize + << " batchSize=" << batchSize << " reversed=" << reversed + << " 
paddle_use_batch=" << paddle_use_batch + << " MKLPacked_use_batch=" << MKLPacked_use_batch; + + checkMKLPackedLayer(layerConfig1, + layerConfig2, + reversed, + layerSize, + batchSize, + paddle_use_batch, + MKLPacked_use_batch); + } + } } } } From 4360615850a01a32747b7a4e4d8f99f0ff8c6252 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 16:34:24 +0800 Subject: [PATCH 08/10] fix compile error --- paddle/gserver/layers/MKLPackedWeight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index a8dcfd561b6550..cc8a336154970c 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -29,7 +29,7 @@ class MKLPackedWeight { bool transW_; public: - MKLPackedWeight(MatrixPtr weight, bool transW = false) { + explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) { packedWeight_ = nullptr; weight_ = weight->getData(); height_ = weight->getHeight(); From df2b054b13d19d467afa51aafdf1871569c6fa56 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 11:37:55 +0800 Subject: [PATCH 09/10] follow comments refine code --- .../layers/MKLPackedRecurrentLayer.cpp | 64 ++++++++----------- .../gserver/layers/MKLPackedRecurrentLayer.h | 29 ++------- paddle/gserver/layers/MKLPackedWeight.h | 20 +----- paddle/gserver/layers/RecurrentLayer.cpp | 4 -- 4 files changed, 36 insertions(+), 81 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index bd3c4ceb5e0956..b4a6413048327c 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -53,28 +53,19 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); /* forward one batch */ for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { - MatrixPtr batch2 = batchValue_->getBatchValue(n); + MatrixPtr batchValue = batchValue_->getBatchValue(n); if (n != 0) { - MatrixPtr batch1 = - batchValue_->getBatchValue(n - 1, batch2->getHeight()); + MatrixPtr preBatchValue = + batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - // batch2->mul(*batch1, *weight_->getW(), 1, 1); - packed_weight_->compute(batch2, batch1); - } - -#pragma omp parallel for collapse(2) - for (size_t i = 0; i < batch2->getHeight(); i++) { - for (size_t j = 0; j < batch2->getWidth(); j++) { - *(batch2->getData() + i * batch2->getWidth() + j) = - *(batch2->getData() + i * batch2->getWidth() + j) > 0 - ? 
*(batch2->getData() + i * batch2->getWidth() + j) - : 0; - } + packed_weight_->compute(batchValue, preBatchValue); } + Argument arg; + arg.value = batchValue; + activation_->forward(arg).check(); } } - batchValue_->copyBackSeq(*output_.value); } @@ -94,25 +85,27 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); /* backward one batch */ for (int n = (int)numBatch - 1; n >= 0; n--) { - MatrixPtr batch2 = batchGrad_->getBatchValue(n); - MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + MatrixPtr batchGrad = batchGrad_->getBatchValue(n); + MatrixPtr batchValue = + batchValue_->getBatchValue(n, batchGrad->getHeight()); Argument arg; - arg.value = batch1; - arg.grad = batch2; + arg.value = batchValue; + arg.grad = batchGrad; activation_->backward(arg).check(); if (n != 0) { - batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); - // batch1->mul(*batch2, *weightT, 1, 1); - packed_weightT_->compute(batch1, batch2); + batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); + packed_weightT_->compute(batchValue, batchGrad); } if (backwardByBatch && weight_->getWGrad()) { if (n != 0) { /* backward weight */ - batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); - weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + batchValue = + batchValue_->getBatchValue(n - 1, batchGrad->getHeight()); + weight_->getWGrad()->mul( + *batchValue->getTranspose(), *batchGrad, 1, 1); } } } @@ -124,19 +117,14 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); for (size_t seq = 0; seq < numSequences; ++seq) { int len = starts[seq + 1] - starts[seq]; - if (!reversed_) { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq] + 1, len - 1), - 1, - 1); - } else { - weight_->getWGrad()->mul( - *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - *output_.grad->subMatrix(starts[seq], len - 1), - 1, - 1); - } + weight_->getWGrad()->mul( + *output_.value + ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1) + ->getTranspose(), + *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1, + len - 1), + 1, + 1); } } } diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index ba6487b11e0e90..19874d538e919c 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -14,36 +14,18 @@ limitations under the License. */ #pragma once -#include -#include "Layer.h" #include "MKLPackedWeight.h" #include "RecurrentLayer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the - * same with - * input layer. - * For each sequence [start, end] it performs the following computation: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ - * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end - * - * \f] - * If reversed is true, the order is reversed: - * \f[ - * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ - * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end - * \f] - * There are two methods to calculate rnn. One way is to compute rnn one - * sequence by one sequence. 
The other way is to reorganize the input - * into batches, then compute rnn one batch by one batch. Users can select - * them by rnn_use_batch flag. + * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized + * with MKL cblas packed gemm. + * More details: + * /~https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ class MKLPackedRecurrentLayer : public RecurrentLayer { @@ -66,7 +48,10 @@ class MKLPackedRecurrentLayer : public RecurrentLayer { const int* starts) override; protected: + /// packed_weight_ is contains same data with + /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; + /// packed_weightT_ is the transposition matrix of packed_weight_ std::unique_ptr packed_weightT_; }; diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index cc8a336154970c..f77aa4dbbf5ddb 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,7 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: + /// The pointor of weight real *weight_; + /// The pointor of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -41,7 +43,7 @@ class MKLPackedWeight { void pack() { pack_(weight_); } - void compute(MatrixPtr dst, MatrixPtr src) { + void compute(MatrixPtr dst, const MatrixPtr src) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked, @@ -57,22 +59,6 @@ class MKLPackedWeight { dst->getWidth()); } - void compute(size_t M, real *A, size_t lda, real *C, size_t ldc) { - cblas_sgemm_compute(CblasRowMajor, - CblasNoTrans, - CblasPacked, - M, - width_, - height_, - A, - lda, - packedWeight_, - width_, - 1.0, - C, - ldc); - } - protected: void pack_(real *src) { if (!packedWeight_) { diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 285b11b5a027cd..6bd42c06cadf75 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "RecurrentLayer.h" -#include -#include "Layer.h" -#include "SequenceToBatch.h" -#include "paddle/utils/Stat.h" DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation."); From 89cb3a249cc5ecbf3955ea37e67655ec431142e3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 3 Jan 2018 14:02:01 +0800 Subject: [PATCH 10/10] follow comments, refine comment and function name --- paddle/gserver/layers/MKLPackedRecurrentLayer.cpp | 4 ++-- paddle/gserver/layers/MKLPackedRecurrentLayer.h | 6 +++--- paddle/gserver/layers/MKLPackedWeight.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp index b4a6413048327c..dd75555fae1346 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -59,7 +59,7 @@ void MKLPackedRecurrentLayer::forwardBatch(int batchSize, MatrixPtr preBatchValue = batchValue_->getBatchValue(n - 1, batchValue->getHeight()); - packed_weight_->compute(batchValue, preBatchValue); + packed_weight_->gemm_compute(preBatchValue, batchValue); } Argument arg; arg.value = batchValue; @@ -96,7 +96,7 @@ void MKLPackedRecurrentLayer::backwardBatch(int batchSize, if (n != 0) { batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight()); - packed_weightT_->compute(batchValue, batchGrad); + packed_weightT_->gemm_compute(batchGrad, batchValue); } if (backwardByBatch && weight_->getWGrad()) { diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h index 19874d538e919c..bded523a8fbd6f 100644 --- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -22,8 +22,8 @@ DECLARE_bool(rnn_use_batch); namespace paddle { /** - * @brief MKLPackedRecurrentLayer is same with RecurrentLayer but is optimized - * with MKL cblas packed gemm. + * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer + * but is optimized with MKL cblas packed gemm. * More details: * /~https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md */ @@ -48,7 +48,7 @@ class MKLPackedRecurrentLayer : public RecurrentLayer { const int* starts) override; protected: - /// packed_weight_ is contains same data with + /// packed_weight_ contains same data with /// RecurrentLayer::weight_ but is packed std::unique_ptr packed_weight_; /// packed_weightT_ is the transposition matrix of packed_weight_ diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h index f77aa4dbbf5ddb..15d5093beb43e2 100644 --- a/paddle/gserver/layers/MKLPackedWeight.h +++ b/paddle/gserver/layers/MKLPackedWeight.h @@ -22,9 +22,9 @@ namespace paddle { class MKLPackedWeight { protected: - /// The pointor of weight + /// The pointer of weight real *weight_; - /// The pointor of cblas packed gemm to weight + /// The pointer of cblas packed gemm to weight real *packedWeight_; size_t height_; size_t width_; @@ -43,7 +43,7 @@ class MKLPackedWeight { void pack() { pack_(weight_); } - void compute(MatrixPtr dst, const MatrixPtr src) { + void gemm_compute(const MatrixPtr src, MatrixPtr dst) { cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
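
The common thread through these patches is MKL's packed GEMM API: the recurrent weight is packed once with cblas_sgemm_pack() and the packed buffer is then reused by cblas_sgemm_compute() for every per-step batch, which is what MKLPackedWeight::pack() and gemm_compute() wrap. Below is a minimal standalone sketch of that call pattern, assuming the MKL headers and library are available; the toy sizes, buffer names, and the loop driver are illustrative only and are not taken from the patch.

#include <mkl_cblas.h>
#include <vector>

int main() {
  const int n = 4;     // layer size: the recurrent weight W is n x n
  const int rows = 3;  // height of one per-step batch (active sequences)

  std::vector<float> W(n * n, 0.1f);      // row-major weight
  std::vector<float> in(rows * n, 1.0f);  // current batch value  (A)
  std::vector<float> out(rows * n, 0.0f); // accumulated output   (C)

  // Pack W once as the "B" operand; the patch likewise passes 1 for m
  // when allocating and packing the B matrix.
  float* packedW = cblas_sgemm_alloc(CblasBMatrix, 1, n, n);
  cblas_sgemm_pack(CblasRowMajor, CblasBMatrix, CblasNoTrans,
                   1, n, n, 1.0f, W.data(), n, packedW);

  // Every recurrent step reuses the packed weight: out += in * W.
  // beta = 1.0 accumulates into C, matching the beta of 1 used by
  // MKLPackedWeight::gemm_compute().
  for (int step = 0; step < 10; ++step) {
    cblas_sgemm_compute(CblasRowMajor, CblasNoTrans, CblasPacked,
                        rows, n, n,
                        in.data(), n,
                        packedW, n,
                        1.0f,
                        out.data(), n);
  }

  cblas_sgemm_free(packedW);
  return 0;
}

Packing pays off in this layer because the same weight takes part in one small GEMM per batch of time steps, so the one-time pack cost is amortized over the whole sequence, whereas a plain cblas_sgemm call would redo that internal packing on every step.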