Refine how the activation type is obtained in the LSTM operator, for speed. #6996

Merged
merged 2 commits on Dec 26, 2017
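This PR replaces the per-call activation-name strings with an `ActivationType` enum that is resolved once, before the batch loop, so the LSTM compute kernels no longer re-parse strings for every batch step and frame. A minimal sketch of the idea, using hypothetical simplified names rather than the PR's actual code:

```cpp
#include <cmath>
#include <string>

enum class ActivationType { kSigmoid, kReLU, kTanh, kIdentity };

// Before: every call re-compares the attribute string.
inline float apply_by_name(float x, const std::string& name) {
  if (name == "sigmoid") return 1.0f / (1.0f + std::exp(-x));
  if (name == "tanh") return std::tanh(x);
  if (name == "relu") return x > 0.0f ? x : 0.0f;
  return x;  // identity
}

// After: the name is converted to an enum once, outside the hot loop,
// and the kernel only branches on a cheap integer tag.
inline float apply(float x, ActivationType t) {
  switch (t) {
    case ActivationType::kSigmoid: return 1.0f / (1.0f + std::exp(-x));
    case ActivationType::kTanh: return std::tanh(x);
    case ActivationType::kReLU: return x > 0.0f ? x : 0.0f;
    default: return x;  // kIdentity
  }
}
```

Resolving the name once per kernel invocation moves the string comparisons out of the per-frame inner loops, which is where the speedup in the title comes from.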
19 changes: 13 additions & 6 deletions paddle/operators/lstm_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
@@ -102,9 +103,12 @@ class LSTMKernel : public framework::OpKernel<T> {

auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
-auto gate_act = ctx.Attr<std::string>("gate_activation");
-auto cell_act = ctx.Attr<std::string>("cell_activation");
-auto cand_act = ctx.Attr<std::string>("candidate_activation");
+auto gate_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("gate_activation"));
+auto cell_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("cell_activation"));
+auto cand_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("candidate_activation"));

for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]);
@@ -264,9 +268,12 @@ class LSTMGradKernel : public framework::OpKernel<T> {
batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
batch_gate_g.set_lod(batch_gate->lod());

-auto gate_act = ctx.Attr<std::string>("gate_activation");
-auto cell_act = ctx.Attr<std::string>("cell_activation");
-auto cand_act = ctx.Attr<std::string>("candidate_activation");
+auto gate_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("gate_activation"));
+auto cell_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("cell_activation"));
+auto cand_act = math::detail::GetActivationType(
+    ctx.Attr<std::string>("candidate_activation"));

auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
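Both kernels walk `batch_starts` (LoD level 0 of `batch_gate`) to get each batch step's row range, as in the `for (size_t n = 0; ...)` loop visible above. A small self-contained illustration of that bookkeeping, with made-up offsets:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical LoD level-0 offsets; the real values come from
  // batch_gate->lod()[0].
  std::vector<size_t> batch_starts = {0, 3, 5, 6};
  size_t num_batch = batch_starts.size() - 1;  // three batch steps
  for (size_t n = 0; n < num_batch; n++) {
    int bstart = static_cast<int>(batch_starts[n]);
    int bend = static_cast<int>(batch_starts[n + 1]);
    std::printf("step %zu covers rows [%d, %d)\n", n, bstart, bend);
  }
  return 0;
}
```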
21 changes: 21 additions & 0 deletions paddle/operators/math/detail/activation_functions.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once
#include <math.h>
#include "paddle/platform/enforce.h"
#include "paddle/platform/hostdevice.h"

#ifdef __AVX__
@@ -29,6 +30,26 @@ namespace detail {
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0

+enum ActivationType {
+  kSigmoid,
+  kReLU,
+  kTanh,
+  kIdentity,
+};
+
+inline ActivationType GetActivationType(const std::string &type) {
+  if (type == "sigmoid") {
+    return ActivationType::kSigmoid;
+  } else if (type == "relu") {
+    return ActivationType::kReLU;
+  } else if (type == "tanh") {
+    return ActivationType::kTanh;
+  } else if (type == "identity" || type == "") {
+    return ActivationType::kIdentity;
+  }
+  PADDLE_THROW("Not support type %s.", type);
+}

namespace forward {

template <typename T>
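A usage sketch of the new helper, assuming the usual `paddle::operators::math::detail` namespace for this header; an empty string is treated as identity, and any other unknown name hits `PADDLE_THROW`:

```cpp
#include "paddle/operators/math/detail/activation_functions.h"

void resolve_example() {
  namespace detail = paddle::operators::math::detail;
  auto gate_act = detail::GetActivationType("sigmoid");  // kSigmoid
  auto cell_act = detail::GetActivationType("tanh");     // kTanh
  auto none_act = detail::GetActivationType("");         // "" maps to kIdentity
  // detail::GetActivationType("swish") would throw via PADDLE_THROW.
  (void)gate_act;
  (void)cell_act;
  (void)none_act;
}
```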
37 changes: 17 additions & 20 deletions paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -26,10 +26,9 @@ namespace detail {

template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-int frame_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+int frame_size, ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
T r_value_in;
T r_value_ig;
T r_value_fg;
@@ -77,9 +76,9 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
T r_value_in;
T r_value_ig;
T r_value_fg;
@@ -149,10 +148,9 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,

template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-int frame_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+int frame_size, ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
#ifdef __AVX__
__m256 r_value_in;
__m256 r_value_ig;
@@ -204,9 +202,9 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
#ifdef __AVX__
__m256 r_value_in;
__m256 r_value_ig;
@@ -281,9 +279,8 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,

template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node, ActivationType active_gate,
+ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
@@ -295,9 +292,9 @@ void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,

template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-int frame_size, activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+int frame_size, ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state);
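The CPU dispatch above takes the AVX path only when `frame_size` is a multiple of 8, since one `__m256` register holds 8 floats; `!(frame_size & (8 - 1))` is the standard power-of-two divisibility test. A tiny self-contained check of that equivalence:

```cpp
#include <cassert>

int main() {
  for (int frame_size = 0; frame_size < 64; ++frame_size) {
    bool mask_test = !(frame_size & (8 - 1));    // test used in the diff
    assert(mask_test == (frame_size % 8 == 0));  // divisible by 8
  }
  return 0;
}
```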
22 changes: 10 additions & 12 deletions paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -31,9 +31,9 @@ namespace detail {
*/
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-int batch_size, activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+int batch_size, ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;

@@ -91,9 +91,9 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
-int batch_size, activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+int batch_size, ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;

@@ -185,9 +185,8 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frame_size, int batch_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node, ActivationType active_gate,
+ActivationType active_state) {
dim3 threads;
dim3 grid;
if (batch_size == 1) {
@@ -220,9 +219,8 @@ template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node, ActivationType active_gate,
+ActivationType active_state) {
dim3 threads;
dim3 grid;
if (batch_size == 1) {
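These kernels bail out early when `frame_idx >= frame_size` because the launch typically rounds the grid up to whole blocks. A hedged sketch of that common ceiling-division launch shape (the `grid`/`threads` values chosen by the elided code may differ):

```cpp
#include <cuda_runtime.h>

// Sketch only: cover frame_size lanes with whole 256-thread blocks and one
// grid row per batch; not necessarily the exact configuration in the file.
void pick_launch_shape(int frame_size, int batch_size, dim3* grid,
                       dim3* threads) {
  const int block = 256;
  *threads = dim3(block, 1);
  *grid = dim3((frame_size + block - 1) / block, batch_size);
}
```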
22 changes: 11 additions & 11 deletions paddle/operators/math/detail/lstm_kernel.h
@@ -30,9 +30,9 @@ class lstm {
HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
T &prev_state, T &state, T &state_atv, T &output,
T &checkI, T &checkF, T &checkO,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
value_in = activation(value_in, active_node);
value_ig = activation(value_ig + prev_state * checkI, active_gate);
value_fg = activation(value_fg + prev_state * checkF, active_gate);
@@ -53,9 +53,9 @@ class lstm {
__m256 &prev_state, __m256 &state,
__m256 &state_atv, __m256 &output, __m256 &checkI,
__m256 &checkF, __m256 &checkO,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
value_in = activation(value_in, active_node);
value_ig =
activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
@@ -87,9 +87,9 @@ class lstm {
T &state_grad, T &state_atv, T &output_grad,
T &checkI, T &checkF, T &checkO, T &checkIGrad,
T &checkFGrad, T &checkOGrad,
-activation_mode_t active_node,
-activation_mode_t active_gate,
-activation_mode_t active_state) {
+ActivationType active_node,
+ActivationType active_gate,
+ActivationType active_state) {
grad_og = activation(output_grad * state_atv, value_og, active_gate);
state_grad += activation(output_grad * value_og, state_atv, active_state) +
grad_og * checkO;
@@ -114,8 +114,8 @@ class lstm {
__m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
__m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
__m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
__m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_state) {
__m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
active_gate);
state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
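For reference, these `operator()` overloads implement the standard peephole LSTM step (the lines below the ones shown are elided in this diff). In the code's notation, with `checkI`/`checkF`/`checkO` as the peephole weights and `act_node`/`act_gate`/`act_state` the three configurable activations, the forward step is:

```latex
\begin{aligned}
\tilde{c}_t &= \mathrm{act\_node}(g_c)\\
i_t &= \mathrm{act\_gate}(g_i + w_{ic} \odot c_{t-1})\\
f_t &= \mathrm{act\_gate}(g_f + w_{fc} \odot c_{t-1})\\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t\\
o_t &= \mathrm{act\_gate}(g_o + w_{oc} \odot c_t)\\
h_t &= o_t \odot \mathrm{act\_state}(c_t)
\end{aligned}
```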
16 changes: 8 additions & 8 deletions paddle/operators/math/lstm_compute.cc
@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
-ActiveType(cand_act), ActiveType(gate_act),
-ActiveType(cell_act));
+cand_act, gate_act, cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
@@ -46,12 +46,12 @@ struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
-frame_size, ActiveType(cand_act),
-ActiveType(gate_act), ActiveType(cell_act));
+frame_size, cand_act, gate_act, cell_act);

value.gate_value += frame_size * 4;
value.state_value += frame_size;
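The pointer bumps at the end of each batch step reflect the layout of the batch-major LSTM buffers: every step stores four gate pre-activations per frame contiguously, while the state arrays hold one value per frame. A self-contained illustration of that striding with made-up sizes:

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int frame_size = 3;  // hypothetical hidden width
  const int num_steps = 2;   // hypothetical number of batch steps
  // Four gate pre-activations per frame per step (ordering is internal).
  std::vector<float> gate(num_steps * frame_size * 4, 0.0f);
  std::vector<float> state(num_steps * frame_size, 0.0f);

  const float* gate_value = gate.data();
  const float* state_value = state.data();
  for (int b = 0; b < num_steps; b++) {
    std::printf("step %d: gate offset %td, state offset %td\n", b,
                gate_value - gate.data(), state_value - state.data());
    gate_value += frame_size * 4;  // skip this step's 4 * frame_size gates
    state_value += frame_size;     // skip this step's frame_size states
  }
  return 0;
}
```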
18 changes: 10 additions & 8 deletions paddle/operators/math/lstm_compute.cu
@@ -24,11 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
-frame_size, batch_size, ActiveType(cand_act),
-ActiveType(gate_act), ActiveType(cell_act));
+frame_size, batch_size, cand_act, gate_act,
+cell_act);
}
};

@@ -37,11 +38,12 @@ struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
const std::string& gate_act, const std::string& cell_act,
const std::string& cand_act) {
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
-frame_size, batch_size, ActiveType(cand_act),
-ActiveType(gate_act), ActiveType(cell_act));
+frame_size, batch_size, cand_act, gate_act,
+cell_act);
}
};

11 changes: 7 additions & 4 deletions paddle/operators/math/lstm_compute.h
@@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"

@@ -72,17 +73,19 @@ class LstmUnitFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
int frame_size, int batch_size,
const std::string &gate_act, const std::string &cell_act,
const std::string &cand_act);
const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
};

template <typename DeviceContext, typename T>
class LstmUnitGradFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size, int batch_size,
const std::string &gate_act, const std::string &cell_act,
const std::string &cand_act);
const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
};

} // namespace math
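Putting the pieces together, a hedged sketch of how a caller now drives the functor with enum-typed activations. This is illustrative code written against the declarations above, not code from the PR, and it assumes a fully populated `LstmMetaValue` and a live device context:

```cpp
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"

namespace math = paddle::operators::math;
namespace platform = paddle::platform;

void run_one_step(const platform::CPUDeviceContext& ctx,
                  math::LstmMetaValue<float> lstm_value, int frame_size,
                  int cur_batch_size) {
  // Resolve the attribute strings to enums once, outside any hot loop.
  auto gate_act = math::detail::GetActivationType("sigmoid");
  auto cell_act = math::detail::GetActivationType("tanh");
  auto cand_act = math::detail::GetActivationType("tanh");
  math::LstmUnitFunctor<platform::CPUDeviceContext, float>::compute(
      ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
      cand_act);
}
```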