diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps index 33f7063506eec..70b25d19839af 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_kernel.kps @@ -29,7 +29,7 @@ static __device__ inline void memset_value_float(float* lm, int size, float valu template struct pooling_engine { - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -66,7 +66,7 @@ struct pooling_engine { template struct pooling_engine_with_large_dim { - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -104,7 +104,7 @@ struct pooling_engine_with_large_dim { // need_filter:true && embed_threshold_filter:true && embedx_concate_filter:false template struct pooling_engine{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -120,8 +120,10 @@ struct pooling_engine{ int embedx_concate_size, float32x16_t &v_scale, __global_ptr__ T* cur_x) { - sum_show_clk[0] = sum[0]; - sum_show_clk[1] = sum[1]; + // cvm_offset = [0, 2] + for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) { + sum_show_clk[cvm_i] = sum[cvm_i]; + } for (int j = 0; j < len; j++) { mfence(); @@ -173,13 +175,17 @@ struct pooling_engine{ vstore_lm_float32x16(sum, v_dst1); vstore_lm_float32x16(sum + 16, v_dst2); - sum_show_clk[0] += local_x[0]; - sum_show_clk[1] += local_x[1]; + // cvm_offset = [0, 2] + for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) { + sum_show_clk[cvm_i] += local_x[cvm_i]; + } } mfence_lm(); - sum[0] = (float)sum_show_clk[0]; - sum[1] = (float)sum_show_clk[1]; + // cvm_offset = [0, 2] + for (int cvm_i = 0; cvm_i < cvm_offset; cvm_i++) { + sum[cvm_i] = (float)sum_show_clk[cvm_i]; + } } }; @@ -187,7 +193,7 @@ struct pooling_engine{ // need_filter:true && embed_threshold_filter:true && embedx_concate_filter:false template struct pooling_engine_with_large_dim{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -203,8 +209,10 @@ struct pooling_engine_with_large_dim struct pooling_engine{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -288,8 +301,11 @@ struct pooling_engine struct pooling_engine_with_large_dim{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -353,8 +374,11 @@ struct pooling_engine_with_large_dim struct pooling_engine{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -419,8 +449,11 @@ struct pooling_engine struct pooling_engine_with_large_dim{ - static __device__ void sum_pooling(T* local_x, + static __device__ inline void sum_pooling(T* local_x, T* sum, T2* sum_show_clk, int len, @@ -478,8 +516,11 @@ struct pooling_engine_with_large_dim