Register fluid kernels to phi [part 10] #53034

Merged (6 commits) on Apr 20, 2023
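Every file diff below applies the same mechanical migration: the fluid registration macros (REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL) are replaced by phi's PD_REGISTER_STRUCT_KERNEL, and each kernel's template parameters are reordered from <DeviceContext, T> to <T, DeviceContext>. A minimal before/after sketch of the pattern, condensed from the sequence_softmax changes in this PR (it only compiles inside the Paddle tree, and the comment on why the parameter order flips is an inference from the macro's argument list, not something the PR states):

// Before: fluid-style registration. The kernel is templated as
// <DeviceContext, T> and every instantiation is written out by hand.
template <typename DeviceContext, typename T>
class SequenceSoftmaxKernel : public framework::OpKernel<T> { /* ... */ };

REGISTER_OP_CPU_KERNEL(sequence_softmax,
                       ops::SequenceSoftmaxKernel<phi::CPUContext, float>,
                       ops::SequenceSoftmaxKernel<phi::CPUContext, double>);

// After: phi-style structure-kernel registration. Backend (CPU/GPU),
// layout, and the dtype list become macro arguments. Since the macro
// enumerates only dtypes, the template is reordered to <T, DeviceContext>,
// presumably so the macro can append the device context itself.
template <typename T, typename DeviceContext>
class SequenceSoftmaxKernel : public framework::OpKernel<T> { /* ... */ };

PD_REGISTER_STRUCT_KERNEL(
    sequence_softmax, CPU, ALL_LAYOUT, ops::SequenceSoftmaxKernel,
    float, double) {}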
@@ -214,6 +214,9 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
*(op_with_kernel->PhiKernelSignature()),
runtime_context,
*dev_ctx);
+ } else if (new_op_func_node.phi_kernel_->GetKernelRegisteredType() ==
+            phi::KernelRegisteredType::STRUCTURE) {
+   (*new_op_func_node.phi_kernel_)(&exec_ctx);
} else {
phi::KernelContext phi_kernel_context;
op_with_kernel->BuildPhiKernelContext(
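The branch added above gives structure-registered kernels their own dispatch path: kernels registered through PD_REGISTER_STRUCT_KERNEL presumably report KernelRegisteredType::STRUCTURE, and for those the executor invokes the phi kernel directly with the fluid ExecutionContext, instead of falling through to the function-kernel path, which first assembles a phi::KernelContext via BuildPhiKernelContext.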
paddle/fluid/framework/operator.cc (2 changes: 1 addition & 1 deletion)
@@ -2493,7 +2493,7 @@ Scope* OperatorWithKernel::PrepareData(
expected_kernel_key.layout(),
expected_kernel_key.dtype());
}
- } else if (in_def != nullptr &&
+ } else if (in_def != nullptr &&  // KernelRegisteredType is Function
in_def->backend != phi::Backend::ALL_BACKEND) {
auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
if ((in_def->backend != tensor_backend &&
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc (18 changes: 12 additions & 6 deletions)
@@ -149,9 +149,15 @@ REGISTER_OPERATOR(
REGISTER_OPERATOR(sequence_softmax_grad,
ops::SequenceSoftmaxGradOp,
ops::SequenceSoftmaxGradOpNoNeedBufferVarsInferer);
- REGISTER_OP_CPU_KERNEL(sequence_softmax,
-                        ops::SequenceSoftmaxKernel<phi::CPUContext, float>,
-                        ops::SequenceSoftmaxKernel<phi::CPUContext, double>);
- REGISTER_OP_CPU_KERNEL(sequence_softmax_grad,
-                        ops::SequenceSoftmaxGradKernel<phi::CPUContext, float>,
-                        ops::SequenceSoftmaxGradKernel<phi::CPUContext, double>);
+ PD_REGISTER_STRUCT_KERNEL(sequence_softmax,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceSoftmaxKernel,
+                           float,
+                           double) {}
+ PD_REGISTER_STRUCT_KERNEL(sequence_softmax_grad,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceSoftmaxGradKernel,
+                           float,
+                           double) {}
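Note the shape of the replacement macro throughout this PR: op name, backend (CPU or GPU), layout (ALL_LAYOUT), the kernel class template, then the list of element types, closed by a trailing {}. The empty braces are presumably the body of a hook function the macro declares for post-registration adjustments, matching the convention of phi's other PD_REGISTER_* macros; this PR leaves it empty everywhere.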
paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu (19 changes: 12 additions & 7 deletions)
@@ -177,10 +177,15 @@ struct SequenceSoftmaxGradFunctor<phi::GPUContext, T> {
} // namespace paddle

namespace ops = paddle::operators;
- REGISTER_OP_CUDA_KERNEL(sequence_softmax,
-                         ops::SequenceSoftmaxKernel<phi::GPUContext, float>,
-                         ops::SequenceSoftmaxKernel<phi::GPUContext, double>);
- REGISTER_OP_CUDA_KERNEL(
-     sequence_softmax_grad,
-     ops::SequenceSoftmaxGradKernel<phi::GPUContext, float>,
-     ops::SequenceSoftmaxGradKernel<phi::GPUContext, double>);
+ PD_REGISTER_STRUCT_KERNEL(sequence_softmax,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::SequenceSoftmaxKernel,
+                           float,
+                           double) {}
+ PD_REGISTER_STRUCT_KERNEL(sequence_softmax_grad,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::SequenceSoftmaxGradKernel,
+                           float,
+                           double) {}
paddle/fluid/operators/sequence_ops/sequence_softmax_op.h (4 changes: 2 additions & 2 deletions)
@@ -86,7 +86,7 @@ struct SequenceSoftmaxGradFunctor<phi::CPUContext, T> {
}
};

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceSoftmaxKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@@ -130,7 +130,7 @@ class SequenceSoftmaxKernel : public framework::OpKernel<T> {
}
};

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
@@ -137,9 +137,13 @@ REGISTER_OPERATOR(
ops::SequenceTopkAvgPoolGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(sequence_topk_avg_pooling_grad,
ops::SequenceTopkAvgPoolingGradOp);
- REGISTER_OP_CPU_KERNEL(
-     sequence_topk_avg_pooling,
-     ops::SequenceTopkAvgPoolingKernel<phi::CPUContext, float>);
- REGISTER_OP_CPU_KERNEL(
-     sequence_topk_avg_pooling_grad,
-     ops::SequenceTopkAvgPoolingGradKernel<phi::CPUContext, float>);
+ PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceTopkAvgPoolingKernel,
+                           float) {}
+ PD_REGISTER_STRUCT_KERNEL(sequence_topk_avg_pooling_grad,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceTopkAvgPoolingGradKernel,
+                           float) {}
paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
@@ -67,7 +67,7 @@ static void get_topk_pos(const T* data, int length, int k, int* pos) {
}
} // namespace details

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
@@ -178,7 +178,7 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
}
};

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc (27 changes: 16 additions & 11 deletions)
@@ -194,14 +194,19 @@ REGISTER_OPERATOR(sequence_unpad,
REGISTER_OPERATOR(sequence_unpad_grad,
ops::SequenceUnpadGradOp,
ops::SequenceUnpadGradOpNoNeedBufferVarsInferer);
- REGISTER_OP_CPU_KERNEL(sequence_unpad,
-                        ops::SequenceUnpadOpKernel<phi::CPUContext, float>,
-                        ops::SequenceUnpadOpKernel<phi::CPUContext, double>,
-                        ops::SequenceUnpadOpKernel<phi::CPUContext, int>,
-                        ops::SequenceUnpadOpKernel<phi::CPUContext, int64_t>);
- REGISTER_OP_CPU_KERNEL(
-     sequence_unpad_grad,
-     ops::SequenceUnpadGradOpKernel<phi::CPUContext, float>,
-     ops::SequenceUnpadGradOpKernel<phi::CPUContext, double>,
-     ops::SequenceUnpadGradOpKernel<phi::CPUContext, int>,
-     ops::SequenceUnpadGradOpKernel<phi::CPUContext, int64_t>);
+ PD_REGISTER_STRUCT_KERNEL(sequence_unpad,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceUnpadOpKernel,
+                           float,
+                           double,
+                           int,
+                           int64_t) {}
+ PD_REGISTER_STRUCT_KERNEL(sequence_unpad_grad,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::SequenceUnpadGradOpKernel,
+                           float,
+                           double,
+                           int,
+                           int64_t) {}
paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu (27 changes: 16 additions & 11 deletions)
@@ -15,14 +15,19 @@ limitations under the License. */
#include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h"

namespace ops = paddle::operators;
- REGISTER_OP_CUDA_KERNEL(sequence_unpad,
-                         ops::SequenceUnpadOpKernel<phi::GPUContext, float>,
-                         ops::SequenceUnpadOpKernel<phi::GPUContext, double>,
-                         ops::SequenceUnpadOpKernel<phi::GPUContext, int>,
-                         ops::SequenceUnpadOpKernel<phi::GPUContext, int64_t>);
- REGISTER_OP_CUDA_KERNEL(
-     sequence_unpad_grad,
-     ops::SequenceUnpadGradOpKernel<phi::GPUContext, float>,
-     ops::SequenceUnpadGradOpKernel<phi::GPUContext, double>,
-     ops::SequenceUnpadGradOpKernel<phi::GPUContext, int>,
-     ops::SequenceUnpadGradOpKernel<phi::GPUContext, int64_t>);
+ PD_REGISTER_STRUCT_KERNEL(sequence_unpad,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::SequenceUnpadOpKernel,
+                           float,
+                           double,
+                           int,
+                           int64_t) {}
+ PD_REGISTER_STRUCT_KERNEL(sequence_unpad_grad,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::SequenceUnpadGradOpKernel,
+                           float,
+                           double,
+                           int,
+                           int64_t) {}
paddle/fluid/operators/sequence_ops/sequence_unpad_op.h (4 changes: 2 additions & 2 deletions)
@@ -27,7 +27,7 @@ namespace operators {
using LoDTensor = phi::DenseTensor;
using LoD = framework::LoD;

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceUnpadOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -81,7 +81,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel<T> {
}
};

- template <typename DeviceContext, typename T>
+ template <typename T, typename DeviceContext>
class SequenceUnpadGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -18,6 +18,6 @@ limitations under the License. */

namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(sequence_unpad,
-                        ops::SequenceUnpadOpKernel<phi::XPUContext, float>);
+                        ops::SequenceUnpadOpKernel<float, phi::XPUContext>);

#endif
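Note that the XPU path keeps the fluid macro REGISTER_OP_XPU_KERNEL; the only change needed there is the kernel's template argument order, <float, phi::XPUContext> instead of <phi::XPUContext, float>, to match the reordered template parameters in sequence_unpad_op.h.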
paddle/fluid/operators/share_data_op.cc (22 changes: 13 additions & 9 deletions)
@@ -62,18 +62,22 @@ Return a tensor $Out$ that shares data with the input tensor $X$ and without ten
} // namespace paddle

namespace ops = paddle::operators;
+ namespace plat = paddle::platform;
REGISTER_OPERATOR(
share_data,
ops::ShareDataOp,
ops::ShareDataOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
- REGISTER_OP_CPU_KERNEL(share_data,
-                        ops::ShareDataKernel<bool>,
-                        ops::ShareDataKernel<int>,
-                        ops::ShareDataKernel<int8_t>,
-                        ops::ShareDataKernel<uint8_t>,
-                        ops::ShareDataKernel<paddle::platform::float16>,
-                        ops::ShareDataKernel<int64_t>,
-                        ops::ShareDataKernel<float>,
-                        ops::ShareDataKernel<double>)
+ PD_REGISTER_STRUCT_KERNEL(share_data,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::ShareDataKernel,
+                           bool,
+                           int,
+                           int8_t,
+                           uint8_t,
+                           int64_t,
+                           float,
+                           double,
+                           plat::float16) {}
paddle/fluid/operators/share_data_op.cu (24 changes: 14 additions & 10 deletions)
@@ -14,13 +14,17 @@ limitations under the License. */

#include "paddle/fluid/operators/share_data_op.h"

- REGISTER_OP_CUDA_KERNEL(
-     share_data,
-     paddle::operators::ShareDataKernel<bool>,
-     paddle::operators::ShareDataKernel<int>,
-     paddle::operators::ShareDataKernel<int8_t>,
-     paddle::operators::ShareDataKernel<uint8_t>,
-     paddle::operators::ShareDataKernel<paddle::platform::float16>,
-     paddle::operators::ShareDataKernel<int64_t>,
-     paddle::operators::ShareDataKernel<float>,
-     paddle::operators::ShareDataKernel<double>);
+ namespace ops = paddle::operators;
+ namespace plat = paddle::platform;
+ PD_REGISTER_STRUCT_KERNEL(share_data,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::ShareDataKernel,
+                           bool,
+                           int,
+                           int8_t,
+                           uint8_t,
+                           int64_t,
+                           float,
+                           double,
+                           plat::float16) {}
paddle/fluid/operators/share_data_op.h (2 changes: 1 addition & 1 deletion)
@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

- template <typename T>
+ template <typename T, typename DeviceContext>
class ShareDataKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
paddle/fluid/operators/shuffle_batch_op.cc (27 changes: 16 additions & 11 deletions)
@@ -159,14 +159,19 @@ REGISTER_OPERATOR(shuffle_batch,
ops::ShuffleBatchGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(shuffle_batch_grad, ops::ShuffleBatchOpGrad);

- REGISTER_OP_CPU_KERNEL(shuffle_batch,
-                        ops::ShuffleBatchKernel<float>,
-                        ops::ShuffleBatchKernel<double>,
-                        ops::ShuffleBatchKernel<int32_t>,
-                        ops::ShuffleBatchKernel<int64_t>);
-
- REGISTER_OP_CPU_KERNEL(shuffle_batch_grad,
-                        ops::ShuffleBatchGradKernel<float>,
-                        ops::ShuffleBatchGradKernel<double>,
-                        ops::ShuffleBatchGradKernel<int32_t>,
-                        ops::ShuffleBatchGradKernel<int64_t>);
+ PD_REGISTER_STRUCT_KERNEL(shuffle_batch,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::ShuffleBatchKernel,
+                           float,
+                           double,
+                           int32_t,
+                           int64_t) {}
+ PD_REGISTER_STRUCT_KERNEL(shuffle_batch_grad,
+                           CPU,
+                           ALL_LAYOUT,
+                           ops::ShuffleBatchGradKernel,
+                           float,
+                           double,
+                           int32_t,
+                           int64_t) {}
paddle/fluid/operators/shuffle_batch_op.cu (32 changes: 19 additions & 13 deletions)
@@ -79,7 +79,7 @@ struct ReorderFunctor {
int64_t stride_;
};

- template <typename T>
+ template <typename T, typename DeviceContext>
class ShuffleBatchCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@@ -149,7 +149,7 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel<T> {
}
};

- template <typename T>
+ template <typename T, typename DeviceContext>
class ShuffleBatchGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
@@ -180,15 +180,21 @@ class ShuffleBatchGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
- REGISTER_OP_CUDA_KERNEL(shuffle_batch,
-                         ops::ShuffleBatchCUDAKernel<float>,
-                         ops::ShuffleBatchCUDAKernel<double>,
-                         ops::ShuffleBatchCUDAKernel<int32_t>,
-                         ops::ShuffleBatchCUDAKernel<int64_t>);
-
- REGISTER_OP_CUDA_KERNEL(shuffle_batch_grad,
-                         ops::ShuffleBatchGradCUDAKernel<float>,
-                         ops::ShuffleBatchGradCUDAKernel<double>,
-                         ops::ShuffleBatchGradCUDAKernel<int32_t>,
-                         ops::ShuffleBatchGradCUDAKernel<int64_t>);
+
+ PD_REGISTER_STRUCT_KERNEL(shuffle_batch,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::ShuffleBatchCUDAKernel,
+                           float,
+                           double,
+                           int32_t,
+                           int64_t) {}
+ PD_REGISTER_STRUCT_KERNEL(shuffle_batch_grad,
+                           GPU,
+                           ALL_LAYOUT,
+                           ops::ShuffleBatchGradCUDAKernel,
+                           float,
+                           double,
+                           int32_t,
+                           int64_t) {}
#endif
paddle/fluid/operators/shuffle_batch_op.h (4 changes: 2 additions & 2 deletions)
@@ -36,7 +36,7 @@ namespace operators {
template <typename T>
using Vector = phi::Vector<T>;

- template <typename T>
+ template <typename T, typename DeviceContext>
class ShuffleBatchKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
@@ -122,7 +122,7 @@ class ShuffleBatchKernel : public framework::OpKernel<T> {
}
};

- template <typename T>
+ template <typename T, typename DeviceContext>
class ShuffleBatchGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {