Add flash attention to speedup fused_gate_attention. #52731

Merged: 58 commits, May 19, 2023

Commits
49ec05c
first commit
JamesLim-sy Apr 10, 2023
76b87c2
fix some bugs
JamesLim-sy Apr 18, 2023
a26df02
fix some bugs
JamesLim-sy Apr 18, 2023
cc08899
fix bugs in flashattn.h
JamesLim-sy Apr 19, 2023
5bee3a6
:qix pointer bugs for my yesterday errors
JamesLim-sy Apr 19, 2023
afce9d6
add for backward
JamesLim-sy Apr 23, 2023
9f76b5f
fix bugs for backward
JamesLim-sy Apr 23, 2023
05b3444
04-24 first commit
JamesLim-sy Apr 24, 2023
3e41967
Merge branch 'develop' into add_flash_attn_for_af2
Xreki Apr 24, 2023
69a80cf
fix code conflicts
JamesLim-sy Apr 24, 2023
9d0befe
Reorganize the forward codes of flash-attention.
Xreki Apr 24, 2023
66f07bc
Fix forward.
Xreki Apr 24, 2023
d66d507
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
Xreki Apr 24, 2023
387d26f
Remove some noused codes.
Xreki Apr 24, 2023
cf4a1c8
Change backward.
Xreki Apr 24, 2023
2092b92
Merge branch 'develop' into add_flash_attn_for_af2
Xreki Apr 25, 2023
ade7a07
Simplify codes.
Xreki Apr 25, 2023
1ddf939
Simplify codes and fix backward.
Xreki Apr 25, 2023
b1668b0
Fix calling for tensor.data.
Xreki Apr 25, 2023
7f9d905
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
JamesLim-sy Apr 25, 2023
3b97303
Change all LOG(INFO) to VLOG and fix the backward.
Xreki Apr 25, 2023
5a9f08d
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
JamesLim-sy Apr 25, 2023
6b9b49c
add scale for AF2 flash_attn, much thanks to xreki and shaojie for de…
JamesLim-sy Apr 27, 2023
2e31f24
much thanks to xreki and shaojie for debuging this flash_attn in Alph…
JamesLim-sy Apr 27, 2023
3262172
much thanks to xreki and shaojie for debuging this flash_attn in Alph…
JamesLim-sy Apr 27, 2023
8c02766
decrease the effect of debug print on performance
JamesLim-sy Apr 27, 2023
6d389d4
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 5, 2023
20cdc33
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
Xreki May 5, 2023
bd321df
Unify the initialize of flashattn arguments.
Xreki May 5, 2023
08a8b75
Rewirte the reshape of temp_mask and temp_bias.
Xreki May 5, 2023
6a65ee0
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 6, 2023
4682c0d
Update commit and fix reduce_dim.
Xreki May 6, 2023
165afab
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 6, 2023
dd2860e
API support use_flash_attn.
Xreki May 6, 2023
fe80730
Fix compiling error on CI.
Xreki May 8, 2023
184cf9a
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 8, 2023
65c6ed1
fix tag commit id with newest
JamesLim-sy May 8, 2023
462e36e
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy May 8, 2023
3458f8c
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy May 8, 2023
be44a91
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy May 8, 2023
a9ba1ba
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
Xreki May 8, 2023
c0f497a
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
JamesLim-sy May 8, 2023
bfe5a8c
Merge branch 'develop' of /~https://github.com/PaddlePaddle/Paddle into…
JamesLim-sy May 8, 2023
0b7fda0
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
JamesLim-sy May 8, 2023
ac3ff47
change flash_attn commit id
JamesLim-sy May 8, 2023
ce937f6
fix op unitest for fused_gate_attn
JamesLim-sy May 8, 2023
23c108f
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 10, 2023
ad3f70a
Merge branch 'add_flash_attn_for_af2' of /~https://github.com/JamesLim-…
Xreki May 11, 2023
7ff9f5e
Try to crop the flash-attention lib.
Xreki May 11, 2023
e92a9bb
Correct the condition of whether can use flash-attn.
Xreki May 11, 2023
df8c302
Remove the softmax_out argument.
Xreki May 12, 2023
fc3c281
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 12, 2023
d01c89c
Remove is_causal.
Xreki May 12, 2023
91a0ea5
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 17, 2023
f6be954
Polish codes.
Xreki May 17, 2023
ba84941
Fix qkv_transpose_out's shape and scaling of Q * K.
Xreki May 18, 2023
bee8537
Merge branch 'develop' into add_flash_attn_for_af2
Xreki May 18, 2023
3747978
Update commit of flash-attention.
Xreki May 18, 2023
Files changed
2 changes: 1 addition & 1 deletion cmake/external/flashattn.cmake
@@ -20,7 +20,7 @@ set(FLASHATTN_PREFIX_DIR ${THIRD_PARTY_PATH}/flashattn)
set(FLASHATTN_SOURCE_SUBDIR csrc/flash_attn)
set(FLASHATTN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flashattn)
set(FLASHATTN_REPOSITORY ${GIT_URL}/PaddlePaddle/flash-attention.git)
-set(FLASHATTN_TAG 5ff4bbf56ad066750407c4aef16ac740ebda0717)
+set(FLASHATTN_TAG 18106c1ba0ccee81b97ca947397c08a141815a47)

set(FLASHATTN_INCLUDE_DIR
"${FLASHATTN_INSTALL_DIR}/include"
Changes to another file (filename not shown)
@@ -27,6 +27,7 @@ std::tuple<paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor>
fused_gate_attention_dygraph_function(
const paddle::Tensor& Query,
Changes to another file (filename not shown)
@@ -26,6 +26,7 @@ std::tuple<paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor,
paddle::Tensor>
fused_gate_attention_dygraph_function(
const paddle::Tensor& Query,
@@ -181,6 +182,9 @@ fused_gate_attention_dygraph_function(
{"SoftmaxOut",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"SoftmaxLse",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"FMHAOut",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
@@ -256,6 +260,8 @@ fused_gate_attention_dygraph_function(
egr::EagerUtils::GetOutput(outs["QKVTransposeOut"][0], &QKVTransposeOut);
paddle::Tensor SoftmaxOut;
egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut);
paddle::Tensor SoftmaxLse;
egr::EagerUtils::GetOutput(outs["SoftmaxLse"][0], &SoftmaxLse);
paddle::Tensor FMHAOut;
egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut);
paddle::Tensor GateOut;
@@ -296,7 +302,7 @@ fused_gate_attention_dygraph_function(
p_autograd_Out);
// Create GradOpNode
auto grad_node = std::shared_ptr<fused_gate_attentionGradNodeCompat>(
-new fused_gate_attentionGradNodeCompat(8, 12));
+new fused_gate_attentionGradNodeCompat(9, 12));

bool merge_qkv = true;
if (attrs.count("merge_qkv")) {
@@ -308,6 +314,11 @@ fused_gate_attention_dygraph_function(
has_gating = PADDLE_GET_CONST(bool, attrs.at("has_gating"));
}

bool use_flash_attn = false;
if (attrs.count("use_flash_attn")) {
use_flash_attn = PADDLE_GET_CONST(bool, attrs.at("use_flash_attn"));
}

// Set Attributes
grad_node->SetAttrMap(std::move(attrs));
grad_node->SetDefaultAttrMap(std::move(default_attrs));
@@ -354,6 +365,12 @@ fused_gate_attention_dygraph_function(
grad_node->SetGradOutMeta(NonbatchedBias, 6);
}

if (use_flash_attn) {
grad_node->SetTensorWrapperSoftmaxLse(SoftmaxLse);
grad_node->SetTensorWrapperSrcMask(SrcMask);
grad_node->SetGradOutMeta(SrcMask, 7);
}

egr::EagerUtils::SetOutRankWithSlot(p_autograd_QueryTransposeOut, 0);
grad_node->SetGradInMeta(QueryTransposeOut, 0);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_KeyTransposeOut, 1);
@@ -379,6 +396,7 @@ fused_gate_attention_dygraph_function(
ValueTransposeOut,
QKVTransposeOut,
SoftmaxOut,
SoftmaxLse,
FMHAOut,
GateOut,
Out);
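For context on the new SoftmaxLse output added above (and the grad node slot count going from 8 to 9): flash attention never materializes the full softmax matrix, so the forward kernel keeps only a per-row log-sum-exp, and the backward pass recomputes the attention probabilities from Q, K, the bias/mask and that saved value. A rough sketch of the standard flash-attention bookkeeping, stated generically rather than copied from this PR's kernels:

L_i = \log \sum_j \exp(\tau \, q_i^\top k_j + b_{ij}), \qquad P_{ij} = \exp(\tau \, q_i^\top k_j + b_{ij} - L_i)

where \tau is the softmax scale (typically 1/\sqrt{d_{head}}) and b_{ij} collects the nonbatched bias and mask terms. Because only L_i has to be stored, the flash path saves SoftmaxLse for backward instead of a materialized SoftmaxOut.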
Changes to another file (filename not shown)
@@ -45,6 +45,11 @@ fused_gate_attentionGradNodeCompat::operator()(
has_gating = PADDLE_GET_CONST(bool, attr_map_.at("has_gating"));
}

bool use_flash_attn = false;
if (attr_map_.count("use_flash_attn")) {
use_flash_attn = PADDLE_GET_CONST(bool, attr_map_.at("use_flash_attn"));
}

std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
{{"FMHAOut",
egr::EagerUtils::TrySyncToVars(
@@ -168,6 +173,13 @@ fused_gate_attentionGradNodeCompat::operator()(
egr::Controller::Instance().GenerateUniqueName())};
}

if (use_flash_attn) {
auto SrcMask = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMask_);
ins0["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask);
auto SoftmaxLse = egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxLse_);
ins0["SoftmaxLse"] = egr::EagerUtils::TrySyncToVars(SoftmaxLse);
}

auto& attrs_map0 = this->attr_map_;
// Pass the entire attribute map to TraceOp
// The underlying kernel will pickup whatever attribute they need at runtime
10 changes: 10 additions & 0 deletions paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
@@ -61,12 +61,14 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
GateOut_.clear();
GateWeight_.clear();
NonbatchedBias_.clear();
SrcMask_.clear();
OutLinearBias_.clear();
OutLinearWeight_.clear();
QKVTransposeOut_.clear();
QKVWeight_.clear();
Query_.clear();
SoftmaxOut_.clear();
SoftmaxLse_.clear();
Key_.clear();
QueryWeight_.clear();
KeyWeight_.clear();
@@ -103,6 +105,9 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
void SetTensorWrapperNonbatchedBias(const paddle::Tensor& NonbatchedBias) {
NonbatchedBias_ = egr::TensorWrapper(NonbatchedBias, false);
}
void SetTensorWrapperSrcMask(const paddle::Tensor& SrcMask) {
SrcMask_ = egr::TensorWrapper(SrcMask, false);
}
void SetTensorWrapperOutLinearBias(const paddle::Tensor& OutLinearBias) {
OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false);
}
@@ -121,6 +126,9 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
void SetTensorWrapperSoftmaxOut(const paddle::Tensor& SoftmaxOut) {
SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
}
void SetTensorWrapperSoftmaxLse(const paddle::Tensor& SoftmaxLse) {
SoftmaxLse_ = egr::TensorWrapper(SoftmaxLse, false);
}
void SetTensorWrapperKey(const paddle::Tensor& Key) {
Key_ = egr::TensorWrapper(Key, false);
}
@@ -160,12 +168,14 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
egr::TensorWrapper GateOut_;
egr::TensorWrapper GateWeight_;
egr::TensorWrapper NonbatchedBias_;
egr::TensorWrapper SrcMask_;
egr::TensorWrapper OutLinearBias_;
egr::TensorWrapper OutLinearWeight_;
egr::TensorWrapper QKVTransposeOut_;
egr::TensorWrapper QKVWeight_;
egr::TensorWrapper Query_;
egr::TensorWrapper SoftmaxOut_;
egr::TensorWrapper SoftmaxLse_;

egr::TensorWrapper Key_;
egr::TensorWrapper QueryWeight_;
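The SrcMask_ and SoftmaxLse_ wrappers added above follow the eager grad node's save-for-backward pattern: at forward time the node wraps exactly the tensors its backward path will need, and at backward time it recovers them (via egr::EagerUtils::RecoverTensorWrapper in the real code). Below is a minimal standalone sketch of that pattern with simplified stand-in types; the type and method names are hypothetical, not the actual paddle::Tensor / egr::TensorWrapper API, and it assumes the flash path only consumes the log-sum-exp and the mask while the legacy path keeps the full softmax:

// Simplified, hypothetical sketch of the save-for-backward pattern;
// the real grad node uses egr::TensorWrapper members and
// egr::EagerUtils::RecoverTensorWrapper instead of std::optional.
#include <cassert>
#include <optional>
#include <string>
#include <vector>

struct Tensor {  // stand-in for paddle::Tensor
  std::string name;
  std::vector<float> data;
};

class FusedGateAttentionGradNode {
 public:
  // Called at forward time: capture only what the chosen backward path needs.
  void SaveForBackward(bool use_flash_attn,
                       const Tensor& softmax_out,
                       const Tensor& softmax_lse,
                       const Tensor& src_mask) {
    use_flash_attn_ = use_flash_attn;
    if (use_flash_attn_) {
      softmax_lse_ = softmax_lse;  // per-row log-sum-exp for recomputation
      src_mask_ = src_mask;        // mask is reapplied when P is rebuilt
    } else {
      softmax_out_ = softmax_out;  // legacy path stores the full softmax
    }
  }

  // Called at backward time: recover the wrapped tensors and dispatch.
  void Backward() const {
    if (use_flash_attn_) {
      assert(softmax_lse_.has_value() && src_mask_.has_value());
      // ... run the flash-attention backward kernel with *softmax_lse_,
      //     *src_mask_, Q/K/V and the upstream gradient.
    } else {
      assert(softmax_out_.has_value());
      // ... reuse the stored softmax matrix directly.
    }
  }

 private:
  bool use_flash_attn_ = false;
  std::optional<Tensor> softmax_out_;
  std::optional<Tensor> softmax_lse_;
  std::optional<Tensor> src_mask_;
};

On the flash path a caller would invoke SaveForBackward(true, {}, lse, mask) after the forward kernel and later Backward(); with use_flash_attn = false only the softmax tensor is retained, mirroring how the real grad node conditionally wraps SrcMask and SoftmaxLse.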