Added RunBackward and HookUtils to Eager Dygraph (PaddlePaddle#37599)
jim19930609 authored and Zjq9409 committed Dec 10, 2021
1 parent 8f0a578 commit 0d32f7e
Showing 13 changed files with 997 additions and 2 deletions.
1 change: 1 addition & 0 deletions paddle/fluid/eager/CMakeLists.txt
@@ -6,3 +6,4 @@
cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api)
cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api)
cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulation)
cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta)
+cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
2 changes: 1 addition & 1 deletion paddle/fluid/eager/api/CMakeLists.txt
@@ -1,4 +1,4 @@
add_subdirectory(utils)
add_subdirectory(generated)

-cc_library(eager_api SRCS all.cc DEPS global_utils eager_scale)
+cc_library(eager_api SRCS all.cc DEPS tensor_utils hook_utils global_utils eager_scale)
2 changes: 2 additions & 0 deletions paddle/fluid/eager/api/all.h
@@ -16,3 +16,5 @@

#include "paddle/fluid/eager/api/generated/eager_generated/forwards/scale.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
1 change: 1 addition & 0 deletions paddle/fluid/eager/api/utils/CMakeLists.txt
@@ -1,2 +1,3 @@
cc_library(tensor_utils SRCS tensor_utils.cc DEPS pten pten_api autograd_meta grad_node_info accumulation_node)
+cc_library(hook_utils SRCS hook_utils.cc DEPS pten tensor_utils autograd_meta grad_node_info utils accumulation_node)
cc_library(global_utils SRCS global_utils.cc DEPS place)
93 changes: 93 additions & 0 deletions paddle/fluid/eager/api/utils/hook_utils.cc
@@ -0,0 +1,93 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/pten/core/dense_tensor.h"

namespace egr {

void RegisterGradientHookForTensor(
const egr::EagerTensor& tensor,
std::function<egr::EagerTensor(const egr::EagerTensor&)>& hook) {
// Find grad_node and out_rank from AutogradMeta
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo();

grad_node->RegisterGradientHook(rank_info.first, rank_info.second, hook);
}

void RegisterReduceHookForTensor(const egr::EagerTensor& tensor,
const std::function<void(void)>& hook) {
// Find grad_node and out_rank from AutogradMeta
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);

grad_node->RegisterReduceHook(hook);
}

void RetainGradForTensor(const egr::EagerTensor& tensor) {
// TODO(jiabin): Support more tensor types here
AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor);
egr::EagerTensor* grad_tensor = meta->MutableGrad();

// Define Hook
std::function<egr::EagerTensor(const egr::EagerTensor&)> hook =
[grad_tensor](const egr::EagerTensor& t) {
if (!grad_tensor) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Detected null grad_tensor."
"Grad tensor in AutogradMeta of should not be nullptr"));
}
if (t.defined()) {
// Simply Copy impl() to grad_tensor
grad_tensor->set_impl(t.impl());
return *grad_tensor;
} else {
PADDLE_ENFORCE_EQ(
t.Var().IsInitialized(), true,
paddle::platform::errors::Fatal(
"Detected uninitialized variable, causing segmentation fault "
"inside the hook."
"Variable %s has to be initialized while we need to set it."
"please check tensor initialization status.",
t.name()));
grad_tensor->MutableVar()
->GetMutable<paddle::framework::LoDTensor>()
->ShareDataWith(t.Var().Get<paddle::framework::LoDTensor>());
return *grad_tensor;
}
};

if (IsLeafTensor(tensor)) {
// Add RetainGrad as PostHook to AccumulationNode
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"));
auto accumulation_grad_node =
std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node);
accumulation_grad_node->RetainGrad(hook);

} else {
// Append to GradientHooks
RegisterGradientHookForTensor(tensor, hook);
}
}

} // namespace egr
30 changes: 30 additions & 0 deletions paddle/fluid/eager/api/utils/hook_utils.h
@@ -0,0 +1,30 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/pten/api/all.h"
namespace egr {

void RegisterGradientHookForTensor(
const egr::EagerTensor& tensor,
std::function<egr::EagerTensor(const egr::EagerTensor&)>& hook);

void RegisterReduceHookForTensor(const egr::EagerTensor& tensor,
const std::function<void(void)>& hook);
void RetainGradForTensor(const egr::EagerTensor& tensor);

} // namespace egr
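
Taken together, this header exposes the commit's hook surface. The sketch below (not part of the commit) shows how the three entry points might be driven; MakeLeafTensor() is a hypothetical fixture standing in for whatever test setup produces a leaf EagerTensor with initialized AutogradMeta:

#include "glog/logging.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"

egr::EagerTensor MakeLeafTensor();  // hypothetical fixture, an assumption

void HookUsageSketch() {
  egr::EagerTensor leaf = MakeLeafTensor();

  // A gradient hook receives the incoming grad and returns the grad that
  // downstream accumulation will actually see (identity here). The
  // non-const reference parameter requires a named std::function.
  std::function<egr::EagerTensor(const egr::EagerTensor&)> hook =
      [](const egr::EagerTensor& grad) { return grad; };
  egr::RegisterGradientHookForTensor(leaf, hook);

  // A reduce hook is a void callback fired after the node's grads have
  // been fully accumulated.
  egr::RegisterReduceHookForTensor(leaf, []() { VLOG(6) << "reduced"; });

  // Keep the leaf's grad after backward; internally this installs the
  // copy-to-MutableGrad hook defined in hook_utils.cc above.
  egr::RetainGradForTensor(leaf);
}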
212 changes: 212 additions & 0 deletions paddle/fluid/eager/backward.cc
@@ -0,0 +1,212 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/eager/backward.h"
#include <queue>
#include <unordered_map>
#include <unordered_set>

#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/fluid/eager/utils.h"

#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"

#include "glog/logging.h"

namespace egr {

std::unordered_map<GradNodeBase*, int> getInDegreeMap(
const std::queue<GradNodeBase*>& init_queue) {
// Calculate in_degree for each node
// This pass could be removed entirely if in_degree were set during the
// forward pass
std::unordered_map<GradNodeBase*, int> node_in_degree_map;

// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;

// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();

if (visited.count(node)) {
continue;
}
visited.insert(node);

// Find and append next nodes
const std::vector<std::vector<Edge>>& edges = node->GetEdges();
for (const auto& edge_list : edges) {
for (const Edge& edge : edge_list) {
GradNodeBase* next_node = edge.GetMutableGradNode().get();
// Update in_degree
if (!node_in_degree_map.count(next_node))
node_in_degree_map[next_node] = 0;
node_in_degree_map[next_node]++;
queue.push(next_node);
}
}
}

return node_in_degree_map;
}

void RunBackward(const std::vector<egr::EagerTensor>& tensors,
const std::vector<egr::EagerTensor>& grad_tensors,
bool retain_graph) {
VLOG(6) << "Start Backward";
// *Gradient Hook should happen at node-level
// *Inplace version check should perform at node-level
// *Cross-batch accumulation happens at forward pass

/* --- Initialization --- */
// 1. Init queue with starting nodes
// 2. Prepare initial input buffers
std::queue<GradNodeBase*> queue;
std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
node_input_buffers_dict;
for (size_t i = 0; i < tensors.size(); i++) {
const egr::EagerTensor& tensor = tensors[i];

AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(tensor);
// Get grad input info from target tensors
auto input_info = auto_grad_meta->OutRankInfo();

VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first
<< ", rank: " << input_info.second;
// Get target GradNodeBase from target tensors
GradNodeBase* grad_node = auto_grad_meta->GetMutableGradNode().get();

PADDLE_ENFORCE(grad_node,
paddle::platform::errors::Fatal(
"Detected null grad_node."
"Grad Node is nullptr for grad input tensor %d",
i));
// Prepare GradTensorHolder
if (!node_input_buffers_dict.count(grad_node)) {
VLOG(6) << "Create Value for grad input tensor " << i;
node_input_buffers_dict[grad_node] =
std::make_unique<GradTensorHolder>(grad_node->InputMeta());
}

if (grad_tensors.size() > 0) {
PADDLE_ENFORCE(
grad_tensors.size() == tensors.size(),
paddle::platform::errors::Fatal(
"Detected size mismatch between tensors and grad_tensors"
"grad_tensors should either have "
"size = 0 or same size as tensors"));
// Feed given tensor if it's provided
VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, grad_tensors[i]);

} else {
VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
// Initialize tensor with 1.0
// Forward Tensor "tensor" is passed to indicate tensortype, datatype and
// dims
// GradTensorHolder will initialize another tensor with same tensortype,
// datatype and dims but filled with 1.0
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, tensor, true /*fill_one=true*/);
}

// Prepare queue
queue.push(grad_node);
}

VLOG(6) << "Update In degree Map for backward";
// 3. Compute in_degree for each node
std::unordered_map<GradNodeBase*, int> node_in_degree_map =
getInDegreeMap(queue);

/* --- Topological Visit --- */
// 1. Pop queue
// 2. Run node
// |- node(grads)
// |- Prepare for next node
// 3. Update queue
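// Continuing the A->B, A->C, B->C illustration above: A runs first and its
// grad outputs are summed into B's and C's GradTensorHolders; B is queued
// once its in-degree drops to 0, and C runs last, after grads from both A
// and B have been accumulated into its holder.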
VLOG(6) << "Run Backward";
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();

// Run node: This is where Hook happens
PADDLE_ENFORCE(
node_input_buffers_dict.count(node),
paddle::platform::errors::Fatal(
"Unable to find next node in the InputBuufer"
"Trying to run Node without configuring its GradTensorHolder"));

std::unique_ptr<GradTensorHolder> node_input_buffer =
std::move(node_input_buffers_dict[node]);
VLOG(6) << "Run Backward Kernel with input_buffer";
// Run Backward Node and get outputs
std::vector<std::vector<egr::EagerTensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers());
// TODO(jiabin): Should we erase it or find a more efficient way?
node_input_buffers_dict.erase(node);

// Prepare GradTensorHolder for next node
const std::vector<std::vector<Edge>>& edges = node->GetEdges();

PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(),
paddle::platform::errors::Fatal(
"Number of edges should be either empty ( for leaf node "
") or the same as number of output grad tensors"));

for (size_t i = 0; i < edges.size(); i++) {
for (size_t j = 0; j < edges[i].size(); j++) {
const Edge& edge = edges[i][j];
auto edge_rank = edge.GetEdgeRankInfo();
// Since edges are constructed with the same rank as the bwd outputs, we
// index them with the same rank (i, j)
VLOG(6) << "Get Edge with slot: " << i << ", rank: " << j;
egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j];
if (!grad_output_tensor.defined() ||
!grad_output_tensor.initialized()) {
VLOG(6) << "We get grad_output_tensor with slot: " << i
<< ", rank: " << j << " as uninitialized or undefined tensor";
}
GradNodeBase* next_node = edge.GetMutableGradNode().get();

if (!node_input_buffers_dict.count(next_node)) {
node_input_buffers_dict[next_node] =
std::make_unique<GradTensorHolder>(next_node->InputMeta());
}
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
<< ", rank: " << edge_rank.second;
node_input_buffers_dict[next_node]->add(
edge_rank.first, edge_rank.second, grad_output_tensor);

// Update queue
node_in_degree_map[next_node]--;
PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0,
paddle::platform::errors::Fatal(
"Detected in-degree value smaller than zero."
"Node's in-degree cannot be negative"));
if (node_in_degree_map[next_node] == 0) {
queue.push(next_node);
}
}
}
}
}

} // namespace egr
31 changes: 31 additions & 0 deletions paddle/fluid/eager/backward.h
@@ -0,0 +1,31 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/pten/api/all.h"

namespace egr {

// RunBackward():
// tensors corresponds to the targets that live in the backward graph,
// and each grad_tensors[i] holds the initial gradient value for its
// corresponding tensors[i]
void RunBackward(const std::vector<egr::EagerTensor> &tensors,
const std::vector<egr::EagerTensor> &grad_tensors,
bool retain_graph = false);

// Reserved for gradient()

} // namespace egr
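
For orientation, a sketch (not part of the commit) of how a caller might drive RunBackward; "out" stands for a hypothetical EagerTensor produced by an eager-mode forward op, so its AutogradMeta already holds a grad node and out-rank info:

#include "paddle/fluid/eager/backward.h"

void BackwardSketch(const egr::EagerTensor& out) {
  std::vector<egr::EagerTensor> targets{out};

  // Empty grad_tensors: RunBackward seeds each target's GradTensorHolder
  // with a ones-like tensor (the fill_one path in backward.cc).
  egr::RunBackward(targets, {});

  // Or seed with an explicit initial gradient of matching shape/dtype,
  // keeping the graph for a second pass:
  // egr::RunBackward(targets, {initial_grad}, /*retain_graph=*/true);
}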
2 changes: 1 addition & 1 deletion paddle/fluid/eager/tests/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(eager_deps pten pten_api tensor_utils utils global_utils pten_tensor autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
+set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor autograd_meta grad_node_info grad_tensor_holder gradient_accumulation accumulation_node)
set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)

add_subdirectory(data_structure_tests)