diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 9b0e784c0efb17..2e377e43ca3ec9 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -25,6 +25,8 @@ #include "glog/logging.h" +namespace egr { + static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, const paddle::experimental::Tensor& t) { if (!tensor->defined() || !tensor->initialized()) { @@ -36,14 +38,6 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -namespace egr { - -void GradNodeAccumulation::RetainGrad( - const std::function& hook) { - retain_grad_hook_ = hook; -} - std::vector> GradNodeAccumulation:: operator()( const std::vector>& grads) { @@ -59,17 +53,18 @@ operator()( "However received: %d in slot %d .", grads[0].size(), 0)); // Apply Gradient Hooks + paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { std::vector> hooked_grads = ApplyGradientHooks(grads); - // TODO(jiabin): It's little weird - CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]); + grad_out = hooked_grads[0][0]; } else { - CopyOrAddTensor(&accumulated_grad, grads[0][0]); + grad_out = grads[0][0]; } - if (retain_grad_hook_ != nullptr) { - retain_grad_hook_(accumulated_grad); + if (!weak_grad_.expired()) { + auto grad = weak_grad_.lock(); + CopyOrAddTensor(grad.get(), grad_out); } // Apply Reduce Hooks @@ -77,7 +72,7 @@ operator()( ApplyReduceHooks(); } - return {{accumulated_grad}}; + return {{grad_out}}; } void GradNodeAccumulation::RegisterReduceHook( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 3f53517204a5a1..787149ab305263 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/autograd_meta.h" #include 
"paddle/fluid/eager/grad_node_info.h" namespace egr { @@ -21,7 +22,10 @@ namespace egr { class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node - GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); } + explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + weak_grad_ = meta->WeakGrad(); + SetDefaultGradInOutMeta(); + } ~GradNodeAccumulation() override = default; @@ -30,11 +34,6 @@ class GradNodeAccumulation : public GradNodeBase { const std::vector>& grads) override; - void RetainGrad(const std::function& hook); - - paddle::experimental::Tensor* Grad() { return &accumulated_grad; } - std::string name() { return "GradNodeAccumulation"; } /** @@ -49,7 +48,7 @@ class GradNodeAccumulation : public GradNodeBase { void ApplyReduceHooks(); private: - paddle::experimental::Tensor accumulated_grad; + std::weak_ptr weak_grad_; std::function diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 7d2997eb884c86..748afe6d1f313d 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,9 +52,15 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - // TODO(jiabin): Support More Tensor type here +static void RetainGradForRegularNode( + const paddle::experimental::Tensor& tensor) { AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } + std::weak_ptr weak_grad_tensor = meta->WeakGrad(); @@ -79,21 +85,17 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { } }; - if (IsLeafTensor(tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = EagerUtils::grad_node(tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - 
paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RetainGrad(hook); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, hook); +} +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details + return; } else { - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RetainGradForRegularNode(tensor); } } diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index c06edef7017be1..628c0c500b3c4a 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -47,7 +47,7 @@ paddle::experimental::Tensor CreateTensorWithValue( auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { - auto accumulation_node = std::make_shared(); + auto accumulation_node = std::make_shared(meta); meta->SetGradNode(accumulation_node); meta->SetStopGradient(false); } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 7cddfd9c1c7dc1..e1f4d6ee9a129e 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1031,6 +1031,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // Skip Intermediate Tensor + if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " @@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_autograd_name = "p_autograd_" + 
output_name; size_t output_position = fwd_outputs_name_pos_map.at(output_name); + // Intermediate Tensor does not require SetHistory, nor RetainGrad + if (output.duplicable()) { pass_stop_gradient_args += ", &" + output_autograd_name; const char* SET_OUT_RANK_TEMPLATE = diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 53f17a4ffe58c5..9e1dc4f2c8c6ba 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta { "Should Not set NULL as GradNode pointer, since " "our default Edge and autogradMeta has nullptr for " "grad node. Set Nullptr will lead error.")); + grad_node_ = grad_node; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index d83fa916db66c3..27c376b4c80c6b 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -76,7 +76,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 1732e0513d5244..31aaa93c41643f 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -66,14 +66,13 @@ class 
TensorWrapper { } intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - PADDLE_ENFORCE_NOT_NULL( - EagerUtils::unsafe_autograd_meta(tensor), - paddle::platform::errors::Fatal( - "Full reserved Tensor should not have null autograd meta, since " - "tensor_wrapper is used to build backward info. There is no way " - "for us to build it with null autograd_meta.")); - // copy output_rank - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + + // If an output is marked "intermediate", we won't create + // autograd_meta for it. + // In that case, simply skip OutRankInfo Copy + if (EagerUtils::nullable_autograd_meta(tensor)) { + out_rank_info_ = EagerUtils::OutRankInfo(tensor); + } } paddle::experimental::Tensor recover( diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 682e55e7d92945..880bd268410271 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -17,11 +17,13 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" // TODO(jiabin): remove nolint here!!!
@@ -37,7 +39,7 @@ TEST(AccumulationNode, Tensor) { .get(), meta); dt0->mutable_data( - paddle::platform::CPUPlace())[0] = 10.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f); paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0); std::shared_ptr dt1 = std::make_shared( @@ -47,84 +49,100 @@ TEST(AccumulationNode, Tensor) { meta); dt1->mutable_data( - paddle::platform::CPUPlace())[0] = 20.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f); paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + + // Initialize Grad Tensor std::shared_ptr grad_dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt); + grad_dt->mutable_data( + paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); // AccumulationNode - GradNodeAccumulation node = GradNodeAccumulation(); - - // Hook, RetainGrad - std::function - hook = [&grad_et](const paddle::experimental::Tensor& t) { - grad_et.set_impl(t.impl()); - return grad_et; - }; - node.RetainGrad(hook); + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); // operator() - paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0]; + paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0]; + paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0]; + auto* ret_et1_ptr = 
std::dynamic_pointer_cast(ret_et1.impl()) ->data(); - CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f)); + CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f)); - // Retain Grad - auto* ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = std::dynamic_pointer_cast(grad->impl()) + ->data(); + CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f)); // Reduce Hook case 1: Call RegisterReduceHook and run operator() VLOG(6) << "Test Reduce Hook"; + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + auto reduce_hook_1 = [&](void) -> void { - auto* grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - grad_et_ptr[0] = 36.0; + auto* input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) + ->mutable_data( + paddle::platform::CPUPlace()); + input_et_ptr[0] = 36.0; VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook(reduce_hook_1); // operator() - paddle::experimental::Tensor _ret = node({{et0}})[0][0]; + paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; // Check operator() result, should be 36.0 auto* _ret_ptr = std::dynamic_pointer_cast(_ret.impl()) ->data(); - CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f)); // Check Retain Grad, should be 36.0 - auto* _ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) + auto* _ret_input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) ->data(); - CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f)); // Reduce Hook case 
2: Call RegisterReduceHook and ApplyReduceHooks directly VLOG(6) << "Test Reduce Hook"; auto reduce_hook_2 = [&](void) -> void { auto* ret_et0_ptr = std::dynamic_pointer_cast(et0.impl()) - ->data(); + ->mutable_data( + paddle::platform::CPUPlace()); ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_2); - node.ApplyReduceHooks(); + node->RegisterReduceHook(reduce_hook_2); + node->ApplyReduceHooks(); // Check ApplyReduceHooks result CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 771b324a69b5a9..a4bc56bd606f3f 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -59,22 +59,18 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } std::vector outs = {target_tensor}; @@ -123,22 +119,17 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); 
auto_grad_meta->SetStopGradient(false); - // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } @@ -201,22 +192,17 @@ TEST(Backward, LinearNodes) { std::vector res0 = {&meta0}; node0_ptr->AddEdges(&res0, 0); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node1 -> AccumulationNode via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(acc_node_ptr); - std::vector res1 = {&meta1}; + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; node1_ptr->AddEdges(&res1, 0); } @@ -311,22 +297,17 @@ TEST(Backward, WithAccumulation) { std::vector res1 = {&meta1}; node1_ptr->AddEdges(&res1, 0); + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode 
via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); - AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta2->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node2 -> AccumulationNode via Edge - auto meta2 = egr::AutogradMeta(); - meta2.SetStopGradient(false); - meta2.SetSingleOutRankWithSlot(0, 0); - meta2.SetGradNode(acc_node_ptr); - std::vector res2 = {&meta2}; + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; node2_ptr->AddEdges(&res2, 0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index a44ca6fcffbff5..524872b2e55638 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -46,34 +46,26 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { paddle::experimental::Tensor& target_tensor = target_tensors[0]; paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); - { - auto scale_node_ptr = std::make_shared(1, 1); - scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); - - scale_node_ptr->SetDefaultGradInOutMeta(); - - auto acc_node_ptr = std::make_shared(); - - AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(scale_node_ptr)); - auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - auto_grad_meta->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - - auto meta = AutogradMeta(); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetStopGradient(false); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - - AutogradMeta* 
auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); - auto_grad_meta1->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); - auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - } + + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + scale_node_ptr->SetDefaultGradInOutMeta(); + + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + + AutogradMeta* meta = EagerUtils::autograd_meta(&leaf_tensor); + auto acc_node_ptr = std::make_shared(meta); + meta->SetStopGradient(false); + meta->SetSingleOutRankWithSlot(0, 0); + meta->SetGradNode(acc_node_ptr); + std::vector res = {meta}; + scale_node_ptr->AddEdges(&res, 0); RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index bf2f620dd19bae..fbc71168fe4169 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -79,9 +79,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); - // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad { @@ -102,16 +99,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 - } - - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = 
{&meta}; - scale_node_ptr->AddEdges(&res, 0); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Retain Grad for leaf tensor1 @@ -123,9 +112,16 @@ TEST(RetainGrad, HookBeforeRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + + auto_grad_meta->SetStopGradient(false); + auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); @@ -160,8 +156,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad @@ -184,16 +178,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - } - // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { @@ -203,17 +187,18 @@ TEST(RetainGrad, HookAfterRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + auto_grad_meta->SetGradNode(acc_node_ptr); + auto_grad_meta->SetStopGradient(false); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + 
auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RetainGradForTensor( - leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 5d8dff5cd5b245..7464ad74135853 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/tensor_wrapper.h" @@ -21,7 +22,6 @@ #include "paddle/phi/common/layout.h" #include "paddle/phi/core/tensor_meta.h" -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" @@ -109,6 +109,16 @@ std::shared_ptr EagerUtils::grad_node( } } +paddle::experimental::Tensor* EagerUtils::mutable_grad( + const paddle::experimental::Tensor& target) { + auto* meta = nullable_autograd_meta(target); + if (meta) { + return meta->MutableGrad(); + } else { + return nullptr; + } +} + void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { @@ -342,7 +352,8 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } else { if (!autograd_ptr->StopGradient()) { VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); - autograd_ptr->SetGradNode(std::make_shared()); + autograd_ptr->SetGradNode( + std::make_shared(autograd_ptr)); return autograd_ptr->GetMutableGradNode(); } else { return nullptr; diff --git a/paddle/fluid/eager/utils.h 
b/paddle/fluid/eager/utils.h index b74d68db2a6d50..fa5735e6f32a0c 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -102,6 +102,8 @@ class EagerUtils { static std::shared_ptr grad_node( const paddle::experimental::Tensor& target); + static paddle::experimental::Tensor* mutable_grad( + const paddle::experimental::Tensor& target); // Set history is used to set backward info during forward process, it will // set forward var's autograd meta's grad node as current backward node. diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 2296169a161046..d9a2dcb6869096 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -86,7 +86,8 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, if (!autograd_meta->GetMutableGradNode()) { VLOG(3) << "Tensor(" << name << ") have not GradNode, add GradNodeAccumulation for it."; - autograd_meta->SetGradNode(std::make_shared()); + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); } } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 27328bea692af6..4e900ae2ffbc11 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -177,7 +177,7 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, if (!meta->GetMutableGradNode()) { VLOG(6) << "Make grad node of tensor: " << self->tensor.name() << "become accumulation node"; - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); } egr::egr_utils_api::RetainGradForTensor(self->tensor); } @@ -199,17 +199,12 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, paddle::experimental::Tensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - 
paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - grad = accumulation_grad_node->Grad(); + grad = egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); grad = meta->MutableGrad(); @@ -248,19 +243,15 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - if (accumulation_grad_node->Grad()->initialized()) { - accumulation_grad_node->Grad()->set_impl( - paddle::experimental::zeros_like(*(accumulation_grad_node->Grad())) - .impl()); + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + if (grad->initialized()) { + grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 43cfb50f2afe11..2e1390cb96155c 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -70,26 +70,13 @@ PyObject* 
tensor_properties_get_stop_gradient(TensorObject* self, PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - return ToPyObject(*accumulation_grad_node->Grad()); + VLOG(6) << "Get grad for tensor: " << self->tensor.name(); + auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + if (meta) { + return ToPyObject(meta->Grad()); } else { - VLOG(6) << "Get grad for tensor: " << self->tensor.name(); - auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { - return ToPyObject(meta->Grad()); - } else { - Py_INCREF(Py_None); - return Py_None; - } + Py_INCREF(Py_None); + return Py_None; } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -101,16 +88,15 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->Grad()->copy_(src, true); + + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + grad->copy_(src, true); return 0; 
EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 92cba4fca5aba0..848ebae0706e3c 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -606,8 +606,12 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict): if is_input: v = self._create_var_from_numpy(np_value_temp) + if if_return_inputs_grad_dict: v.stop_gradient = False + if _in_eager_mode(): + v.retain_grads() + if has_lod: v.value().get_tensor().set_recursive_sequence_lengths( lod_temp) @@ -618,7 +622,6 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict): type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, stop_gradient=False) - return v # prepare variable for input or output @@ -681,7 +684,6 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): # prepare input variable inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, True, False, block) - # prepare output variable outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) @@ -1741,6 +1743,7 @@ def _get_dygraph_grad(self, for attrs_name in self.attrs: if self.attrs[attrs_name] is not None: attrs_outputs[attrs_name] = self.attrs[attrs_name] + block.append_op( type=self.op_type, inputs=inputs, @@ -1817,7 +1820,9 @@ def _get_dygraph_grad(self, inputs={"X": loss_sum}, outputs={"Out": loss}, attrs={'scale': 1.0 / float(len(avg_sum))}) + loss.backward() + fetch_list_grad = [] for inputs_to_check_name in inputs_to_check: a = inputs_grad_dict[inputs_to_check_name].gradient()