Refactored GradNodeAccumulation data structure and behaviour #39526

Merged on Feb 24, 2022 (15 commits)
23 changes: 9 additions & 14 deletions paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -25,6 +25,8 @@

#include "glog/logging.h"

namespace egr {

static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
const paddle::experimental::Tensor& t) {
if (!tensor->defined() || !tensor->initialized()) {
@@ -36,14 +38,6 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
}

namespace egr {

void GradNodeAccumulation::RetainGrad(
const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook) {
retain_grad_hook_ = hook;
}

std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
@@ -59,25 +53,26 @@ operator()(
"However received: %d in slot %d .",
grads[0].size(), 0));
// Apply Gradient Hooks
paddle::experimental::Tensor grad_out;
if (GradientHooksRegistered()) {
std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads =
ApplyGradientHooks(grads);
// TODO(jiabin): It's a little weird
CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]);
grad_out = hooked_grads[0][0];
} else {
CopyOrAddTensor(&accumulated_grad, grads[0][0]);
grad_out = grads[0][0];
}

if (retain_grad_hook_ != nullptr) {
retain_grad_hook_(accumulated_grad);
if (!weak_grad_.expired()) {
auto grad = weak_grad_.lock();
CopyOrAddTensor(grad.get(), grad_out);
}

// Apply Reduce Hooks
if (ReduceHooksRegistered()) {
ApplyReduceHooks();
}

return {{accumulated_grad}};
return {{grad_out}};
}

void GradNodeAccumulation::RegisterReduceHook(
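To make the new behaviour concrete, the following is a minimal, self-contained C++ sketch of the pattern this diff moves to (simplified stand-in types, not Paddle's actual Tensor or GradNodeBase API): the node no longer owns an accumulated_grad member; it holds only a std::weak_ptr to the grad tensor owned by the tensor's AutogradMeta, accumulates into that tensor while it is still alive, and returns the (possibly hooked) incoming gradient as grad_out.

#include <iostream>
#include <memory>

// Simplified stand-ins for paddle::experimental::Tensor and CopyOrAddTensor.
struct Tensor {
  explicit Tensor(bool init = false, float v = 0.0f)
      : initialized(init), value(v) {}
  bool initialized;
  float value;
};

static void CopyOrAdd(Tensor* dst, const Tensor& src) {
  if (!dst->initialized) {
    *dst = src;               // first gradient: copy
  } else {
    dst->value += src.value;  // later gradients: accumulate
  }
}

// Models the refactored GradNodeAccumulation: a non-owning weak reference
// to the grad tensor that AutogradMeta owns.
class AccumulationNodeSketch {
 public:
  explicit AccumulationNodeSketch(std::weak_ptr<Tensor> weak_grad)
      : weak_grad_(std::move(weak_grad)) {}

  // Accumulate into the weakly held grad if it is still alive and
  // return the incoming gradient itself, not the accumulator.
  Tensor operator()(const Tensor& grad_in) {
    Tensor grad_out = grad_in;  // gradient hooks would be applied here
    if (auto grad = weak_grad_.lock()) {
      CopyOrAdd(grad.get(), grad_out);
    }
    return grad_out;
  }

 private:
  std::weak_ptr<Tensor> weak_grad_;
};

int main() {
  auto grad = std::make_shared<Tensor>();  // owned by the "AutogradMeta"
  AccumulationNodeSketch node(grad);

  node(Tensor(true, 10.0f));
  node(Tensor(true, 20.0f));
  std::cout << grad->value << "\n";        // 30: accumulated into the owned grad

  grad.reset();                            // owner released the grad tensor
  Tensor out = node(Tensor(true, 5.0f));   // weak_ptr expired: no write, no crash
  std::cout << out.value << "\n";          // 5: grad_out is still returned
  return 0;
}

Because the reference is weak, the node neither keeps the gradient storage alive on its own nor carries a second copy of it; ownership stays with AutogradMeta, which is what allows the RetainGrad hook machinery to be removed from this node.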
13 changes: 6 additions & 7 deletions paddle/fluid/eager/accumulation/accumulation_node.h
@@ -14,14 +14,18 @@

#pragma once

#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"

namespace egr {

class GradNodeAccumulation : public GradNodeBase {
public:
// Constructor: configure fwd input tensors to grad node
GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); }
explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
weak_grad_ = meta->WeakGrad();
SetDefaultGradInOutMeta();
}

~GradNodeAccumulation() override = default;

@@ -30,11 +34,6 @@ class GradNodeAccumulation : public GradNodeBase {
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;

void RetainGrad(const std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>& hook);

paddle::experimental::Tensor* Grad() { return &accumulated_grad; }

std::string name() { return "GradNodeAccumulation"; }

/**
@@ -49,7 +48,7 @@
void ApplyReduceHooks();

private:
paddle::experimental::Tensor accumulated_grad;
std::weak_ptr<paddle::experimental::Tensor> weak_grad_;

std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
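On the data-structure side, the privately owned accumulated_grad tensor becomes a std::weak_ptr, and the constructor now takes the tensor's AutogradMeta in order to obtain it via WeakGrad(). The ownership relation this relies on is presumably along the lines of the hypothetical sketch below (simplified names; AutogradMetaSketch is not Paddle's class): the meta object is the single owner of the gradient storage and hands the node a non-owning view.

#include <memory>

struct Tensor {
  float value = 0.0f;
};

// Hypothetical sketch of the ownership relation the new constructor relies on.
class AutogradMetaSketch {
 public:
  AutogradMetaSketch() : grad_(std::make_shared<Tensor>()) {}

  // Non-owning view handed to GradNodeAccumulation (cf. meta->WeakGrad()).
  std::weak_ptr<Tensor> WeakGrad() const { return grad_; }

  // Owning access used elsewhere, e.g. to read the retained gradient.
  Tensor* MutableGrad() { return grad_.get(); }

 private:
  std::shared_ptr<Tensor> grad_;  // single owner of the gradient storage
};

int main() {
  AutogradMetaSketch meta;
  std::weak_ptr<Tensor> weak = meta.WeakGrad();
  if (auto grad = weak.lock()) {
    grad->value = 1.0f;  // writes land in the meta-owned storage
  }
  return meta.MutableGrad()->value == 1.0f ? 0 : 1;
}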
32 changes: 17 additions & 15 deletions paddle/fluid/eager/api/utils/hook_utils.cc
@@ -52,9 +52,15 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor,
}
}

void RetainGradForTensor(const paddle::experimental::Tensor& tensor) {
// TODO(jiabin): Support More Tensor type here
Reviewer comment (Contributor) on this function: merge this with RegisterGradientHookForTensor

static void RetainGradForRegularNode(
const paddle::experimental::Tensor& tensor) {
AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor);
if (meta->RetainGrads()) {
return;
} else {
meta->SetRetainGrads(true);
}

std::weak_ptr<paddle::experimental::Tensor> weak_grad_tensor =
meta->WeakGrad();

@@ -79,21 +85,17 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) {
}
};

if (IsLeafTensor(tensor)) {
// Add RetainGrad as PostHook to AccumulationNode
std::shared_ptr<GradNodeBase> grad_node = EagerUtils::grad_node(tensor);
PADDLE_ENFORCE(
grad_node.get() != nullptr,
paddle::platform::errors::Fatal("Detected NULL grad_node"
"Leaf tensor should have had grad_node "
"with type: GradNodeAccumulation"));
auto accumulation_grad_node =
std::dynamic_pointer_cast<GradNodeAccumulation>(grad_node);
accumulation_grad_node->RetainGrad(hook);
// Append to GradientHooks
RegisterGradientHookForTensor(tensor, hook);
}

void RetainGradForTensor(const paddle::experimental::Tensor& tensor) {
if (IsLeafTensor(tensor)) {
// Leaf tensor's grad will always be retained
// Refer to implementation of AccumulationNode for more details
return;
} else {
// Append to GradientHooks
RegisterGradientHookForTensor(tensor, hook);
RetainGradForRegularNode(tensor);
}
}

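The hook body is collapsed in the hunk above; what is visible is that RetainGradForRegularNode captures meta->WeakGrad() into the hook. A minimal sketch of that pattern follows, using hypothetical simplified types rather than Paddle's real hook signature: the hook copies the incoming gradient into the weakly referenced grad storage (if it is still alive) and passes the gradient through unchanged. Leaf tensors skip all of this because GradNodeAccumulation now writes their grad directly, as shown in accumulation_node.cc above.

#include <functional>
#include <iostream>
#include <memory>

struct Tensor {
  float value = 0.0f;
};

using GradientHook = std::function<Tensor(const Tensor&)>;

// Sketch of the retain-grad hook registered for non-leaf tensors
// (hypothetical names; the real hook works on paddle::experimental::Tensor).
GradientHook MakeRetainGradHook(std::weak_ptr<Tensor> weak_grad) {
  return [weak_grad](const Tensor& grad_in) -> Tensor {
    if (auto grad = weak_grad.lock()) {
      grad->value = grad_in.value;  // retain the gradient for later inspection
    }
    return grad_in;                 // pass the gradient through unchanged
  };
}

int main() {
  auto retained = std::make_shared<Tensor>();  // grad storage owned by the meta
  GradientHook hook = MakeRetainGradHook(retained);

  Tensor incoming;
  incoming.value = 3.0f;
  hook(incoming);                        // simulates the hook firing in backward
  std::cout << retained->value << "\n";  // 3
  return 0;
}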
2 changes: 1 addition & 1 deletion paddle/fluid/eager/api/utils/tensor_utils.cc
@@ -47,7 +47,7 @@ paddle::experimental::Tensor CreateTensorWithValue(

auto meta = EagerUtils::autograd_meta(&out);
if (is_leaf) {
auto accumulation_node = std::make_shared<GradNodeAccumulation>();
auto accumulation_node = std::make_shared<GradNodeAccumulation>(meta);
meta->SetGradNode(accumulation_node);
meta->SetStopGradient(false);
}
4 changes: 4 additions & 0 deletions paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -1031,6 +1031,8 @@ static std::string GenerateGradNodeCreationContent(
const std::string& output_name = output.name();
const std::string& output_autograd_name = "p_autograd_" + output_name;

// Skip Intermediate Tensor

if (output.duplicable()) {
const char* GET_MULTI_AUTOGRAD_META_TEMPLATE =
" std::vector<egr::AutogradMeta*> %s = "
@@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent(
const std::string& output_autograd_name = "p_autograd_" + output_name;
size_t output_position = fwd_outputs_name_pos_map.at(output_name);

// Intermediate Tensor does not require SetHistory, nor RetainGrad

if (output.duplicable()) {
pass_stop_gradient_args += ", &" + output_autograd_name;
const char* SET_OUT_RANK_TEMPLATE =
1 change: 1 addition & 0 deletions paddle/fluid/eager/autograd_meta.h
@@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta {
"Should Not set NULL as GradNode pointer, since "
"our default Edge and autogradMeta has nullptr for "
"grad node. Set Nullptr will lead error."));

grad_node_ = grad_node;
}

4 changes: 2 additions & 2 deletions paddle/fluid/eager/grad_node_info.cc
@@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>());
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
}
@@ -76,7 +76,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
} else {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>());
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
15 changes: 7 additions & 8 deletions paddle/fluid/eager/tensor_wrapper.h
@@ -66,14 +66,13 @@
}

intermidiate_tensor_.set_name(tensor.name() + "@Saved");
PADDLE_ENFORCE_NOT_NULL(
EagerUtils::unsafe_autograd_meta(tensor),
paddle::platform::errors::Fatal(
"Full reserved Tensor should not have null autograd meta, since "
"tensor_wrapper is used to build backward info. There is no way "
"for us to build it with null autograd_meta."));
// copy output_rank
out_rank_info_ = EagerUtils::OutRankInfo(tensor);

// If an output is marked "intermediate", we won't create
// autograd_meta for it.
// In that case, simply skip OutRankInfo Copy
if (EagerUtils::nullable_autograd_meta(tensor)) {
out_rank_info_ = EagerUtils::OutRankInfo(tensor);
}
}

paddle::experimental::Tensor recover(
@@ -17,11 +17,13 @@
#include "gtest/gtest.h"

#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/utils/hook_utils.h"
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/grad_tensor_holder.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/fluid/eager/utils.h"

#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"

// TODO(jiabin): remove nolint here!!!
@@ -37,7 +39,7 @@ TEST(AccumulationNode, Tensor) {
.get(),
meta);
dt0->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 10.0;
paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f);
paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0);

std::shared_ptr<phi::DenseTensor> dt1 = std::make_shared<phi::DenseTensor>(
@@ -47,84 +49,100 @@ TEST(AccumulationNode, Tensor) {
meta);

dt1->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = 20.0;
paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f);
paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1);

std::shared_ptr<phi::DenseTensor> input_dt =
std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
meta);
paddle::experimental::Tensor input_et =
paddle::experimental::Tensor(input_dt);
auto grad_meta = EagerUtils::autograd_meta(&input_et);

// Initialize Grad Tensor
std::shared_ptr<phi::DenseTensor> grad_dt =
std::make_shared<phi::DenseTensor>(
std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace())
.get(),
meta);
paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt);
grad_dt->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f);
grad_meta->MutableGrad()->set_impl(grad_dt);

// AccumulationNode
GradNodeAccumulation node = GradNodeAccumulation();

// Hook, RetainGrad
std::function<paddle::experimental::Tensor(
const paddle::experimental::Tensor&)>
hook = [&grad_et](const paddle::experimental::Tensor& t) {
grad_et.set_impl(t.impl());
return grad_et;
};
node.RetainGrad(hook);
auto node = std::make_shared<GradNodeAccumulation>(grad_meta);
grad_meta->SetGradNode(node);
grad_meta->SetStopGradient(false);

// operator()
paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0];
paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0];
auto* ret_et0_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et0.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f));

paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0];
paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0];

auto* ret_et1_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et1.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f));
CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f));

// Retain Grad
auto* ret_grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f));
// Check Retain Grad
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>()[0],
paddle::platform::float16(10.0f));
paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et);
auto* grad_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(grad->impl())
->data<paddle::platform::float16>();
CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f));

// Reduce Hook case 1: Call RegisterReduceHook and run operator()
VLOG(6) << "Test Reduce Hook";
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>()[0],
paddle::platform::float16(10.0f));

auto reduce_hook_1 = [&](void) -> void {
auto* grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
->data<paddle::platform::float16>();
grad_et_ptr[0] = 36.0;
auto* input_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(input_et.impl())
->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace());
input_et_ptr[0] = 36.0;
VLOG(6) << "Running Reduce Hook";
};

node.RegisterReduceHook(reduce_hook_1);
node->RegisterReduceHook(reduce_hook_1);

// operator()
paddle::experimental::Tensor _ret = node({{et0}})[0][0];
paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0];

// Check operator() result, should be 36.0
auto* _ret_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(_ret.impl())
->data<paddle::platform::float16>();
CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f));
CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f));

// Check Retain Grad, should be 36.0
auto* _ret_grad_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(grad_et.impl())
auto* _ret_input_et_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(input_et.impl())
->data<paddle::platform::float16>();
CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f));
CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f));

// Reduce Hook case 2: Call RegisterReduceHook and ApplyReduceHooks directly
VLOG(6) << "Test Reduce Hook";
auto reduce_hook_2 = [&](void) -> void {
auto* ret_et0_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())
->data<paddle::platform::float16>();
->mutable_data<paddle::platform::float16>(
paddle::platform::CPUPlace());
ret_et0_ptr[0] = 100.0; // set to 100.0
VLOG(6) << "Running Reduce Hook";
};
node.RegisterReduceHook(reduce_hook_2);
node.ApplyReduceHooks();
node->RegisterReduceHook(reduce_hook_2);
node->ApplyReduceHooks();

// Check ApplyReduceHooks result
CHECK_EQ(std::dynamic_pointer_cast<phi::DenseTensor>(et0.impl())