[new-exec] enable check_nan_inf (#36802)
* enable check_nan_inf and fix variable scope

* add ut

* fix bug

* update ut

* revert doc change

* fix npu compile
zhiqiu authored Oct 29, 2021
1 parent 82fb63e commit be55bac
Showing 9 changed files with 108 additions and 42 deletions.
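The check itself is gated by the existing FLAGS_check_nan_inf gflag, which this commit wires into the new executor. A hedged sketch of turning it on programmatically (only the DECLARE_bool mirrors the declaration added in interpretercore.cc below; the standalone harness is illustrative, not part of the commit):

// Sketch: enabling the NaN/Inf check. Assumes linking against the Paddle
// framework, which actually defines the flag.
#include "gflags/gflags.h"

DECLARE_bool(check_nan_inf);

int main() {
  // Equivalent to exporting FLAGS_check_nan_inf=1 before running a program.
  FLAGS_check_nan_inf = true;
  // ... run a program through InterpreterCore; after every kernel launch,
  // RunInstruction now calls framework::details::CheckOpHasNanOrInf and
  // raises an error if any op output contains NaN or Inf.
  return 0;
}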
2 changes: 1 addition & 1 deletion paddle/fluid/framework/CMakeLists.txt
@@ -117,7 +117,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)

cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto)
cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope)
if (WITH_GPU)
target_link_libraries(var_type_traits dynload_cuda)
endif()
6 changes: 3 additions & 3 deletions paddle/fluid/framework/details/nan_inf_utils.h
@@ -27,7 +27,7 @@ namespace framework {
namespace details {
// assert false when meets NAN or inf
void CheckVarHasNanOrInf(const std::string& op_type,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const std::string& var_name,
const platform::Place& place);

@@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
const platform::Place& place);

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place);

template <typename VarType>
@@ -55,7 +55,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,

#ifdef PADDLE_WITH_ASCEND_CL
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place);
#endif

15 changes: 7 additions & 8 deletions paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -407,7 +407,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
}

void CheckVarHasNanOrInf(const std::string& op_type,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const std::string& var_name,
const platform::Place& place) {
auto* var = scope.FindVar(var_name);
@@ -440,7 +440,7 @@ static framework::Tensor& npu_float_status() {
}

void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;

@@ -505,7 +505,7 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
}

void PrintNPUOpValueInfo(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
<< "), here we print some tensor value info of this op.";
@@ -523,7 +523,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
}

static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& scope,
const framework::ScopeBase& scope,
const platform::Place& place) {
if (!platform::is_npu_place(place)) return;

@@ -551,14 +551,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,

if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);

PADDLE_ENFORCE_LT(
sum, 1.0, platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.DebugStringEx(&scope)));
PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet(
"Operator %s contains Nan/Inf.", op.Type()));
}
#endif

void CheckOpHasNanOrInf(const framework::OperatorBase& op,
const framework::Scope& exec_scope,
const framework::ScopeBase& exec_scope,
const platform::Place& place) {
std::call_once(white_list_init_flag, InitWhiteListFormEnv);

2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -1,6 +1,6 @@
set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
graph_to_program_pass variable_helper timer monitor nan_inf_utils)

cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS})
18 changes: 14 additions & 4 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -17,12 +17,15 @@

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/platform/profiler.h"

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
"Use inplace in new executor");

DECLARE_bool(check_nan_inf);

constexpr const char* kExceptionCaught = "ExceptionCaught";

namespace paddle {
@@ -80,7 +83,6 @@ paddle::framework::FetchList InterpreterCore::Run(
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto* feed_var = global_scope_->Var(feed_names_[i]);

auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(feed_tensors[i]);
}
@@ -246,10 +248,10 @@ void InterpreterCore::BuildInplace() {
auto outvar = global_scope_->Var(iterout->second[0]);
if (invar && outvar) {
instr.AddInplace(invar, outvar);
VLOG(3) << "inplace " << op_base->Type() << " "
<< global_scope_->VarDesc(iter->second[0])->Name()
VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
<< " " << global_scope_->GetNameById(iter->second[0])
<< " -> "
<< global_scope_->VarDesc(iterout->second[0])->Name()
<< global_scope_->GetNameById(iterout->second[0])
<< std::endl;
}
}
@@ -330,6 +332,14 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
platform::RecordEvent compute_event("Compute");
instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
}

// for debug nan/inf
if (FLAGS_check_nan_inf) {
VLOG(4) << "Check nan/inf";
framework::details::CheckOpHasNanOrInf(
*instr_node.OpBase(), *global_scope_,
instr_node.DeviceContext().GetPlace());
}
}

void InterpreterCore::ExecuteInstructionList(
69 changes: 49 additions & 20 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -471,44 +471,73 @@ struct VariableMetaInfo {
paddle::framework::VarDesc* vardesc_;
};

// TODO(Aurelius84): Consider inherit ScopeBase to unify interface.
class VariableScope {
// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
class VariableScope : public ScopeBase {
public:
Variable* FindVar(const std::string& name) const {
if (!HasVar(name)) {
return nullptr;
auto it = name2id_.find(name);
if (it != name2id_.end()) {
PADDLE_ENFORCE_LT(it->second, var_list_.size(),
platform::errors::NotFound(
"The id(%d) of variable(%s) should not be larger "
"than the size of variable list(%d).",
it->second, name, var_list_.size()));
return var_list_[it->second];
}
auto var_id = VarId(name);
CheckExist(var_id);
return var_list[var_id];
return nullptr;
}

// Get variable id by name, return -1 if not found
int GetIdByName(const std::string& name) const {
auto it = name2id_.find(name);
if (it != name2id_.end()) {
return it->second;
}
return -1;
}

// Get variable name by id, return "" if not found
std::string GetNameById(int id) const {
// NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since
// vec_meta_info_[id] may be nullptr,
// typically when the target variable is not existed in the original program
// desc, but created by interpretercore.
// For example, created and used by d2h_copy or h2d_copy operator.
auto it =
std::find_if(name2id_.begin(), name2id_.end(),
[id](const auto& pair) { return pair.second == id; });
if (it != name2id_.end()) {
return it->first;
}
return "";
}

bool HasVar(const std::string& name) const {
return name2id.find(name) != name2id.end();
return name2id_.find(name) != name2id_.end();
}

int VarId(const std::string& name) const {
CheckExist(name);
return name2id.at(name);
return name2id_.at(name);
}

Variable* Var(int id) const { return var_list.at(id); }
Variable* Var(int id) const { return var_list_.at(id); }

Variable* Var(const std::string& name) const {
return var_list.at(VarId(name));
return var_list_.at(VarId(name));
}

size_t VarSize() const { return var_list.size(); }
size_t VarSize() const { return var_list_.size(); }

void AddVar(const std::string& name, VarDesc* var_desc) { // NOLINT
name2id[name] = VarSize();
name2id_[name] = VarSize();
auto v = new Variable();
if (nullptr == var_desc) {
v->GetMutable<LoDTensor>();
} else {
InitializeVariable(v, var_desc->GetType());
}
var_list.push_back(v);
var_list_.push_back(v);

VariableMetaInfo info;
info.var_ref_count_ = 0;
@@ -517,8 +546,8 @@ class VariableScope {
}

void AddVar(const std::string& name, Variable& var) { // NOLINT
name2id[name] = VarSize();
var_list.push_back(&var);
name2id_[name] = VarSize();
var_list_.push_back(&var);

VariableMetaInfo info;
info.var_ref_count_ = 0;
@@ -540,10 +569,10 @@ }
}

void CheckExist(int id) const {
PADDLE_ENFORCE_LT(id, var_list.size(),
PADDLE_ENFORCE_LT(id, var_list_.size(),
platform::errors::PreconditionNotMet(
"Required var_id < %d, but received var_id = %d.",
var_list.size(), id));
var_list_.size(), id));
}

void CheckExist(const std::string& name) const {
@@ -553,8 +582,8 @@
}

private:
std::vector<Variable*> var_list;
std::map<std::string, int> name2id;
std::vector<Variable*> var_list_;
std::map<std::string, int> name2id_;
std::vector<VariableMetaInfo> vec_meta_info_;
};

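The renamed members above give VariableScope a small, self-describing API. A usage sketch, illustrative only and not part of the commit, assuming the new_executor headers are available:

// Illustrative use of the VariableScope API shown in this diff.
#include <string>
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"

void VariableScopeDemo() {
  paddle::framework::VariableScope scope;
  // A null VarDesc default-initializes the new variable as a LoDTensor.
  scope.AddVar("x", /*var_desc=*/nullptr);

  paddle::framework::Variable* x = scope.FindVar("x");  // nullptr if missing
  int id = scope.GetIdByName("x");                      // -1 if missing
  // GetNameById works even for variables created without a VarDesc,
  // e.g. by the interpreter's d2h/h2d copy ops.
  std::string name = scope.GetNameById(id);             // "" if missing
}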
12 changes: 11 additions & 1 deletion paddle/fluid/framework/scope.h
@@ -39,6 +39,16 @@ class Variable;
namespace paddle {
namespace framework {

// TODO(zhiqiu): add more function in base class
class ScopeBase {
public:
/// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find.
/// Caller doesn't own the returned Variable.
virtual Variable* FindVar(const std::string& name) const = 0;
virtual ~ScopeBase() {}
};

class Scope;

/**
@@ -49,7 +59,7 @@ class Scope;
* One net can run in different scopes and update different variable in the
* scope.
*/
class Scope {
class Scope : public ScopeBase {
public:
Scope() {}
~Scope();
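With Scope now deriving from ScopeBase, helpers such as CheckOpHasNanOrInf can be written once against the base interface and accept either the classic Scope or the new executor's VariableScope. A minimal sketch under that assumption (the helper name is hypothetical):

// Hypothetical helper written against ScopeBase; after this commit both
// Scope and VariableScope satisfy it.
#include <string>
#include "paddle/fluid/framework/scope.h"

bool HasVariable(const paddle::framework::ScopeBase& scope,
                 const std::string& name) {
  // FindVar is the single virtual that ScopeBase currently requires.
  return scope.FindVar(name) != nullptr;
}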
1 change: 1 addition & 0 deletions paddle/fluid/framework/var_type_traits.h
@@ -69,6 +69,7 @@ class BKCLCommunicator;

namespace framework {
class LoDRankTable;
class ScopeBase;
class LoDTensor;
class ReaderHolder;
class Scope;
@@ -256,10 +256,12 @@ def build_program(self):
main_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(main_program, startup_program):
w = paddle.rand([10, 20])
w = paddle.rand([10, 3])
ids = paddle.static.data(name="id", shape=[5], dtype='int64')
data = paddle.static.data(name="data", shape=[3], dtype='float32')
emb = paddle.nn.functional.embedding(
x=ids, weight=w, sparse=False, name="embedding")
emb = emb + data

return main_program, startup_program, emb

@@ -273,7 +275,7 @@ def _run(self, feeds):

for feed in feeds:
out = exe.run(main_program, feed=feed, fetch_list=fetch_vars)

print(out)
return out

def run_new_executor(self, feed):
@@ -284,12 +286,27 @@ def test_exception(self):

def test_exception(self):
feed = [{
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64)
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
'data': np.array([1, 2, 3, 4]).astype(np.float32),
}, {
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64)
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64),
'data': np.array([1, 2, 3, 4]).astype(np.float32),
}]
self.assertRaises(ValueError, self.run_new_executor, feed)

def test_nan(self):
flags = {'FLAGS_check_nan_inf': True}
paddle.fluid.set_flags(flags)
feed = [{
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
'data': np.array([1, 2, 3]).astype(np.float32),
}, {
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
'data': np.array([1, 2, 3]).astype(np.float32),
}]
feed[1]['data'][0] = np.nan
self.assertRaises(RuntimeError, self.run_new_executor, feed)


if __name__ == "__main__":
unittest.main()
