From 202b2f1fa71b33b5165e166ecdde0163a9799bdb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 10 Dec 2018 17:27:20 +0800 Subject: [PATCH 001/103] Move the beta pow scale calculation into Adam Op --- paddle/fluid/framework/ir/graph.cc | 98 ++++++++++----------- paddle/fluid/operators/optimizers/adam_op.h | 17 ++++ python/paddle/fluid/optimizer.py | 43 ++++----- 3 files changed, 88 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bbaecf..dfa310a386372 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -28,55 +28,55 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." 
- << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } +// std::map visit; +// for (OpDesc *op : program.Block(0).AllOps()) { +// // For backward compatibility, some program doesn't have role added. +// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; +// int role_id = +// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); +// visit[role_id] = true; +// switch (role_id) { +// case _INT(OpRole::kForward): +// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { +// LOG(ERROR) +// << "Cannot add backward operator before forward operator %s." 
+// << op->Type(); +// } +// break; +// case _INT(OpRole::kBackward): +// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add backward operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | +// _INT(OpRole::kLoss)) == visit.end(), +// "Cannot add backward|loss operator before " +// "forward|loss operator %s.", +// op->Type()); +// PADDLE_ENFORCE( +// visit.find(_INT(OpRole::kOptimize)) == visit.end(), +// "Cannot add forward|loss operator %s after optimize operator.", +// op->Type()); +// break; +// case _INT(OpRole::kOptimize): +// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): +// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), +// "Optimize operators %s must follow backward operator.", +// op->Type()); +// break; +// case _INT(OpRole::kLRSched): +// case _INT(OpRole::kDist): +// case _INT(OpRole::kRPC): +// case _INT(OpRole::kNotSpecified): +// break; +// default: +// LOG(FATAL) << "Unknown operator role. 
Don't add new role because " +// "you don't know what you are doing."; +// } +// } #undef _INT } diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54e8e..2205f473f2318 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -292,6 +292,23 @@ class AdamOpKernel : public framework::OpKernel { static_cast(ctx.device_context()), param.numel()); for_range(functor); + + auto& dev = + *ctx.template device_context().eigen_device(); + + const LoDTensor* beta1_pow_ptr = ctx.Input("Beta1Pow"); + auto eigen_in_beta1_pow = + framework::EigenVector::Flatten(*beta1_pow_ptr); + auto eigen_out_beta1_pow = framework::EigenVector::Flatten( + *(const_cast(beta1_pow_ptr))); + eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow; + + const LoDTensor* beta2_pow_ptr = ctx.Input("Beta2Pow"); + auto eigen_in_beta2_pow = + framework::EigenVector::Flatten(*beta2_pow_ptr); + auto eigen_out_beta2_pow = framework::EigenVector::Flatten( + *(const_cast(beta2_pow_ptr))); + eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow; } } else if (grad_var->IsType()) { auto& grad = diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index da92826d41050..1930ac106b284 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. 
code-block:: python @@ -739,26 +739,27 @@ def _finish_update(self, block, param_and_grads): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param, grad in param_and_grads: - if grad is None: - continue - with param.block.program._optimized_guard( - [param, grad]), name_scope("optimizer"): - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param) - main_block.append_op( - type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}) - - main_block.append_op( - type="scale", - inputs={"X": beta2_pow_acc}, - outputs={"Out": beta2_pow_acc}, - attrs={"scale": self._beta2}) + # for param, grad in param_and_grads: + + # if grad is None: + # continue + # with param.block.program._optimized_guard( + # [param, grad]), name_scope("optimizer"): + # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + # param) + # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + # param) + # main_block.append_op( + # type="scale", + # inputs={"X": beta1_pow_acc}, + # outputs={"Out": beta1_pow_acc}, + # attrs={"scale": self._beta1}) + + # main_block.append_op( + # type="scale", + # inputs={"X": beta2_pow_acc}, + # outputs={"Out": beta2_pow_acc}, + # attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): From 570338699b2038b802e9d49ea80efc916416477a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 11 Dec 2018 18:29:16 +0800 Subject: [PATCH 002/103] Add debug info --- .../details/computation_op_handle.cc | 45 ++++- .../fast_threaded_ssa_graph_executor.cc | 1 + .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/operator.cc | 160 +++++++++++------- paddle/fluid/framework/scope.cc | 37 ++-- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 ++++----- python/paddle/fluid/profiler.py | 3 +- 8 files 
changed, 239 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c600c6..900303343838c 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_(scope), place_(place) {} +struct RecordTime { + RecordTime(const std::string &name, const std::string &type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + ~RecordTime() { + if (type_ == "elementsize_add") { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + { + RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); + WaitInputVarGenerated(place_); + } + + Scope *scope = nullptr; + { + RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); + scope = scope_->FindVar(kLocalExecScopeName)->Get(); + } + + { + RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); - auto run_func = [this]() { - op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); - }; + auto run_func = [this, scope]() { op_->Run(*scope, place_); }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } } } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 949510e03705a..872bc5d654cd6 100644 --- 
a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_.get(), &fetch_ops); return fetches; } + void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, OpHandleBase *op, diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 4822627ac3b65..5997f12ffabcf 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda) { + if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7ce..b8adce4edf19c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } +struct RecordTime { + RecordTime(const std::string& name, const std::string& type) + : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} + + void inline stop() { + end_ = std::chrono::system_clock::now(); + std::chrono::duration diff = end_ - start_; + VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); + } + + ~RecordTime() { + if (type_ == "elementwise_add") { + stop(); + } + // stop(); + } + + std::string name_; + std::string type_; + std::chrono::system_clock::time_point start_; + std::chrono::system_clock::time_point end_; +}; + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext 
infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. - auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); + RecordTime rt("OperatorWithKernel::All", type_); + { + RecordTime rt("OperatorWithKernel::InferShape", type_); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); } - OpKernelMap& kernels = kernels_iter->second; + { + RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + type_); + } - // for (auto& candidate : kKernelPriority) { - // Do selection - // } + OpKernelMap& kernels = kernels_iter->second; - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. 
- auto kernel_iter = kernels.find(expected_kernel_key); + // for (auto& candidate : kKernelPriority) { + // Do selection + // } + + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + Scope* transfer_scope = nullptr; + // auto* transfer_scope = + // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. 
- const Scope& exec_scope = - (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = scope; + // const Scope& exec_scope = + // (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } + delete rt_1; - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + delete rt_2; - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. - TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. 
+ TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get().value()); + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, + var->Get().value()); + } } } + delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7ccc32..61416676d631f 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -43,9 +43,16 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. 
#ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +// TODO(minqiyang): use reader lock and writer lock in all platforms +#define SCOPE_READER_LOCK +#define SCOPE_WRITER_LOCK +// #define SCOPE_READER_LOCK boost::shared_lock +// lock(mutex_); +// #define SCOPE_WRITER_LOCK boost::unique_lock +// lock(mutex_); #endif namespace paddle { @@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD + SCOPE_READER_LOCK std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -118,7 +125,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* 
scope) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b156f3..181baac870a6b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE( - ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s [%s]", - ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); - - if 
(ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); + if (!ctx->IsRuntime()) { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the " + "received is %s [%s]", + ctx->GetInputsVarType("Y").front(), + ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } } ctx->ShareDim("X", /*->*/ "Out"); @@ -125,7 +128,7 @@ The equation is: $$%s$$ -- $X$: a tensor of any dimension. +- $X$: a tensor of any dimension. 
- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -135,10 +138,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -152,7 +155,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. 
)DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 5710cda39acce..bc1b20321f1fe 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment1"), - "Input(Moment1) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment2"), - "Input(Moment2) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - "Input(Beta1Pow) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - "Input(Beta2Pow) of AdamOp should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - "Output(Moment1Out) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - "Output(Moment2Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Param"), + // "Input(Param) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Grad"), + // "Input(Grad) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment1"), + // "Input(Moment1) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Moment2"), + // "Input(Moment2) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + // "Input(LearningRate) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + // "Input(Beta1Pow) of AdamOp 
should not be null."); + // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + // "Input(Beta2Pow) of AdamOp should not be null."); + + // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + // "Output(ParamOut) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + // "Output(Moment1Out) of AdamOp should not be null."); + // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + // "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + // "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - "Beta1 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + // "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - "Beta2 power accumulator should have 1 dimension"); + // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + // "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); - } - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment1"), - "Param and Moment1 input of AdamOp should have same dimension"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment2"), - "Param and Moment2 input of AdamOp should have same dimension"); + // if (ctx->GetInputsVarType("Grad")[0] == + // framework::proto::VarType::LOD_TENSOR) { + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Grad"), + // "Param and Grad input of AdamOp should have same 
dimension"); + // } + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment1"), + // "Param and Moment1 input of AdamOp should have same dimension"); + // PADDLE_ENFORCE_EQ( + // param_dims, ctx->GetInputDim("Moment2"), + // "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index e05885f5f5bfc..8df2e01b03749 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - core.nvprof_init(output_file, output_mode, config_file) + #Comment this for nvprof + #core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. 
core.nvprof_start() yield From 7a43e5170325f3a78e026bb4d7039e0c25be8686 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:16:26 +0800 Subject: [PATCH 003/103] Add gperf tools --- CMakeLists.txt | 6 ++++ cmake/generic.cmake | 16 +++++++++++ paddle/fluid/framework/parallel_executor.cc | 31 ++++++++++++++++++++- python/paddle/fluid/__init__.py | 3 +- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index efa68c9ba243a..3e59aca2d9394 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,12 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 312fbaa0b3d83..a8b9dcfcf5eec 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + # find all third_party modules is used for paddle static library # for reduce the dependency when building the inference libs. 
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) @@ -274,6 +282,7 @@ function(cc_library TARGET_NAME) endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + common_link(${TARGET_NAME}) endif() # cpplint code style @@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME) if(cc_binary_DEPS) target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endfunction(cc_binary) @@ -362,6 +372,7 @@ function(cc_test TARGET_NAME) target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) @@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(nv_binary) @@ -433,6 +445,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) @@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + common_link(${TARGET_NAME}) endif() endif() endfunction(hip_binary) @@ -518,6 +532,7 @@ function(hip_test TARGET_NAME) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) target_link_libraries(${TARGET_NAME} 
${hip_test_DEPS} paddle_gtest_main memory gtest gflags) add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(hip_test) @@ -560,6 +575,7 @@ function(go_library TARGET_NAME) endif() if(go_library_DEPS) add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + common_link(${TARGET_NAME}) endif(go_library_DEPS) # The "source file" of the library is `${dummyfile}` which never diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b98408ee77267..28a4b14b27bda 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -30,13 +30,36 @@ limitations under the License. */ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +DEFINE_string(pe_profile_fname, "", + "Profiler filename for PE, which generated by gperftools." + "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); + namespace paddle { namespace framework { +static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gProfileStarted = false; +#endif class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places) {} + : places_(places) { + if (!FLAGS_pe_profile_fname.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_pe_profile_fname.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. 
" + "FLAGS_pe_profile_fname will be ignored"; +#endif + }); + } + } ~ParallelExecutorPrivate() { if (own_local_scope_) { @@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + platform::RecordBlock b(0); #ifdef PADDLE_WITH_CUDA if (!gcs_.empty()) { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a53519188e74..4cf0784d8176c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -125,7 +125,8 @@ def __bootstrap__(): 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir' + 'reader_queue_speed_test_mode', 'print_sub_graph_dir', + 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') From b75bd29c3ae74b5d48d573916eebab6473b3c30f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 16:51:01 +0800 Subject: [PATCH 004/103] Remove debug info --- .../details/computation_op_handle.cc | 45 +---- .../fluid/framework/details/op_handle_base.cc | 2 +- paddle/fluid/framework/ir/graph.cc | 132 +++++++++------ paddle/fluid/framework/operator.cc | 160 +++++++----------- .../operators/elementwise/elementwise_op.h | 69 ++++---- paddle/fluid/operators/optimizers/adam_op.cc | 79 +++++---- 6 files changed, 224 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 900303343838c..7ad1e40c600c6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -26,46 +26,17 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope 
*scope, scope_(scope), place_(place) {} -struct RecordTime { - RecordTime(const std::string &name, const std::string &type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - ~RecordTime() { - if (type_ == "elementsize_add") { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void ComputationOpHandle::RunImpl() { - { - RecordTime rt("ComputationOpHandle::RunImpl", "Wait"); - WaitInputVarGenerated(place_); - } - - Scope *scope = nullptr; - { - RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope"); - scope = scope_->FindVar(kLocalExecScopeName)->Get(); - } - - { - RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type()); + WaitInputVarGenerated(place_); - auto run_func = [this, scope]() { op_->Run(*scope, place_); }; + auto run_func = [this]() { + op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get(), place_); + }; - if (is_lock_and_record_event_free_) { - run_func(); - } else { - this->RunAndRecordEvent(run_func); - } + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); } } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 5997f12ffabcf..4822627ac3b65 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda && !dev_ctxes_.empty()) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); diff --git a/paddle/fluid/framework/ir/graph.cc 
b/paddle/fluid/framework/ir/graph.cc index dfa310a386372..9ebf13669861c 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,6 +20,10 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" +DEFINE_bool(enforce_when_check_program, true, + "Checking whether the program is correct or not. We will log " + "errors rather than throwing exceptions if this flag turned off"); + namespace paddle { namespace framework { namespace ir { @@ -28,55 +32,85 @@ namespace { void CheckProgram(const ProgramDesc &program) { #define _INT(role) static_cast(role) -// std::map visit; -// for (OpDesc *op : program.Block(0).AllOps()) { -// // For backward compatibility, some program doesn't have role added. -// if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; -// int role_id = -// boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); -// visit[role_id] = true; -// switch (role_id) { -// case _INT(OpRole::kForward): -// if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { -// LOG(ERROR) -// << "Cannot add backward operator before forward operator %s." 
-// << op->Type(); -// } -// break; -// case _INT(OpRole::kBackward): -// case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add backward operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kForward) | _INT(OpRole::kLoss): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | -// _INT(OpRole::kLoss)) == visit.end(), -// "Cannot add backward|loss operator before " -// "forward|loss operator %s.", -// op->Type()); -// PADDLE_ENFORCE( -// visit.find(_INT(OpRole::kOptimize)) == visit.end(), -// "Cannot add forward|loss operator %s after optimize operator.", -// op->Type()); -// break; -// case _INT(OpRole::kOptimize): -// case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): -// PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), -// "Optimize operators %s must follow backward operator.", -// op->Type()); -// break; -// case _INT(OpRole::kLRSched): -// case _INT(OpRole::kDist): -// case _INT(OpRole::kRPC): -// case _INT(OpRole::kNotSpecified): -// break; -// default: -// LOG(FATAL) << "Unknown operator role. Don't add new role because " -// "you don't know what you are doing."; -// } -// } + std::map visit; + for (OpDesc *op : program.Block(0).AllOps()) { + // For backward compatibility, some program doesn't have role added. + if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; + int role_id = + boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + visit[role_id] = true; + switch (role_id) { + case _INT(OpRole::kForward): + if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator before forward operator %s." 
+ << op->Type(); + } + break; + case _INT(OpRole::kBackward): + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) + << "Cannot add backward operator %s after optimize operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kForward) | _INT(OpRole::kLoss): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator %s.", + op->Type()); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add forward|loss operator %s after optimize operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != + visit.end()) { + LOG(ERROR) << "Cannot add backward|loss operator before " + << "forward|loss operator %s." << op->Type(); + } + + if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { + LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " + "operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kOptimize): + case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): + if (!FLAGS_enforce_when_check_program) { + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators %s must follow backward operator.", + op->Type()); + } else { + if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { + LOG(ERROR) + << "Optimize operators %s must follow backward operator.", + << op->Type(); + } + } + break; + case _INT(OpRole::kLRSched): + case _INT(OpRole::kDist): + case _INT(OpRole::kRPC): + case _INT(OpRole::kNotSpecified): + break; + default: + LOG(FATAL) << "Unknown operator role. 
Don't add new role because " + "you don't know what you are doing."; + } + } #undef _INT } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b8adce4edf19c..c6f3254e9f7ce 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -701,125 +701,85 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope, this->InferShape(&infer_shape_ctx); } -struct RecordTime { - RecordTime(const std::string& name, const std::string& type) - : name_(name), type_(type), start_(std::chrono::system_clock::now()) {} - - void inline stop() { - end_ = std::chrono::system_clock::now(); - std::chrono::duration diff = end_ - start_; - VLOG(1) << name_ << " " << type_ << " time record: " << diff.count(); - } - - ~RecordTime() { - if (type_ == "elementwise_add") { - stop(); - } - // stop(); - } - - std::string name_; - std::string type_; - std::chrono::system_clock::time_point start_; - std::chrono::system_clock::time_point end_; -}; - void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RecordTime rt("OperatorWithKernel::All", type_); - { - RecordTime rt("OperatorWithKernel::InferShape", type_); - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); - } - - { - RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); + RuntimeInferShapeContext infer_shape_ctx(*this, scope); + this->InferShape(&infer_shape_ctx); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); - // check if op[type] has kernel registered. 
- auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", - type_); - } + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } - OpKernelMap& kernels = kernels_iter->second; + OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. + // TODO(dzhwinter) : kernel fallback mechanism will be added when all the + // transform functions are ready. - // for (auto& candidate : kKernelPriority) { - // Do selection - // } + // for (auto& candidate : kKernelPriority) { + // Do selection + // } - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + auto expected_kernel_key = + this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - auto kernel_iter = kernels.find(expected_kernel_key); + auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + 
expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } - // do data transformScope &transfer_scope; - std::vector transfered_inplace_vars; - Scope* transfer_scope = nullptr; - // auto* transfer_scope = - // TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); - // exec scope is the scope that kernel actually executed on. - const Scope& exec_scope = scope; - // const Scope& exec_scope = - // (transfer_scope == nullptr ? scope : *transfer_scope); + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? scope : *transfer_scope); - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - delete rt_1; + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } - RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_); - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - delete rt_2; + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); - RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_); - if (!transfered_inplace_vars.empty()) { - // there is inplace variable has been transfered. 
- TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); - } + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } - /*For profiling/benchmark only*/ - if (FLAGS_benchmark) { - dev_ctx->Wait(); - } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } - if (FLAGS_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(vname, - var->Get().value()); - } + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(vname, var->Get().value()); } } - delete rt_3; } } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 181baac870a6b..87bf7c6b156f3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -33,37 +33,34 @@ class ElementwiseOp : public framework::OperatorWithKernel { using Tensor = framework::Tensor; void InferShape(framework::InferShapeContext *ctx) const override { - if (!ctx->IsRuntime()) { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), - "Input(Y) of elementwise op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of elementwise op should not be null."); - - PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the " - 
"received is %s [%s]", - ctx->GetInputsVarType("Y").front(), - ctx->Inputs("Y").front()); - - if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR) { - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); - } else if (ctx->GetInputsVarType("X").front() == - framework::proto::VarType::SELECTED_ROWS) { - PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && - (ctx->GetInputDim("Y")[0] == 1), - "For elementwise_op, if X is Sparse, " - "Y must be scalar."); - } else { - PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - ctx->GetInputsVarType("X").front()); - } + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of elementwise op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of elementwise op should not be null."); + + PADDLE_ENFORCE( + ctx->GetInputsVarType("Y").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); } ctx->ShareDim("X", /*->*/ "Out"); @@ -128,7 +125,7 @@ The equation is: $$%s$$ -- $X$: a tensor of any 
dimension. +- $X$: a tensor of any dimension. - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. There are two cases for this operator: @@ -138,10 +135,10 @@ There are two cases for this operator: For case 2: -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape(Y) = (2, 1) => (2). For example: @@ -155,7 +152,7 @@ For example: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 -The inputs $X$ and $Y$ can carry the different LoD information. +The inputs $X$ and $Y$ can carry the different LoD information. But the output only shares the LoD information with the input $X$. 
)DOC", diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index bc1b20321f1fe..5710cda39acce 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -23,57 +23,56 @@ class AdamOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - // PADDLE_ENFORCE(ctx->HasInput("Param"), - // "Input(Param) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Grad"), - // "Input(Grad) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment1"), - // "Input(Moment1) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Moment2"), - // "Input(Moment2) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - // "Input(LearningRate) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - // "Input(Beta1Pow) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - // "Input(Beta2Pow) of AdamOp should not be null."); - - // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - // "Output(ParamOut) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - // "Output(Moment1Out) of AdamOp should not be null."); - // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - // "Output(Moment2Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + 
"Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); - // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - // "Learning rate should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - // "Beta1 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - // "Beta2 power accumulator should have 1 dimension"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); auto param_dims = ctx->GetInputDim("Param"); - // if (ctx->GetInputsVarType("Grad")[0] == - // framework::proto::VarType::LOD_TENSOR) { - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Grad"), - // "Param and Grad input of AdamOp should have same dimension"); - // } - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment1"), - // "Param and Moment1 input of AdamOp should have same dimension"); - // PADDLE_ENFORCE_EQ( - // param_dims, ctx->GetInputDim("Moment2"), - // "Param and Moment2 input of AdamOp should have same dimension"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of 
AdamOp should have same dimension"); + } + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("Moment1Out", param_dims); ctx->SetOutputDim("Moment2Out", param_dims); } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = From 1b61021cb36eae45e142a953c2c96cf46853aa7c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 17:02:24 +0800 Subject: [PATCH 005/103] Polish code --- paddle/fluid/framework/ir/graph.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 9ebf13669861c..db74d5674a4dc 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -57,7 +57,7 @@ void CheckProgram(const ProgramDesc &program) { } else { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) - << "Cannot add backward operator %s after optimize operator.", + << "Cannot add backward operator %s after optimize operator." << op->Type(); } } @@ -82,8 +82,8 @@ void CheckProgram(const ProgramDesc &program) { if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator.", - << op->Type(); + "operator." + << op->Type(); } } break; @@ -95,9 +95,8 @@ void CheckProgram(const ProgramDesc &program) { op->Type()); } else { if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) - << "Optimize operators %s must follow backward operator.", - << op->Type(); + LOG(ERROR) << "Optimize operators %s must follow backward operator." 
+ << op->Type(); } } break; From a61eb543f5796d9899bff073e5f6647bc1003d71 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 12 Dec 2018 19:18:45 +0800 Subject: [PATCH 006/103] Add RWLock to Scope --- paddle/fluid/framework/rw_lock.h | 16 ++++++++++++---- paddle/fluid/framework/scope.cc | 11 ++++------- paddle/fluid/framework/scope.h | 4 ++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dbf00f3a79f7d..dd918fcdfa667 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,7 +16,9 @@ limitations under the License. */ #if !defined(_WIN32) #include -#endif // !_WIN32 +#else +#include // NOLINT +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -51,9 +53,15 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { - void RDLock() {} - void WRLock() {} - void UNLock() {} + // FIXME(minqiyang): use mutex here to do fake lock + void RDLock() { mutex_.lock(); } + + void WRLock() { mutex_.lock(); } + + void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; }; #endif diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 61416676d631f..190a057d9e4ba 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -46,13 +46,10 @@ DEFINE_double( #define SCOPE_READER_LOCK #define SCOPE_WRITER_LOCK #else -// TODO(minqiyang): use reader lock and writer lock in all platforms -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK -// #define SCOPE_READER_LOCK boost::shared_lock -// lock(mutex_); -// #define SCOPE_WRITER_LOCK boost::unique_lock -// lock(mutex_); +// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one +// in _WIN32 platform +#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); 
+#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); #endif namespace paddle { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57e0d..c140212c3e44e 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -15,11 +15,11 @@ limitations under the License. */ #pragma once #include -#include // NOLINT #include #include #include +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -123,7 +123,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable std::mutex mutex_; + mutable RWLock rw_lock_; }; // Generate some debug string about the inherience structure of scope, quite From ad6ae0b071041c1f69c66c7c173733bfe7cb2752 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:39:46 +0800 Subject: [PATCH 007/103] 1. Add SpinLock 2. Seperate the lock of kids and vars in Scope test=develop --- CMakeLists.txt | 1 + cmake/external/robin_map.cmake | 31 +++++++ .../framework/details/execution_strategy.h | 2 +- .../scope_buffered_ssa_graph_executor.cc | 9 +- paddle/fluid/framework/operator.cc | 6 +- paddle/fluid/framework/rw_lock.h | 91 +++++-------------- paddle/fluid/framework/scope.cc | 58 ++++++------ paddle/fluid/framework/scope.h | 15 ++- paddle/fluid/framework/spin_lock.h | 71 +++++++++++++++ paddle/fluid/operators/optimizers/adam_op.h | 17 ---- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/optimizer.py | 43 +++++---- 12 files changed, 201 insertions(+), 145 deletions(-) create mode 100644 cmake/external/robin_map.cmake create mode 100644 paddle/fluid/framework/spin_lock.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e59aca2d9394..2abbcef41a9d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,6 +215,7 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # 
download snappystream +include(external/robin_map) # download tsl::robin_map if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake new file mode 100644 index 0000000000000..ddaf59536cb2c --- /dev/null +++ b/cmake/external/robin_map.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) +set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) + +include_directories(${ROBIN_MAP_INCLUDE_DIR}) + +ExternalProject_Add( + extern_robin_map + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "/~https://github.com/Tessil/robin-map.git" + GIT_TAG "v0.5.0" + PREFIX ${ROBIN_MAP_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(robin_map STATIC ${dummyfile}) +else() + add_library(robin_map INTERFACE) +endif() + +add_dependencies(robin_map extern_robin_map) + +LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c2b6..37b07e5736312 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a9856bb..9ded0266a9b11 100644 --- 
a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( : nullptr; #endif - if (!fetch_tensors.empty() || - drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; + if (!fetch_tensors.empty()) { // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); @@ -91,12 +89,17 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } #endif } + } + + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } + if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c6f3254e9f7ce..58e5926f5445c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } bool OperatorBase::HasInputs(const std::string& name) const { - if (inputs_.find(name) != inputs_.end()) { - return true; - } else { - return false; - } + return inputs_.find(name) != inputs_.end(); } std::string OperatorBase::Input(const std::string& name) const { diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index dd918fcdfa667..75e6bef9bf33d 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -31,17 +31,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - void RDLock() { + inline void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - void WRLock() { + inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); 
} - void UNLock() { + inline void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -54,86 +54,43 @@ struct RWLock { // In windows, rw_lock seems like a hack. Use empty object and do nothing. struct RWLock { // FIXME(minqiyang): use mutex here to do fake lock - void RDLock() { mutex_.lock(); } + inline void RDLock() { mutex_.lock(); } - void WRLock() { mutex_.lock(); } + inline void WRLock() { mutex_.lock(); } - void UNLock() { mutex_.unlock(); } + inline void UNLock() { mutex_.unlock(); } private: std::mutex mutex_; }; #endif -class RWLockGuard { +class AutoWRLock { public: - enum Status { kUnLock, kWRLock, kRDLock }; - - RWLockGuard(RWLock* rw_lock, Status init_status) - : lock_(rw_lock), status_(Status::kUnLock) { - switch (init_status) { - case Status::kRDLock: { - RDLock(); - break; - } - case Status::kWRLock: { - WRLock(); - break; - } - case Status::kUnLock: { - break; - } - } - } + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - void WRLock() { - switch (status_) { - case Status::kUnLock: { - lock_->WRLock(); - status_ = Status::kWRLock; - break; - } - case Status::kWRLock: { - break; - } - case Status::kRDLock: { - PADDLE_THROW( - "Please unlock read lock first before invoking write lock."); - break; - } - } - } + inline void Lock() { lock_->WRLock(); } - void RDLock() { - switch (status_) { - case Status::kUnLock: { - lock_->RDLock(); - status_ = Status::kRDLock; - break; - } - case Status::kRDLock: { - break; - } - case Status::kWRLock: { - PADDLE_THROW( - "Please unlock write lock first before invoking read lock."); - break; - } - } - } + inline void UnLock() { lock_->UNLock(); } - void UnLock() { - if (status_ != Status::kUnLock) { - lock_->UNLock(); - status_ = Status::kUnLock; - } - } + ~AutoWRLock() { UnLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + inline void Lock() { lock_->RDLock(); } + + 
inline void UnLock() { lock_->UNLock(); } - ~RWLockGuard() { UnLock(); } + ~AutoRDLock() { UnLock(); } private: RWLock* lock_; - Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 190a057d9e4ba..f05208c5ec98b 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include #include "glog/logging.h" -#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -43,13 +42,15 @@ DEFINE_double( // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_READER_LOCK -#define SCOPE_WRITER_LOCK +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one -// in _WIN32 platform -#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock); -#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock); +#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); #endif namespace paddle { @@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_WRITER_LOCK - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, 
vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_READER_LOCK + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_READER_LOCK + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_READER_LOCK std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_WRITER_LOCK + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_WRITER_LOCK std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK 
RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_WRITER_LOCK + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; @@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it->second.release()); + vars_[new_name].reset(origin_it.value().release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index c140212c3e44e..78ad8be500a70 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,11 +14,15 @@ limitations under the License. */ #pragma once +#include #include +#include #include -#include +#include #include +#include // NOLINT + #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -94,7 +98,11 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map> vars_; + mutable tsl::robin_map< + std::string, std::unique_ptr, std::hash, + std::equal_to, + std::allocator>>, true> + vars_; private: // Call Scope::NewScope for a sub-scope. @@ -123,7 +131,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock rw_lock_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h new file mode 100644 index 0000000000000..11a763d655abb --- /dev/null +++ b/paddle/fluid/framework/spin_lock.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if !defined(_WIN32) +#include +#else +#include // NOLINT +#endif // !_WIN32 + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +#if !defined(_WIN32) +struct SpinLock { + SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); } + + ~SpinLock() { pthread_spin_destroy(&lock_); } + + void Lock() { + PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed"); + } + + void Unlock() { + PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0, + "release spin lock failed"); + } + + private: + pthread_spinlock_t lock_; +}; +#else +// FIXME(minqiyang): use mutex here to do fake spin lock +struct SpinLock { + void Lock() { mutex_.lock(); } + + void Unlock() { mutex_.lock(); } + + private: + std::mutex mutex_; +}; +#endif + +class AutoSpinLock { + public: + explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) { + lock_->Lock(); + } + + ~SpinLockGuard() { lock_->Unlock(); } + + private: + SpinLock* lock_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 2205f473f2318..3455d1ee54e8e 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel { static_cast(ctx.device_context()), param.numel()); for_range(functor); - - auto& dev = - 
*ctx.template device_context().eigen_device(); - - const LoDTensor* beta1_pow_ptr = ctx.Input("Beta1Pow"); - auto eigen_in_beta1_pow = - framework::EigenVector::Flatten(*beta1_pow_ptr); - auto eigen_out_beta1_pow = framework::EigenVector::Flatten( - *(const_cast(beta1_pow_ptr))); - eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow; - - const LoDTensor* beta2_pow_ptr = ctx.Input("Beta2Pow"); - auto eigen_in_beta2_pow = - framework::EigenVector::Flatten(*beta2_pow_ptr); - auto eigen_out_beta2_pow = framework::EigenVector::Flatten( - *(const_cast(beta2_pow_ptr))); - eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow; } } else if (grad_var->IsType()) { auto& grad = diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58ef3da0b2312..f831f2313e4ba 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 100. + because the temp variable's shape maybe the same between two iterations. Default 1. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 1930ac106b284..da92826d41050 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - + Examples: .. 
code-block:: python @@ -739,27 +739,26 @@ def _finish_update(self, block, param_and_grads): """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - # for param, grad in param_and_grads: - - # if grad is None: - # continue - # with param.block.program._optimized_guard( - # [param, grad]), name_scope("optimizer"): - # beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - # param) - # beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - # param) - # main_block.append_op( - # type="scale", - # inputs={"X": beta1_pow_acc}, - # outputs={"Out": beta1_pow_acc}, - # attrs={"scale": self._beta1}) - - # main_block.append_op( - # type="scale", - # inputs={"X": beta2_pow_acc}, - # outputs={"Out": beta2_pow_acc}, - # attrs={"scale": self._beta2}) + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program._optimized_guard( + [param, grad]), name_scope("optimizer"): + beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, + param) + beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, + param) + main_block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}) + + main_block.append_op( + type="scale", + inputs={"X": beta2_pow_acc}, + outputs={"Out": beta2_pow_acc}, + attrs={"scale": self._beta2}) class AdamaxOptimizer(Optimizer): From a81495d6f4a71980b51cc3099f8cd76885cdcb13 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:45:20 +0800 Subject: [PATCH 008/103] Fix code --- paddle/fluid/framework/scope.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index f05208c5ec98b..d2856a07a167c 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(benchmark, false, @@ -47,10 +48,10 @@ DEFINE_double( #define SCOPE_VARS_READER_LOCK #define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { From 19a798018f82b9eaa31aa8d84f8aa4306bbf8973 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 13 Dec 2018 18:51:28 +0800 Subject: [PATCH 009/103] Remove dup cmake test=develop --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bf724e8aa9843..1b2e0ecf6c59b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,12 +81,6 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) -if (WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) -endif() - # PY_VERSION if(NOT PY_VERSION) set(PY_VERSION 2.7) From 728e7e88fb2c3467f6e28ef968b4e720d290b26c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 13:37:57 +0800 Subject: [PATCH 010/103] Use xxHash as scope's hash algorithm test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/scope.cc | 2 +- paddle/fluid/framework/scope.h | 26 ++++++++++++++++++++------ python/paddle/fluid/profiler.py 
| 2 +- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cea4a448574cf..5dca5ac59888c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -82,7 +82,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index b1abe75d7658c..4f79d9826099e 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -201,7 +201,7 @@ void Scope::RenameInternal(const std::string& origin_name, auto new_it = vars_.find(new_name); PADDLE_ENFORCE(new_it == vars_.end(), "The variable with name %s is already in the scope", new_name); - vars_[new_name].reset(origin_it.value().release()); + vars_[new_name].reset(origin_it->second.release()); vars_.erase(origin_it); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index b232d267dbcca..77ef18414d09c 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,15 +14,18 @@ limitations under the License. 
*/ #pragma once +extern "C" { +#include +} + #include #include #include #include +#include #include #include -#include // NOLINT - #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -35,6 +38,14 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; +namespace inner { +struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } +}; +} // namespace inner + /** * @brief Scope that manage all variables. * @@ -99,11 +110,14 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable tsl::robin_map< - std::string, std::unique_ptr, std::hash, - std::equal_to, - std::allocator>>, true> + mutable std::unordered_map, + inner::KeyHasher> vars_; + // mutable tsl::robin_map< + // std::string, std::unique_ptr, std::hash, + // std::equal_to, + // std::allocator>>, true> + // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 8df2e01b03749..78f7a6ac085f6 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -93,7 +93,7 @@ def cuda_profiler(output_file, output_mode=None, config=None): with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) #Comment this for nvprof - #core.nvprof_init(output_file, output_mode, config_file) + core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. 
core.nvprof_start() yield From aa41ee75a16509cb16793d7fdbbbfa3ce2dab69f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:13:26 +0800 Subject: [PATCH 011/103] Accelerate PADDLE_ENFORCE --- paddle/fluid/framework/operator.h | 12 ++++-- paddle/fluid/platform/enforce.h | 68 +++++++++++++++++++------------ 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce01..63a8bc574f300 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -49,6 +49,8 @@ constexpr char kTempVarName[] = "@TEMP@"; /// e.g. Variable "x@GRAD" is the gradient of varibale "x". constexpr char kGradVarSuffix[] = "@GRAD"; +constexpr size_t kGradVarSuffixSize = 5U; + /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; @@ -60,7 +62,11 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; extern std::vector> kKernelPriority; inline std::string GradVarName(const std::string& var_name) { - return var_name + kGradVarSuffix; + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; } proto::VarType::Type GetDataTypeOfVar(const Variable* var); @@ -101,8 +107,8 @@ class OperatorBase { bool HasAttr(const std::string& name) const { return attrs_.count(name); } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", - name); + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); return boost::get(attrs_.at(name)); } const AttributeMap& Attrs() const { return attrs_; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 01ee67fd07f84..3c03a902796af 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -140,68 +140,72 @@ struct EOFException : 
public std::exception { #define LIKELY(condition) (condition) #endif +inline bool is_error(bool stat) { return !stat; } + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { - if (UNLIKELY(!(stat))) { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(string::Sprintf(args...)); + throw std::runtime_error(string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } } #ifdef PADDLE_WITH_CUDA +inline bool is_error(cudaError_t e) { return UNLIKELY(e); } + template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { - if (UNLIKELY(e)) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(e, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { - if (stat != CURAND_STATUS_SUCCESS) { #ifndef REPLACE_ENFORCE_GLOG - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...)); + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cudnnStatus_t stat) { + return stat != CUDNN_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cudnnStatus_t stat, const Args&... 
args) { - if (stat == CUDNN_STATUS_SUCCESS) { - return; - } else { #ifndef REPLACE_ENFORCE_GLOG - throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...)); + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); #else - LOG(FATAL) << string::Sprintf(args...); + LOG(FATAL) << string::Sprintf(args...); #endif - } +} + +inline bool is_error(cublasStatus_t stat) { + return stat != CUBLAS_STATUS_SUCCESS; } template inline typename std::enable_if::type throw_on_error( cublasStatus_t stat, const Args&... args) { std::string err; - if (stat == CUBLAS_STATUS_SUCCESS) { - return; - } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { err = "CUBLAS: not initialized, "; } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { err = "CUBLAS: alloc failed, "; @@ -254,11 +258,21 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) +#define PADDLE_JUDGE + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(cond))) { \ + ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ + } \ + } while (0) + #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(...) \ +#define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -266,7 +280,7 @@ inline void throw_on_error(T e) { } while (false) #else -#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ From 27a0d6c2dc7a1fb26ec3bfc0b44840300685b993 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 17 Dec 2018 17:17:13 +0800 Subject: [PATCH 012/103] Polish code test=develop --- CMakeLists.txt | 1 - cmake/external/robin_map.cmake | 31 ------------------------------- paddle/fluid/framework/scope.h | 5 ----- python/paddle/fluid/profiler.py | 1 - 4 files changed, 38 deletions(-) delete mode 100644 cmake/external/robin_map.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b2e0ecf6c59b..1594e798a2ba3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -209,7 +209,6 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream -include(external/robin_map) # download tsl::robin_map if (NOT WIN32) # there is no official support of warpctc, nccl, cupti in windows diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake deleted file mode 100644 index ddaf59536cb2c..0000000000000 --- a/cmake/external/robin_map.cmake +++ /dev/null @@ -1,31 +0,0 @@ -include(ExternalProject) - -set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) -set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) - -include_directories(${ROBIN_MAP_INCLUDE_DIR}) - -ExternalProject_Add( - extern_robin_map - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "/~https://github.com/Tessil/robin-map.git" - GIT_TAG "v0.5.0" - PREFIX ${ROBIN_MAP_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(robin_map STATIC ${dummyfile}) -else() - add_library(robin_map INTERFACE) -endif() - -add_dependencies(robin_map 
extern_robin_map) - -LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 77ef18414d09c..9a715ac9b9580 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -113,11 +113,6 @@ class Scope { mutable std::unordered_map, inner::KeyHasher> vars_; - // mutable tsl::robin_map< - // std::string, std::unique_ptr, std::hash, - // std::equal_to, - // std::allocator>>, true> - // vars_; private: // Call Scope::NewScope for a sub-scope. diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 78f7a6ac085f6..e05885f5f5bfc 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - #Comment this for nvprof core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() From a3fa3f85d7bd4fb948b0401d77d5c60498e5a329 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:04:26 +0800 Subject: [PATCH 013/103] Polish code test=develop --- paddle/fluid/platform/enforce.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3c03a902796af..d1dd09f2064f1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,12 +260,12 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(cond))) { \ - ::paddle::platform::throw_on_error(cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) 
\ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG From 69642000dc3a83b3dad5a33052da1eff1f450b6d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 18 Dec 2018 15:09:01 +0800 Subject: [PATCH 014/103] Hide KeyHasher test=develop --- paddle/fluid/framework/scope.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9a715ac9b9580..797d110159391 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -38,14 +38,6 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; -namespace inner { -struct KeyHasher { - std::size_t operator()(const std::string& key) const { - return XXH32(key.c_str(), key.size(), 1); - } -}; -} // namespace inner - /** * @brief Scope that manage all variables. * @@ -110,8 +102,13 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map, - inner::KeyHasher> + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> vars_; private: From a500dfa579907d8046e40a15e67558c350498976 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 18 Dec 2018 06:27:32 +0000 Subject: [PATCH 015/103] rewrite ddim test=develop --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/array.h | 74 ++- paddle/fluid/framework/ddim.cc | 303 ++++-------- paddle/fluid/framework/ddim.h | 148 ++++-- paddle/fluid/framework/dim.h | 441 ++++++------------ paddle/fluid/framework/dlpack_tensor.cc | 6 +- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/unroll_array_ops.h | 169 +++++++ .../fluid/operators/controlflow/logical_op.cc | 2 - paddle/fluid/operators/crop_op.h | 1 - 
paddle/fluid/operators/cudnn_lstm_op.cu.cc | 1 - .../fluid/operators/detail/strided_memcpy.h | 38 +- .../detection/generate_proposal_labels_op.cc | 2 - .../detection/generate_proposals_op.cc | 6 - .../detection/rpn_target_assign_op.cc | 1 - .../operators/elementwise/elementwise_op.h | 1 - paddle/fluid/operators/expand_op.h | 1 - paddle/fluid/operators/fc_op.cc | 1 - .../fused/fused_embedding_fc_lstm_op.cc | 18 +- paddle/fluid/operators/hinge_loss_op.cc | 1 - paddle/fluid/operators/log_loss_op.cc | 1 - .../fluid/operators/math/math_function_impl.h | 3 - paddle/fluid/operators/math/softmax_impl.h | 1 - .../fluid/operators/modified_huber_loss_op.cc | 1 - paddle/fluid/operators/mul_op.cc | 6 - paddle/fluid/operators/nce_op.cc | 1 - paddle/fluid/operators/norm_op.h | 1 - paddle/fluid/operators/psroi_pool_op.h | 1 - .../sequence_ops/sequence_slice_op.h | 2 - paddle/fluid/operators/strided_memcpy.h | 2 +- 30 files changed, 622 insertions(+), 615 deletions(-) create mode 100644 paddle/fluid/framework/unroll_array_ops.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d7a69c8c9e11..023118d74078d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -36,7 +36,7 @@ add_subdirectory(details) proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index be9efcd74924a..aa0abc22a6bc9 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -15,34 +15,88 @@ #pragma once #include -#include "paddle/fluid/platform/hostdevice.h" +#include 
"paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { + template class Array { - static_assert(N > 0, "The size of array must be larger than 0"); - public: - HOSTDEVICE Array() {} + static constexpr size_t kSize = N; - HOSTDEVICE explicit Array(const T &val) { - for (size_t i = 0; i < N; ++i) data_[i] = val; + HOSTDEVICE inline Array() = default; + + template + HOSTDEVICE inline explicit Array(const T &val, Args... args) { + UnrollVarArgsAssign::Run(data_, val, args...); } - HOSTDEVICE const T *Get() const { return data_; } + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } - HOSTDEVICE T *GetMutable() { return data_; } + HOSTDEVICE inline const T *Get() const { return data_; } - HOSTDEVICE T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE const T &operator[](size_t index) const { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T &operator[](size_t index) const { + return data_[index]; + } HOSTDEVICE constexpr size_t size() const { return N; } + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + private: T data_[N]; }; +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() = default; + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t index) { +#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t index) const { 
+#ifndef __CUDA_ARCH__ + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 05e423b8a5296..3640138e18059 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,201 +18,131 @@ limitations under the License. */ namespace paddle { namespace framework { -/// @cond HIDDEN +template +struct DDimAssignFunctor { + static_assert(std::is_integral::value, "T must be integral type"); + using result_type = void; + explicit DDimAssignFunctor(const T* in) : in_(in) {} -template -Dim make_dim(const int64_t* d) { - return Dim(*d, make_dim(d + 1)); -} + template + inline void operator()(Dim& dim) { // NOLINT + UnrollAssign::Run(in_, dim.data()); + } + + const T* in_; +}; -template <> -Dim<0> make_dim<0>(const int64_t* d) { - return Dim<0>(*d); +DDim::DDim(const int* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -void make_ddim(DDim& ddim, const int64_t* dims, int n) { - switch (n) { - case 0: - ddim = make_dim<0>(dims); - break; - case 1: - ddim = make_dim<1>(dims); - break; - case 2: - ddim = make_dim<2>(dims); - break; - case 3: - ddim = make_dim<3>(dims); - break; - case 4: - ddim = make_dim<4>(dims); - break; - case 5: - ddim = make_dim<5>(dims); - break; - case 6: - ddim = make_dim<6>(dims); - break; - case 7: - ddim = make_dim<7>(dims); - break; - case 8: - ddim = make_dim<8>(dims); - break; - case 9: - ddim = make_dim<9>(dims); - break; - default: - PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions."); - } +DDim::DDim(const int64_t* d, int n) : rank_(n) { + this->apply_visitor(DDimAssignFunctor(d)); } -/// @endcond +template +Dim 
make_dim(const int64_t* d) { + Dim ret; + for (int i = 0; i < N; ++i) ret[i] = d[i]; + return ret; +} DDim make_ddim(std::initializer_list dims) { - DDim result(make_dim(0)); - make_ddim(result, dims.begin(), dims.size()); - return result; + return DDim(dims.begin(), dims.size()); } DDim make_ddim(const std::vector& dims) { - DDim result(make_dim(0)); - make_ddim(result, &dims[0], dims.size()); - return result; + return DDim(dims.data(), dims.size()); } DDim make_ddim(const std::vector& dims) { - std::vector res(dims.size()); - std::transform(dims.begin(), dims.end(), res.begin(), - [](int d) { return static_cast(d); }); - return make_ddim(res); + return DDim(dims.data(), dims.size()); } -/// @cond HIDDEN -// XXX For some reason, putting this in an anonymous namespace causes errors -class DynamicMutableIndexer : public boost::static_visitor { - public: - explicit DynamicMutableIndexer(int idx) : idx_(idx) {} - - template - int64_t& operator()(Dim& dim) const { - return dim[idx_]; - } - - private: - int idx_; -}; - -class DynamicConstIndexer : public boost::static_visitor { - public: - explicit DynamicConstIndexer(int idx) : idx_(idx) {} +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} template - int64_t operator()(const Dim& dim) const { - return dim[idx_]; + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.data(), d_); } - private: - int idx_; + const int64_t* d_; }; -/// @endcond - -int64_t& DDim::operator[](int idx) { - return boost::apply_visitor(DynamicMutableIndexer(idx), var); -} - -int64_t DDim::operator[](int idx) const { - return boost::apply_visitor(DynamicConstIndexer(idx), var); +bool DDim::operator==(const DDim& d) const { + return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.data())); } -int DDim::size() const { return arity(*this); } - -bool DDim::operator==(DDim d) const { - if (var.which() != d.getVar().which()) { - return false; - } else { - std::vector v1 = 
vectorize(*this); - std::vector v2 = vectorize(d); +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } - for (unsigned int i = 0; i < v1.size(); i++) { - if (v1[i] != v2[i]) { - return false; - } - } +struct DDimPlusVisitor { + explicit DDimPlusVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - return true; + template + inline void operator()(Dim& self) const { + UnrollAdd::Run(d1_, d2_, self.data()); } -} - -bool DDim::operator!=(DDim d) const { return !(*this == d); } - -DDim DDim::operator+(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] + v2[i]); - } + const int64_t* d1_; + const int64_t* d2_; +}; - return make_ddim(v3); +DDim DDim::operator+(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimPlusVisitor(data(), d.data())); + return ret; } -DDim DDim::operator*(DDim d) const { - std::vector v1 = vectorize(*this); - std::vector v2 = vectorize(d); +struct DDimMulVisitor { + explicit DDimMulVisitor(const int64_t* d1, const int64_t* d2) + : d1_(d1), d2_(d2) {} - std::vector v3; - - assert(v1.size() == v2.size()); - - for (unsigned int i = 0; i < v1.size(); i++) { - v3.push_back(v1[i] * v2[i]); + template + inline void operator()(Dim& self) const { + UnrollMul::Run(d1_, d2_, self.data()); } - return make_ddim(v3); + const int64_t* d1_; + const int64_t* d2_; +}; + +DDim DDim::operator*(const DDim& d) const { + PADDLE_ENFORCE(rank_ == d.rank_); + DDim ret; + ret.rank_ = rank_; + ret.apply_visitor(DDimMulVisitor(data(), d.data())); + return ret; } int64_t get(const DDim& ddim, int idx) { return ddim[idx]; } -void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } - -/// @cond HIDDEN -struct VectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - - explicit 
VectorizeVisitor(std::vector& v) : vector(v) {} - - template - void operator()(const T& t) { - vector.push_back(t.head); - this->operator()(t.tail); - } - - void operator()(const Dim<0>& t) {} -}; -/// @endcond +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { - std::vector result; - VectorizeVisitor visitor(result); - boost::apply_visitor(visitor, ddim); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } // NOTE: framework::vectorize converts to type int64_t // which does not fit cudnn inputs. std::vector vectorize2int(const DDim& ddim) { - std::vector temp = vectorize(ddim); - std::vector result(temp.begin(), temp.end()); + std::vector result(DDim::kMaxRank); + for (int i = 0; i < ddim.size(); ++i) { + result[i] = ddim[i]; + } + result.resize(ddim.size()); return result; } -struct ProductVisitor : public boost::static_visitor { +struct ProductVisitor { template int64_t operator()(const Dim& dim) { return product(dim); @@ -220,65 +150,27 @@ struct ProductVisitor : public boost::static_visitor { }; int64_t product(const DDim& ddim) { - ProductVisitor visitor; - return boost::apply_visitor(visitor, ddim); + return ddim.apply_visitor(ProductVisitor()); } -struct SliceVectorizeVisitor : public boost::static_visitor<> { - std::vector& vector; - int begin; - int end; - - SliceVectorizeVisitor(std::vector& v, int b, int e) - : vector(v), begin(b), end(e) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - } - - template - void operator()(const Dim& dim) { - if (begin == 0) { - vector.push_back(dim.head); - } else { - --begin; - } - --end; - if (end > 0) { - this->operator()(dim.tail); - } - } - - void operator()(const Dim<0>& dim) { - PADDLE_ENFORCE(end == 0, "End index in ddim 
slice is out of bound."); - } -}; - DDim slice_ddim(const DDim& dim, int begin, int end) { - std::vector vec; - vec.reserve(end - begin); - SliceVectorizeVisitor visitor(vec, begin, end); - boost::apply_visitor(visitor, dim); - return make_ddim(vec); -} - -/// \cond HIDDEN - -struct ArityVisitor : boost::static_visitor { - template - int operator()(Dim) const { - return D; + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); + DDim ret; + ret.rank_ = end - begin; + for (int i = 0; i < ret.rank_; ++i) { + ret[i] = dim[i + begin]; } -}; - -/// \endcond + return ret; +} -int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } +int arity(const DDim& d) { return d.size(); } /// \cond HIDDEN -struct DDimPrinter : boost::static_visitor { +struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} @@ -291,15 +183,10 @@ struct DDimPrinter : boost::static_visitor { /// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { - DDimPrinter printer(os); - boost::apply_visitor(printer, ddim); + ddim.apply_visitor(DDimPrinter(os)); return os; } -DDim::DDim(std::initializer_list init_list) { - *this = make_ddim(init_list); -} - DDim flatten_to_2d(const DDim& src, int num_col_dims) { int rank = src.size(); return make_ddim({product(slice_ddim(src, 0, num_col_dims)), @@ -309,21 +196,23 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim stride(const DDim& ddim) { - std::vector strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; } - return framework::make_ddim(strides); + return strides; } DDim stride_numel(const framework::DDim& ddim) { - std::vector strides(ddim.size()); + 
DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; } - return framework::make_ddim(strides); + return strides; } } // namespace framework diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index f05b5ee3faee8..bff710040eba2 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -18,8 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { @@ -29,51 +27,138 @@ namespace framework { * * The number of dimensions must be between [1, 9]. */ -struct DDim { - typedef boost::variant, Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, - Dim<7>, Dim<8>, Dim<9>> - DDimVar; - DDimVar var; +class DDim { + public: + constexpr static int kMaxRank = 9; - DDim() : var(Dim<1>()) {} + DDim() : rank_(1) { dim_[0] = 0; } + + DDim(const int* d, int n); + DDim(const int64_t* d, int n); template - explicit DDim(const Dim& in) : var(in) {} + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } - /*implicit*/ DDim(std::initializer_list init_list); + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} template - DDim& operator=(const Dim& in) { - var = in; + inline DDim& operator=(const Dim& in) { + rank_ = D; + UnsafeCast() = in; return *this; } - int64_t& operator[](int idx); - int64_t operator[](int idx) const; + inline int64_t& operator[](int idx) { return dim_[idx]; } - template - typename Visitor::result_type apply_visitor(Visitor& visitor) { - return var.apply_visitor(visitor); + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - template - typename 
Visitor::result_type apply_visitor(Visitor& visitor) const { - return var.apply_visitor(visitor); + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_); + return dim_[idx]; } - DDimVar getVar() { return var; } + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor); + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const; + + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + DDim operator+(const DDim& d) const; - bool operator==(DDim d) const; + DDim operator*(const DDim& d) const; - bool operator!=(DDim d) const; + // Make DDim act like std::vector + using iterator = int64_t*; + using const_iterator = const int64_t*; - DDim operator+(DDim d) const; + int64_t* data() { return dim_.data(); } + const int64_t* data() const { return dim_.data(); } - DDim operator*(DDim d) const; + iterator begin() { return data(); } + const_iterator begin() const { return data(); } + iterator end() { return data() + rank_; } + const_iterator end() const { return data() + rank_; } + + int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + return const_cast&>(const_cast(this)->UnsafeCast()); + } - int size() const; + template + inline const Dim& UnsafeCast() const { + static_assert(M >= 0 && M <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + friend DDim slice_ddim(const DDim& dim, int begin, int end); + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + Dim dim_; + int rank_; }; +#define PADDLE_VISIT_DDIM(rank) \ + case rank: \ + return visitor(UnsafeCast()) + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& visitor) { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + 
PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} + +template +typename std::result_of&)>::type DDim::apply_visitor( + Visitor&& visitor) const { + switch (rank_) { + PADDLE_VISIT_DDIM(0); + PADDLE_VISIT_DDIM(1); + PADDLE_VISIT_DDIM(2); + PADDLE_VISIT_DDIM(3); + PADDLE_VISIT_DDIM(4); + PADDLE_VISIT_DDIM(5); + PADDLE_VISIT_DDIM(6); + PADDLE_VISIT_DDIM(7); + PADDLE_VISIT_DDIM(8); + PADDLE_VISIT_DDIM(9); + default: + PADDLE_THROW("Invalid rank %d", rank_); + } +} +#undef PADDLE_VISIT_DDIM + /** * \brief Make a DDim from std::vector * @@ -92,7 +177,7 @@ DDim make_ddim(const std::vector& dims); DDim make_ddim(std::initializer_list dims); int64_t get(const DDim& dim, int idx); -void set(DDim& dim, int idx, int val); +void set(DDim& dim, int idx, int val); // NOLINT std::vector vectorize(const DDim& ddim); std::vector vectorize2int(const DDim& ddim); @@ -129,12 +214,3 @@ DDim stride(const DDim& ddim); DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle - -namespace boost { - -template -T get(const paddle::framework::DDim& in) { - return boost::get(in.var); -} - -} // namespace boost diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 73f92fa389fa3..3ae60a3119e1b 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -16,328 +16,184 @@ #include #include #include +#include #include +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -struct Dim { - static constexpr int dimensions = i; +template +class Dim : public Array { + public: + static_assert(N >= 0, "N must be not less than 0"); - template - HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) 
{ - static_assert(sizeof...(_tail) == i - 1, - "Dim initialized with the wrong number of parameters"); - } + static constexpr int kRank = N; + using BaseClass = Array; - HOSTDEVICE - Dim(int64_t _head, const Dim& _tail) : head(_head), tail(_tail) {} + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } - HOSTDEVICE - Dim() : head(0), tail() {} + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ - HOSTDEVICE - Dim(int64_t idx, const Dim& size) - : head(idx % size.head), tail(idx / size.head, size.tail) {} + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE - Dim(int64_t idx) : head(idx), tail(idx) {} + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - HOSTDEVICE - bool operator==(const Dim& o) const { - return (head == o.head) && (tail == o.tail); - } + HOSTDEVICE Dim() = default; - HOSTDEVICE - bool operator!=(const Dim& o) const { return !(*this == o); } + HOSTDEVICE int64_t* data() { return this->GetMutable(); } - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE - int64_t operator[](int idx) const; + HOSTDEVICE const int64_t* data() const { return this->Get(); } HOST std::string to_string() const; - - int64_t head; - Dim tail; -}; - -// Base case specialization -template <> -struct Dim<0> { - static constexpr int dimensions = 0; - - HOSTDEVICE - Dim(int64_t _head) {} - - HOSTDEVICE - Dim() {} - - HOSTDEVICE - Dim(int idx, const Dim<0>& size) { -#ifndef __CUDA_ARCH__ - if (idx > 0) { - throw std::invalid_argument("Index out of range."); - } -#else - PADDLE_ASSERT(idx == 0); -#endif - } - - HOSTDEVICE - bool operator==(const Dim<0>& o) const { return true; } - - HOSTDEVICE - bool operator!=(const Dim<0>& o) const { return false; } - - HOSTDEVICE - int64_t& operator[](int idx); - HOSTDEVICE 
- int64_t operator[](int idx) const; }; -namespace { - -// Helper for accessing Dim classes -template -struct DimGetter { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return DimGetter::impl(d.tail); - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return DimGetter::impl(d.tail); +namespace detail { +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) { + out[kStart] = (*idx) % in[kStart]; + (*idx) /= in[kStart]; + FortranOrderIndexingConstructorFunctor::Run(in, idx, + out); } }; -// Eureka! We found the element! -template <> -struct DimGetter<0> { - // Return a copy if Dim is const - template - HOSTDEVICE static int64_t impl(const D& d) { - return d.head; - } - // Return a reference if Dim is mutable - template - HOSTDEVICE static int64_t& impl(D& d) { - return d.head; - } +template +struct FortranOrderIndexingConstructorFunctor { + HOSTDEVICE inline static void Run(const int64_t* in, int64_t* idx, + int64_t* out) {} }; +} // namespace detail -template -HOSTDEVICE int64_t& indexer(Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); +template +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, N, N == 0>::Run( + size.Get(), &idx, this->GetMutable()); } -template <> -HOSTDEVICE int64_t& indexer<0>(Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. 
- int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif +template +HOSTDEVICE inline int64_t get(const Dim& dim) { + return dim[idx]; } -template -HOSTDEVICE int64_t indexer(const Dim& dim, int idx) { -#ifndef __CUDA_ARCH__ - if (idx < 0) { - throw std::invalid_argument("Tried to access a negative dimension"); - } -#else - PADDLE_ASSERT(idx >= 0); -#endif - if (idx == 0) { - return dim.head; - } - return indexer(dim.tail, idx - 1); -} - -template <> -HOSTDEVICE int64_t indexer<0>(const Dim<0>& dim, int idx) { -#ifndef __CUDA_ARCH__ - throw std::invalid_argument("Invalid index"); -#else - PADDLE_ASSERT(false); -#if CUDA_VERSION < 8000 - // On CUDA versions previous to 8.0, only __shared__ variables - // could be declared as static in the device code. - int64_t head = 0; -#else - static int64_t head = 0; -#endif - return head; -#endif -} - -} // namespace -// Static access to constant Dim -template -HOSTDEVICE int64_t get(const Dim& d) { - return DimGetter::impl(d); -} - -// Static access to mutable Dim -template -HOSTDEVICE int64_t& get(Dim& d) { - return DimGetter::impl(d); -} - -// Dynamic access to constant Dim -template -HOSTDEVICE int64_t Dim::operator[](int i) const { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT + return dim[idx]; } -// Dynamic access to mutable Dim -template -HOSTDEVICE int64_t& Dim::operator[](int i) { - return indexer(*this, i); +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { + return dim[idx]; } -// Dynamic access to constant Dim -inline HOSTDEVICE int64_t Dim<0>::operator[](int i) const { - return indexer(*this, i); -} - -// Dynamic access to mutable Dim -inline HOSTDEVICE int64_t& Dim<0>::operator[](int i) { - return indexer(*this, i); -} - -// Dynamic access to constant Dim -// without std::enable_if will try to instantiate this on get<0>(d) -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim& d, - int i) { - 
return d[i]; -} - -// Dynamic access to mutable Dim -template -HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim& d, - int i) { - return d[i]; +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT + return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE int64_t linearize(const Dim& a, const Dim& b) { - return a.head * b.head + linearize(a.tail, b.tail); -} - -// Base case dot product of two Dims -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t linearize(const Dim<0>& a, const Dim<0>& b) { - return 0; +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE int64_t product(const Dim& a, int prod = 1) { - return prod * a.head * product(a.tail); -} - -// Base case product of a Dim -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline int64_t product(const Dim<0>& a, int prod) { - return prod; +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? -template -HOSTDEVICE bool contained(const Dim& idx, const Dim& size) { - return ((0 <= idx.head) && (idx.head < size.head) && - contained(idx.tail, size.tail)); -} +namespace detail { +template +struct ContainedFunctor { + HOSTDEVICE static inline bool Run(const int64_t* idx, const int64_t* size) { + return (idx[kStart] >= 0 && idx[kStart] < size[kStart]) && + ContainedFunctor::Run(idx, + size); + } +}; + +template +struct ContainedFunctor { + HOSTDEVICE static constexpr inline bool Run(const int64_t* idx, + const int64_t* size) { + return true; + } +}; +} // namespace detail -// Base case of is 0 <= idx_i < size_i ? 
-// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline bool contained(const Dim<0>& idx, const Dim<0>& size) { - return true; +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, N, N == 0>::Run(idx.Get(), size.Get()); } /** * \brief Compute exclusive prefix-multiply of a Dim. */ -template -HOSTDEVICE Dim ex_prefix_mul(const Dim& src, int mul = 1) { - return Dim(mul, ex_prefix_mul(src.tail, mul * src.head)); -} +namespace detail { +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) { + kStart == 0 ? out[kStart] = 1 : out[kStart] = + out[kStart - 1] * in[kStart - 1]; + detail::ExPrefixMulFunctor::Run(in, + out); + } +}; + +template +struct ExPrefixMulFunctor { + HOSTDEVICE static inline void Run(const int64_t* in, int64_t* out) {} +}; +} // namespace detail -///\cond HIDDEN -// Base case of ex_prefix_mul -// Notice it is inline because it is no longer a template -template <> -HOSTDEVICE inline Dim<0> ex_prefix_mul(const Dim<0>& src, int mul) { - return Dim<0>(); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, N, N == 0>::Run(src.Get(), ret.GetMutable()); + return ret; } -///\endcond /** * Add two dimensions together */ -template -HOSTDEVICE Dim dim_plus(const Dim& a, const Dim& b) { - return Dim(a.head + b.head, dim_plus(a.tail, b.tail)); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_plus(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); -} - -template -HOSTDEVICE Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE Dim 
dim_mult(const Dim& a, const Dim& b) { - return Dim(a.head * b.head, dim_mult(a.tail, b.tail)); -} - -// Base case -template <> -HOSTDEVICE inline Dim<0> dim_mult(const Dim<0>& a, const Dim<0>& b) { - return Dim<0>(); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); + return ret; } template @@ -354,23 +210,32 @@ HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { * \return Dim object the same size as \p size with normalized strides * */ +namespace detail { +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) { + ret[kStart] = (size[kStart] == 1 ? 0 : stride[kStart]); + NormalizeStridesFunctor::Run( + size, stride, ret); + } +}; -template -HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { - int norm_stride = size.head == 1 ? 0 : stride.head; - return Dim(norm_stride, normalize_strides(size.tail, stride.tail)); -} - -///\cond HIDDEN +template +struct NormalizeStridesFunctor { + HOSTDEVICE static void Run(const int64_t* size, const int64_t* stride, + int64_t* ret) {} +}; +} // namespace detail -template <> -HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, - const Dim<0>& stride) { - return Dim<0>(); +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, N, N == 0>::Run(size.Get(), stride.Get(), + ret.GetMutable()); + return ret; } -///\endcond - /** * Helper function to create a Dim * @@ -379,25 +244,17 @@ HOSTDEVICE inline Dim<0> normalize_strides(const Dim<0>& size, */ template -HOSTDEVICE Dim make_dim(Args... idxes) { +HOSTDEVICE inline Dim make_dim(Args... 
idxes) { return Dim(idxes...); } // Allows us to output a Dim -// XXX For some reason, overloading fails to resolve this correctly -template -typename std::enable_if<(i > 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head << ", " << d.tail; - return os; -} - -// Base case that allows us to output a Dim -// XXX I wish this could be an overload instead of a template -template -typename std::enable_if<(i == 1), std::ostream&>::type operator<<( - std::ostream& os, const Dim& d) { - os << d.head; +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < N; ++i) { + os << ", " << d[i]; + } return os; } @@ -405,25 +262,23 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; - stream << *this; - return stream.str(); } -template -HOSTDEVICE Dim linear_to_dimension(int linear_index, Dim extents) { - Dim result; +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { + Dim result; - for (int i = 0; i < D - 1; ++i) { + for (int i = 0; i < N - 1; ++i) { result[i] = linear_index % extents[i]; linear_index /= extents[i]; } - result[D - 1] = linear_index; + result[N - 1] = linear_index; return result; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 04e3f78afe44b..5014fcd06a0ee 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -62,7 +62,7 @@ static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CPUPlace &place) const { - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPU; ctx.device_id = 0; return ctx; @@ -70,7 +70,7 @@ struct DLContextVisitor : public 
boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLGPU; ctx.device_id = place.device; return ctx; @@ -81,7 +81,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { #ifdef PADDLE_WITH_CUDA - DLContext ctx; + ::DLContext ctx; ctx.device_type = kDLCPUPinned; ctx.device_id = 0; return ctx; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index 0c52bce1ef6af..e48b0d5c88fec 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -38,7 +38,7 @@ class DLPackTensor { // The shape in DLTensor is defined as int64_t* // Add this member to make TVMTensor init without heap allocation - ShapeType shape_[9]; + ShapeType shape_[DDim::kMaxRank]; }; } // namespace framework diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 0000000000000..fb0a89530f61a --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssign::Run(d, + args...); + } +}; + +template +struct UnrollVarArgsAssign { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] + d2[kStart]; + UnrollAdd::Run(d1, d2, d3); + } +}; + +template +struct UnrollAdd { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) {} +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T *d1, const T *d2, T *d3) { + d3[kStart] = d1[kStart] * d2[kStart]; + UnrollMul::Run(d1, d2, d3); + } +}; + +template +struct UnrollMul { + template + HOSTDEVICE inline static void Run(const T 
*d1, const T *d2, T *d3) {} +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } + + template + HOSTDEVICE inline static T Run(const T *d1, const T *d2) { + return d1[kStart] * d2[kStart] + + UnrollProduct::Run(d1, d2); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } + + template + HOSTDEVICE inline constexpr static T Run(const T *d1, const T *d2) { + return 0; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollAdd = detail::UnrollAdd<0, N, N == 0>; + +template +using UnrollMul = detail::UnrollMul<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index 6446cab5ec5f8..2e7f3edd55c33 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -86,8 +86,6 @@ class UnaryLogicalOpInferShape : public framework::InferShapeBase { OpComment comment; PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of %s operator must not be null", comment.type); - auto dim_x = context->GetInputDim("X"); - context->SetOutputDim("Out", context->GetInputDim("X")); context->ShareLoD("X", "Out"); } diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 2d7d33bd4f9b4..cfc2cac7beb8a 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -68,7 +68,6 @@ void CropFunction(const framework::ExecutionContext& context) 
{ } out->mutable_data(out_dims, context.GetPlace()); auto x_stride = framework::stride(x->dims()); - auto out_stride = framework::stride(out->dims()); auto offsets = GetOffsets(context); int64_t offset = 0; for (size_t i = 0; i < offsets.size(); ++i) { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327fc38..744d149714c62 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -378,7 +378,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { ->GetMutable(); auto input_dims = input->dims(); - auto weight_dims = weight->dims(); auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); in_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 0b7c470fe72eb..fc223ce55931e 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -27,8 +27,8 @@ struct StridedMemcpyFunctor; template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<0> src_stride, framework::Dim<0> dst_dim, - framework::Dim<0> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); @@ -50,18 +50,18 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim<1> src_stride, framework::Dim<1> dst_dim, - framework::Dim<1> dst_stride, T* dst) const { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { auto place = dev_ctx.GetPlace(); if (platform::is_cpu_place(place)) { auto& cpu_place = boost::get(place); - memory::Copy(cpu_place, dst, 
cpu_place, src, sizeof(T) * dst_dim.head); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { #ifdef PADDLE_WITH_CUDA auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); - memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], cuda_ctx.stream()); #else PADDLE_THROW("Paddle is not compiled with GPU"); @@ -73,19 +73,19 @@ struct StridedMemcpyFunctor { template struct StridedMemcpyFunctor { void operator()(const platform::DeviceContext& dev_ctx, const T* src, - framework::Dim src_stride, framework::Dim dst_dim, - framework::Dim dst_stride, T* dst) const { - for (int64_t i = 0; i < dst_dim.head; ++i) { + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { StridedMemcpyFunctor func; - func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst); - src += src_stride.head; - dst += dst_stride.head; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; } } }; template -struct StridedCopyDimVisitor : public boost::static_visitor { +struct StridedCopyDimVisitor { StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, const framework::DDim& src_stride, const framework::DDim& dst_stride, T* dst) @@ -95,13 +95,11 @@ struct StridedCopyDimVisitor : public boost::static_visitor { dst_stride_(dst_stride), dst_(dst) {} - template - void operator()(Dim dst_dim) const { - Dim src_stride = boost::get(src_stride_); - Dim dst_stride = boost::get(dst_stride_); - constexpr int dim = Dim::dimensions; - StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_); + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride_.data(), dst_dim.data(), + 
dst_stride_.data(), dst_); } const platform::DeviceContext& dev_ctx_; diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index fddd6884017c3..a652d4d95750f 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -64,8 +64,6 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { "Output(BboxOutsideWeights) of RpnTargetAssignOp should not be null"); auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); - auto gt_classes_dims = ctx->GetInputDim("GtClasses"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); auto im_info_dims = ctx->GetInputDim("ImInfo"); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 709c2dfc4b7c6..f1975a9a4be69 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -53,12 +53,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Variances"), "Input(Variances) shouldn't be null."); - auto scores_dims = ctx->GetInputDim("Scores"); - auto bbox_deltas_dims = ctx->GetInputDim("BboxDeltas"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - auto anchors_dims = ctx->GetInputDim("Anchors"); - auto variances_dims = ctx->GetInputDim("Variances"); - ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 46fff9d338b77..fd5d75ba527a5 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -58,7 +58,6 @@ class RpnTargetAssignOp : public 
framework::OperatorWithKernel { auto anchor_dims = ctx->GetInputDim("Anchor"); auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto is_crowd_dims = ctx->GetInputDim("IsCrowd"); auto im_info_dims = ctx->GetInputDim("ImInfo"); PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, "The rank of Input(Anchor) must be 2."); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 87bf7c6b156f3..775346c5524e7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -178,7 +178,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), "Rank of first input must >= rank of second input."); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 75dbf1d8bf5cb..339408249771d 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -77,7 +77,6 @@ class ExpandKernel : public framework::OpKernel { auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; - auto x_dims = in0->dims(); for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index e80249fc87855..7c53e5279dafc 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -148,7 +148,6 @@ class FCOpKernel : public framework::OpKernel { auto w = ctx.Input("W"); auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto in_dims = input->dims(); auto w_dims = w->dims(); auto out_dims = output->dims(); int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; diff --git 
a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 1eb6523a2dfb3..9344bfe65db62 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -242,15 +242,15 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { bool is_reverse = ctx.Attr("is_reverse"); \ bool use_peepholes = ctx.Attr("use_peepholes"); -#define INIT_BASE_SIZES \ - auto ids_dims = ids->dims(); /* T x M*/ \ - auto ids_numel = ids->numel(); /* T x 1*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const int D3 = D * 3; \ - int64_t row_number = embeddings->dims()[0]; \ - int64_t row_width = embeddings->dims()[1]; \ +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = framework::product(ids_dims); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ const int D4 = wh_dims[1]; #define INIT_BASE_INPUT_DATAS \ diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 69e7fa4490b89..f458ce6c83bfc 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -88,7 +88,6 @@ class HingeLossGradOp : public framework::OperatorWithKernel { "Input(Logits@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Logits"); - auto lab_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 9d248e03218b8..ef1fb83aa6e34 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ 
b/paddle/fluid/operators/log_loss_op.cc @@ -92,7 +92,6 @@ class LogLossGradOp : public framework::OperatorWithKernel { "Output(Predicted@GRAD) should not be null."); auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 895a7019aa10e..d1127ce4a2461 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -37,9 +37,6 @@ void Transpose::operator()( for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto in_dim = in.dims(); - auto out_dim = out->dims(); - auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9e99e44822b2f..1d9d98b10646a 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -76,7 +76,6 @@ class SoftmaxFunctor> { void operator()(const DeviceContext& context, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); - auto out_dims = Y->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 35db4c1ad1f6c..9954e51083b2c 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -87,7 +87,6 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel { "Input(Out@Grad) must not be null."); auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); auto intermediate_dims = ctx->GetInputDim("IntermediateVal"); 
auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 8a111e6065b10..154b5f0d08fd9 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -146,12 +146,6 @@ class MulGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - auto x_mat_dims = framework::flatten_to_2d( - x_dims, ctx->Attrs().Get("x_num_col_dims")); - auto y_mat_dims = framework::flatten_to_2d( - y_dims, ctx->Attrs().Get("y_num_col_dims")); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9f97f7821ddf5..e58dccea131ee 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -36,7 +36,6 @@ class NCEOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); - auto w_dims = ctx->GetInputDim("Weight"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; if (ctx->HasInput("Bias")) { diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h index d0224177ecf7f..6c95d3f3bf3a3 100644 --- a/paddle/fluid/operators/norm_op.h +++ b/paddle/fluid/operators/norm_op.h @@ -43,7 +43,6 @@ class NormKernel : public framework::OpKernel { out_norm->mutable_data(ctx.GetPlace()); auto xdim = in_x->dims(); - auto ndim = out_norm->dims(); T eps = static_cast(ctx.Attr("epsilon")); int axis = ctx.Attr("axis"); if (axis < 0) axis = xdim.size() + axis; diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 1a424728f7f6c..5666613f6efb9 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -41,7 +41,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; auto in_stride = framework::stride(in_dims); - auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out->dims()); const T* input_data = in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 03b59d71cc0ca..4bded0efb9674 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -143,8 +143,6 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { set_zero(ctx.template device_context(), x_grad, static_cast(0)); - auto out_grad_stride = framework::stride(out_grad->dims()); - for (size_t i = 0; i < out_lod[0].size() - 1; ++i) { Tensor out_grad_t = out_grad->Slice(static_cast(out_lod[0][i]), diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c3d83a06f23a3..6a99ad9a90f69 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -40,7 +40,7 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src, const 
framework::DDim& dst_stride, T* dst) { paddle::operators::detail::StridedCopyDimVisitor func( dev_ctx, src, src_stride, dst_stride, dst); - boost::apply_visitor(func, dst_dim); + dst_dim.apply_visitor(func); } // Strided numel memory copy from src to dst by the specified axis From dda28b0e682859c3868efe1ce65d636363faafd6 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 19 Dec 2018 06:50:10 +0000 Subject: [PATCH 016/103] fix bug in if-else op, test=develop --- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 767449cde981e..5ede972c71ff3 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -63,7 +63,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { } auto *mask_data = cpu_mask->data(); - std::vector> copy_ranges(mask_dim[0]); + std::vector> copy_ranges(2); // set out_true/out_false lod for (size_t t = 0; t < 2; t++) { From ae6f46a1a9029284ba86ac0c783869a4c8468e17 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 19 Dec 2018 11:11:21 +0000 Subject: [PATCH 017/103] rewrite variable type test=develop --- paddle/fluid/framework/CMakeLists.txt | 16 +- .../framework/data_device_transform_test.cu | 1 + .../details/eager_deletion_op_handle.cc | 2 +- .../framework/details/variable_visitor.cc | 4 +- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/operator.cc | 12 +- paddle/fluid/framework/operator.h | 12 + paddle/fluid/framework/scope.cc | 4 +- paddle/fluid/framework/var_type.h | 32 ++- paddle/fluid/framework/var_type_traits.cc | 27 ++ paddle/fluid/framework/var_type_traits.h | 207 ++++++++++++++ .../fluid/framework/var_type_traits_test.cc | 75 ++++++ paddle/fluid/framework/variable.h | 64 ++--- paddle/fluid/framework/variable_test.cc | 23 +- .../api/details/reset_tensor_array.cc | 2 +- .../api/details/reset_tensor_array.h | 
9 +- paddle/fluid/operators/affine_grid_op.cc | 4 +- paddle/fluid/operators/clip_by_norm_op.h | 2 +- .../operators/controlflow/parallel_do_op.cc | 3 +- .../fluid/operators/controlflow/while_op.cc | 7 +- paddle/fluid/operators/conv_op.cc | 4 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 241 +---------------- paddle/fluid/operators/cudnn_rnn_cache.h | 255 ++++++++++++++++++ .../distributed/brpc_sendrecvop_utils.cc | 3 +- .../operators/distributed_ops/split_ids_op.h | 2 +- .../elementwise/elementwise_mul_op.h | 2 +- paddle/fluid/operators/grid_sampler_op.cc | 4 +- .../fluid/operators/optimizers/adadelta_op.h | 6 +- .../fluid/operators/optimizers/adagrad_op.h | 3 +- paddle/fluid/operators/optimizers/adam_op.h | 3 +- paddle/fluid/operators/optimizers/adamax_op.h | 6 +- .../operators/optimizers/decayed_adagrad_op.h | 6 +- paddle/fluid/operators/optimizers/ftrl_op.h | 6 +- .../fluid/operators/optimizers/momentum_op.h | 2 +- paddle/fluid/operators/optimizers/sgd_op.cu | 3 +- paddle/fluid/operators/pool_op.cc | 4 +- paddle/fluid/operators/softmax_op.cc | 4 +- paddle/fluid/operators/sum_mkldnn_op.cc | 2 +- paddle/fluid/operators/sum_op.cc | 2 +- paddle/fluid/operators/sum_op.h | 2 +- paddle/fluid/operators/warpctc_op.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 13 - 42 files changed, 717 insertions(+), 366 deletions(-) create mode 100644 paddle/fluid/framework/var_type_traits.cc create mode 100644 paddle/fluid/framework/var_type_traits.h create mode 100644 paddle/fluid/framework/var_type_traits_test.cc create mode 100644 paddle/fluid/operators/cudnn_rnn_cache.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe88b8..b6372a2ef5934 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -78,17 +78,25 @@ cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memor cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS 
reader_test.cc DEPS reader) -cc_test(variable_test SRCS variable_test.cc) - cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits cudnn) + if (NOT WIN32) + target_link_libraries(var_type_traits nccl) + endif() +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool var_type_traits) cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu - DEPS operator op_registry device_context math_function) + DEPS operator op_registry device_context math_function scope) if(WITH_GPU) if (WIN32) diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index c9ec5e7a7b37b..96a2f9250ff92 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index abacb11e3b018..03fbfd7f24a8a 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -88,7 +88,7 @@ void EagerDeletionOpHandle::RunImpl() { } } else { PADDLE_THROW("Type %s of %s is not supported eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 3dfd14419d943..134f759081a07 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -24,7 +24,7 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", var->Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); } } @@ -35,7 +35,7 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", var.Type().name()); + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index da9556c6c1f34..594fbb48a6d12 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -119,7 +119,7 @@ static void DeleteUnusedTensors( } } else { PADDLE_THROW("Type %s of %s is not supported 
eager deletion", - var->Type().name(), name); + framework::ToTypeName(var->Type()), name); } } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a62afe248baa2..9b4a5011a814f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -365,7 +365,7 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var.Type().name()); + ToTypeName(var.Type())); } } @@ -376,7 +376,7 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { return var->GetMutable()->mutable_value(); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + ToTypeName(var->Type())); } } @@ -430,7 +430,7 @@ const std::vector ExecutionContext::MultiInput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return &(var->Get()); }); return res; @@ -454,7 +454,7 @@ std::vector ExecutionContext::MultiOutput( PADDLE_ENFORCE( var->IsType(), "%s should be LoDTensor, but the received type is %s", - sub_name, var->Type().name()); + sub_name, ToTypeName(var->Type())); return var->GetMutable(); }); return res; @@ -641,7 +641,7 @@ class RuntimeInferShapeContext : public InferShapeContext { PADDLE_THROW( "Only LoDTensor/SelectedRows support 'GetDim', but Variable %s's " "type_id is %s.", - name, var->Type().name()); + name, ToTypeName(var->Type())); } } @@ -657,7 +657,7 @@ class RuntimeInferShapeContext : public InferShapeContext { var->GetMutable()->set_height(dim[0]); } else { PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.", - name, var->Type().name()); + name, ToTypeName(var->Type())); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce01..f8d2f1fe126b9 100644 --- 
a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -288,6 +288,18 @@ class ExecutionContext { const platform::DeviceContext& device_context_; }; +inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + return use_cudnn; +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 6fa5e99f9f3a7..750b626603178 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -165,11 +165,9 @@ std::string Scope::Rename(const std::string& origin_name) const { Variable* Scope::VarInternal(const std::string& name) { auto* v = FindVarLocally(name); if (v != nullptr) return v; - v = new Variable(); - vars_[name].reset(v); + vars_.emplace(name, std::unique_ptr(v)); VLOG(3) << "Create variable " << name; - v->name_ = &(vars_.find(name)->first); return v; } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 3b6f1cdb8f24a..f1cbaf3fdc22c 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -19,35 +19,33 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { template -inline bool IsType(const std::type_index& type_index) { - return type_index == std::type_index(typeid(T)); +inline bool IsType(const std::type_index& type) { + return type == typeid(T); } -inline proto::VarType::Type ToVarType(std::type_index type) { - if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_RANK_TABLE; - } else if (IsType(type)) { - return proto::VarType_Type_LOD_TENSOR_ARRAY; - } else if (IsType(type)) { - return proto::VarType_Type_SELECTED_ROWS; - } else if (IsType(type)) { - return proto::VarType_Type_READER; - } else { - PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); } } template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { - switch (ToVarType(var.Type())) { + switch (var.Type()) { case proto::VarType_Type_LOD_TENSOR: visitor(var.Get()); return; @@ -64,7 +62,7 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); } } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 0000000000000..0171df6f7389d --- /dev/null +++ 
b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { + +const char* ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +const std::type_index& ToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 0000000000000..88f917e74fc14 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,207 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args... or T is void, return -1 + template + static constexpr int TypePos() { + return std::is_same::value ? 
-1 : TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = proto_id; \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +class Scope; + +using VarTypeRegistry = detail::VarTypeRegistryImpl< + LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, + platform::PlaceList, ReaderHolder, Tensor, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, + int, float, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, +#endif + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::AlgorithmsCache, + operators::CudnnRNNCache, +#endif + void>; // void indicates end of registration, add other types before void + +template +struct VarTypeTrait { + static_assert(std::is_same::value || + VarTypeRegistry::IsRegistered(), + "Must be registered type"); + using Type = T; + // Default id generation + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto here +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, 
proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); + +/** End of variable type registration */ + +// Besides register variable id, it is helpful to register a +// var_id -> std::type_index (for example, get var names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) { + using Type = + typename std::tuple_element::type; + constexpr int kId = VarTypeTrait::kId; + if (!std::is_same::value) { + m->emplace(kId, std::type_index(typeid(Type))); + } + VarIdToTypeIndexMapInitializerImpl::Init(m); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + public: + static const std::type_index &ToTypeIndex(int var_id) { + static const VarIdToTypeIndexMapHolder instance; + auto it = instance.var_type_map_.find(var_id); + PADDLE_ENFORCE(it != instance.var_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + } + std::unordered_map var_type_map_; +}; + +} // namespace detail + +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc 
b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 0000000000000..09fab719c1674 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include +#include + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + static void Check() { + using Type = + typename std::tuple_element::type; + if (!std::is_same::value) { + EXPECT_TRUE(ToTypeIndex(VarTypeTrait::kId) == typeid(Type)); + EXPECT_TRUE(std::string(ToTypeName(VarTypeTrait::kId)) == + typeid(Type).name()); + } + TypeIndexChecker::Check(); + } +}; + +template +struct TypeIndexChecker { + static void Check() {} +}; + +TEST(var_type_traits, check_type_index) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + 
ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); +} + +TEST(var_type_traits, test_registry) { + using Registry = + detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a584d..8aa68942ad16f 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { @@ -27,10 +27,14 @@ class Variable { public: template const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. 
Please register T inside var_type_traits.h"); PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); return *static_cast(holder_->Ptr()); } @@ -39,61 +43,59 @@ class Variable { template T* GetMutable() { if (!holder_) { - holder_.reset(new PlaceholderImpl(new T())); + holder_.reset(new PlaceholderImpl()); } else { - PADDLE_ENFORCE(IsType(), + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); } return static_cast(holder_->Ptr()); } template bool IsType() const { - return holder_ != nullptr && - std::type_index(typeid(T)) == std::type_index(holder_->Type()); + return holder_ && holder_->Type() == VarTypeTrait::kId; } void Clear() { holder_.reset(); } - std::type_index Type() const { + int Type() const { PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); return holder_->Type(); } private: struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_info& Type() const = 0; - virtual void* Ptr() const = 0; + explicit Placeholder(int type) : type_(type) {} + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + void* ptr_; + int type_; }; // Placeholder hides type T, so it doesn't appear as a template // parameter of Variable. 
template struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} - - virtual const std::type_info& Type() const { return type_; } - virtual void* Ptr() const { return static_cast(ptr_.get()); } + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() : Placeholder(VarTypeTrait::kId) { + this->ptr_ = &obj_; + } - std::unique_ptr ptr_; - const std::type_info& type_; + private: + T obj_; }; - std::unique_ptr - holder_; // pointers to a PlaceholderImpl object indeed. - - // name_ is only meaningful with a Scope and accessible by it. - // - // NOTE: Please don't expose name_ by adding methods like - // Variable::Name or Scope::VarName! A variable could have a human - // readable name or an auto-generated scope-unique name. In the - // former case, the caller knows the name and doesn't need to access - // the name; in the latter case, the variable should be identified - // by its address but not the unreadable name. - friend class Scope; - const std::string* name_; + // pointers to a PlaceholderImpl object indeed. 
+ std::unique_ptr holder_; }; } // namespace framework diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3dfe5e..511c9c52146ec 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -16,27 +16,28 @@ #include #include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -TEST(Variable, GetMutable) { - using paddle::framework::Variable; - - struct Tensor { - int content_; - }; +namespace paddle { +namespace framework { +TEST(Variable, GetMutable) { std::unique_ptr v(new Variable()); - Tensor* t = v->GetMutable(); - t->content_ = 1234; + auto* t = v->GetMutable(); + *t = "1234"; - const Tensor& tt = v->Get(); - EXPECT_EQ(1234, tt.content_); + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); try { - v->GetMutable(); + v->GetMutable(); } catch (std::exception& e) { return; } EXPECT_TRUE(false); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 569a487328e2f..03c2aa3fb8094 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -25,7 +25,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { // TODO(Superjomn) should avoid the case when a TensorArray is a // parameter. 
if (var_name == "feed" || var_name == "fetch") continue; - if (var->Type() == typeid(framework::LoDTensorArray)) { + if (var->IsType()) { VLOG(4) << "collect " << var_name; arrays_.push_back(var->GetMutable()); } diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index 6a5ea64de66fc..213c6891d0e23 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -27,8 +27,11 @@ namespace details { // training phase. struct TensorArrayBatchCleaner { TensorArrayBatchCleaner() { - valid_types_.insert(typeid(framework::Tensor)); - valid_types_.insert(typeid(framework::LoDTensor)); + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); } // Collect the variables that are not Tensor or LoDTensor, and reset them to a // bool(trick), because some of them are containers, and some operators just @@ -46,7 +49,7 @@ struct TensorArrayBatchCleaner { bool no_tensor_flag_{true}; std::vector arrays_; - std::unordered_set valid_types_; + std::unordered_set valid_types_; std::unordered_set no_tensor_vars_; }; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165c83..0c048738522ca 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -74,7 +74,7 @@ class AffineGridOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -184,7 +184,7 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) 
const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 855c4d7067739..49e734ce96b0d 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -64,7 +64,7 @@ class ClipByNormKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", - in_var->Type().name()); + framework::ToTypeName(in_var->Type())); } PADDLE_ENFORCE_NOT_NULL(input); diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc index ab25628d45699..5bcc597dec257 100644 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ b/paddle/fluid/operators/controlflow/parallel_do_op.cc @@ -92,7 +92,8 @@ inline void CopyOrShare(const framework::Variable &src, TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value()); } } else { - PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", + framework::ToTypeName(src.Type())); } } diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index e91d9ef776556..9b5eda17faecc 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -175,14 +175,13 @@ class WhileGradOp : public framework::OperatorBase { auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name), "Cannot find inside gradient %s", inside_og_name); - if (framework::IsType(og_outside.Type())) { + if (og_outside.IsType()) { auto &outside_tensor = og_outside.Get(); auto &inside_tensor = detail::Ref(og_inside.GetMutable()); 
inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); - } else if (framework::IsType( - og_outside.Type())) { + } else if (og_outside.IsType()) { auto &outside_array = og_outside.Get(); auto &inside_array = detail::Ref(og_inside.GetMutable()); @@ -256,7 +255,7 @@ class WhileGradOp : public framework::OperatorBase { var->IsType(), "Currently the type of var only can be LoDTensorArray, " "or LoDTensor, but the received var[%s] is %s.", - inside_grad_name, var->Type().name()); + inside_grad_name, framework::ToTypeName(var->Type())); if (var->IsType()) { auto &inside_tensor = var->Get(); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d2824953a3..c76bde99f4a0a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -84,7 +84,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -369,7 +369,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f2ba75485c587..fae0925149146 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -22,239 +22,6 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -struct CudnnRNNCache { - CudnnRNNCache() { - x_desc_ = NULL; - y_desc_ = NULL; - dx_desc_ = NULL; - dy_desc_ = NULL; - } - ~CudnnRNNCache() { release(); } - - cudnnRNNDescriptor_t rnn_desc_; - cudnnTensorDescriptor_t *x_desc_; - cudnnTensorDescriptor_t *y_desc_; - cudnnTensorDescriptor_t *dx_desc_; - cudnnTensorDescriptor_t *dy_desc_; - - cudnnTensorDescriptor_t hx_desc_; - cudnnTensorDescriptor_t cx_desc_; - cudnnTensorDescriptor_t hy_desc_; - cudnnTensorDescriptor_t cy_desc_; - - cudnnTensorDescriptor_t dhx_desc_; - cudnnTensorDescriptor_t dcx_desc_; - cudnnTensorDescriptor_t dhy_desc_; - cudnnTensorDescriptor_t dcy_desc_; - - cudnnTensorDescriptor_t output_x_desc_; - cudnnTensorDescriptor_t output_y_desc_; - - cudnnDropoutDescriptor_t dropout_desc_; - - size_t weights_size_; - cudnnFilterDescriptor_t w_desc_; - cudnnFilterDescriptor_t dw_desc_; - - size_t workspace_size_; - size_t reserve_size_; - Tensor reserve_data_; - Tensor workspace_data_; - - Tensor dropout_state_; - - size_t max_length_; - - float dropout_prob_; - bool is_bidirec_; - - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - int seed_; - - void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, - size_t max_len, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, bool is_bidirec, int seed, - int weight_numel) { - max_length_ = max_len; - batch_size_ = batch_size; - input_size_ = input_size; - hidden_size_ = hidden_size; - num_layers_ = num_layers; - dropout_prob_ = dropout_prob; - is_bidirec_ = is_bidirec; - seed_ = seed; - - x_desc_ = new 
cudnnTensorDescriptor_t[max_length_]; - y_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; - dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; - int dim_a[3]; - int stride_a[3]; - - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); - dim_a[0] = batch_size_; - dim_a[1] = input_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - dim_a[0] = batch_size_; - dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; - dim_a[2] = 1; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - } - - dim_a[0] = num_layers_ * (is_bidirec_ ? 
2 : 1); - dim_a[1] = batch_size_; - dim_a[2] = hidden_size_; - - stride_a[0] = dim_a[2] * dim_a[1]; - stride_a[1] = dim_a[2]; - stride_a[2] = 1; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); - - CUDNN_ENFORCE( - platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); - - size_t state_size; - CUDNN_ENFORCE( - platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); - dropout_state_.Resize({static_cast(state_size)})); - auto *dropout_state_data = - 
dropout_state_.mutable_data(ctx.GetPlace()); - CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( - dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, - seed_)); - - CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); - -#if CUDNN_VERSION >= 6000 - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); -#else - CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_DATA_FLOAT)); -#endif - - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); - - PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, - "cudnn lstm weight size should be SAME"); - int dim_w[3]; - dim_w[0] = weights_size_ / sizeof(float); - dim_w[1] = 1; - dim_w[2] = 1; - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( - dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); - - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); - CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); - - reserve_data_.Resize({static_cast(reserve_size_)}); - reserve_data_.mutable_data(ctx.GetPlace()); - - workspace_data_.Resize({static_cast(workspace_size_)}); - workspace_data_.mutable_data(ctx.GetPlace()); - 
} - - void release() { - for (size_t i = 0; i < max_length_; ++i) { - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); - } - - delete[] x_desc_; - delete[] y_desc_; - delete[] dx_desc_; - delete[] dy_desc_; - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); - - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); - - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); - CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); - } -}; - template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -315,9 +82,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { auto input_w_numel = w->numel(); auto batch_size = x->dims()[1]; - cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, - hidden_size, num_layers, dropout_prob, is_bidirec, - seed, input_w_numel); + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); } auto run_seq_len = x->dims()[0]; diff --git 
a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 0000000000000..7f18b839271a2 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor 
dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? 
hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + 
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + 
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc index 6fed9ba92c164..c35474e3aacee 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -171,8 +171,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, reinterpret_cast(slr->rows().data()), diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.h b/paddle/fluid/operators/distributed_ops/split_ids_op.h index acc9b1e622794..6676ecd1c85d7 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.h +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.h @@ -116,7 +116,7 @@ class SplitIdsOpKernel : public framework::OpKernel { } else { PADDLE_THROW( "% should be LoDTensor or SelectedRows, but the received type is %s", - ctx.Inputs("Ids")[0], ids_var->Type().name()); + 
ctx.Inputs("Ids")[0], framework::ToTypeName(ids_var->Type())); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index a8b8a67a114b9..7a7a3989c047a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -83,7 +83,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z = ctx.Output("Out"); } else { PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", - x_var->Type().name()); + framework::ToTypeName(x_var->Type())); } z->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 14a2524bd8f4a..be53a62cc9ccf 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -59,7 +59,7 @@ class GridSampleOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -155,7 +155,7 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h index 6c616aa03d980..3f51bb0b3d6dd 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ b/paddle/fluid/operators/optimizers/adadelta_op.h @@ -27,12 +27,14 @@ class AdadeltaOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be 
LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = diff --git a/paddle/fluid/operators/optimizers/adagrad_op.h b/paddle/fluid/operators/optimizers/adagrad_op.h index 9f6ef391696aa..13455fc42cdc7 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.h +++ b/paddle/fluid/operators/optimizers/adagrad_op.h @@ -50,7 +50,8 @@ class AdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto *param_out_tensor = ctx.Output("ParamOut"); auto *moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 3455d1ee54e8e..d8042e3614da0 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -235,7 +235,8 @@ class AdamOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h index 7137fbd9651b4..55d25ecbddf17 100644 --- 
a/paddle/fluid/operators/optimizers/adamax_op.h +++ b/paddle/fluid/operators/optimizers/adamax_op.h @@ -27,12 +27,14 @@ class AdamaxOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 5df43d33ef9f7..4abd436927707 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -27,12 +27,14 @@ class DecayedAdagradOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 8f812c9a037bf..bbf34d8316b09 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ 
b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -32,12 +32,14 @@ class FTRLOpKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); const auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE(grad_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Grad").front(), grad_var->Type().name()); + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 71f079e4d97f5..84955d3f04308 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -393,7 +393,7 @@ class MomentumOpKernel : public framework::OpKernel { PADDLE_THROW( string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " "gradient, but the received Variable Type is %s", - grad_var->Type().name())); + framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a9d303d55d8f6..975e4b8e7212b 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -60,7 +60,8 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE(param_var->IsType(), "The Var(%s)'s type should be LoDTensor, " "but the received is %s", - ctx.Inputs("Param").front(), param_var->Type().name()); + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 
5399ae556e7f3..6781cdf9f3448 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -92,7 +92,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -122,7 +122,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index bc889a5a042a2..ad37967f0ac3c 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -50,7 +50,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -157,7 +157,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc index f9a16ef35ecb9..c39f94637a1ab 100644 --- a/paddle/fluid/operators/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/sum_mkldnn_op.cc @@ -245,7 +245,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + 
framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 4f717a43551d6..01996e6bf9752 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -126,7 +126,7 @@ class SumOp : public framework::OperatorWithKernel { PADDLE_THROW("Cannot find the input data type by all input data"); } PADDLE_THROW("Unexpected branch. Input type is %s", - x_vars[0]->Type().name()); + framework::ToTypeName(x_vars[0]->Type())); } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 76cc796a9b8e2..a8b2df186dbfc 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -163,7 +163,7 @@ class SumKernel : public framework::OpKernel { } } else { PADDLE_THROW("Unexpected branch, output variable type is %s", - out_var->Type().name()); + framework::ToTypeName(out_var->Type())); } } }; diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index e2ae7caae1ebe..add03bad13dfd 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -51,7 +51,7 @@ class WarpCTCOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { + if (framework::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 61a25064d1799..74b0942379014 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -451,18 +450,6 @@ class ScopedActivationDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); }; -inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); -#ifdef PADDLE_WITH_CUDA - if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; - } -#endif - return use_cudnn; -} - #if CUDNN_VERSION >= 7001 class ScopedCTCLossDescriptor { public: From 53f6c6991aa749305bc585d067fa761579fcf995 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 19 Dec 2018 13:32:36 +0000 Subject: [PATCH 018/103] polish code test=develop --- paddle/fluid/framework/ddim.cc | 50 +++++++--------------------------- paddle/fluid/framework/ddim.h | 46 ++++++++++++++++++++++--------- paddle/fluid/framework/dim.h | 9 +++--- 3 files changed, 48 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 3640138e18059..f7fee04c1e273 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,32 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -template -struct DDimAssignFunctor { - static_assert(std::is_integral::value, "T must be integral type"); - using result_type = void; - explicit DDimAssignFunctor(const T* in) : in_(in) {} - - template - inline void operator()(Dim& dim) { // NOLINT - UnrollAssign::Run(in_, dim.data()); - } - - const T* in_; -}; - -DDim::DDim(const int* d, int n) : rank_(n) { - this->apply_visitor(DDimAssignFunctor(d)); -} - -DDim::DDim(const int64_t* d, int n) : rank_(n) { - this->apply_visitor(DDimAssignFunctor(d)); -} - template Dim make_dim(const int64_t* d) { Dim ret; - for (int i = 0; i < N; ++i) ret[i] = d[i]; + fix_dim_assign(d, ret.GetMutable()); return ret; } @@ -64,14 +42,14 @@ struct DDimEqualityVisitor { template inline bool operator()(const Dim& self) const { - return UnrollCompare::Run(self.data(), d_); + return UnrollCompare::Run(self.Get(), d_); } const int64_t* d_; }; bool DDim::operator==(const DDim& d) const { - return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.data())); + return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.Get())); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); } @@ -82,7 +60,7 @@ struct DDimPlusVisitor { template inline void operator()(Dim& self) const { - UnrollAdd::Run(d1_, d2_, self.data()); + UnrollAdd::Run(d1_, d2_, self.GetMutable()); } const int64_t* d1_; @@ -93,7 +71,7 @@ DDim DDim::operator+(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); DDim ret; ret.rank_ = rank_; - ret.apply_visitor(DDimPlusVisitor(data(), d.data())); + ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); return ret; } @@ -103,7 +81,7 @@ struct DDimMulVisitor { template inline void operator()(Dim& self) const { - UnrollMul::Run(d1_, d2_, self.data()); + UnrollMul::Run(d1_, d2_, self.GetMutable()); } const int64_t* d1_; @@ -114,7 +92,7 @@ DDim DDim::operator*(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); DDim ret; ret.rank_ = rank_; - 
ret.apply_visitor(DDimMulVisitor(data(), d.data())); + ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); return ret; } @@ -124,9 +102,7 @@ void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } // NOLINT std::vector vectorize(const DDim& ddim) { std::vector result(DDim::kMaxRank); - for (int i = 0; i < ddim.size(); ++i) { - result[i] = ddim[i]; - } + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); result.resize(ddim.size()); return result; } @@ -135,9 +111,7 @@ std::vector vectorize(const DDim& ddim) { // which does not fit cudnn inputs. std::vector vectorize2int(const DDim& ddim) { std::vector result(DDim::kMaxRank); - for (int i = 0; i < ddim.size(); ++i) { - result[i] = ddim[i]; - } + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); result.resize(ddim.size()); return result; } @@ -154,15 +128,11 @@ int64_t product(const DDim& ddim) { } DDim slice_ddim(const DDim& dim, int begin, int end) { - PADDLE_ENFORCE(begin < end, - "Begin index must be less than end index in ddim slice."); PADDLE_ENFORCE(begin >= 0, "Begin index can't be less than zero in ddim slice."); DDim ret; ret.rank_ = end - begin; - for (int i = 0; i < ret.rank_; ++i) { - ret[i] = dim[i + begin]; - } + dynamic_dim_assign(dim.Get() + begin, ret.GetMutable(), ret.rank_); return ret; } diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index bff710040eba2..e65d451cdef11 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -22,6 +22,29 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template +inline void dynamic_dim_assign(const T1* in, T2* out, int n) { +#define STATIC_DIM_ASSIGN_CASE(rank) \ + case rank: \ + static_dim_assign(in, out); \ + return + switch (n) { + STATIC_DIM_ASSIGN_CASE(0); + STATIC_DIM_ASSIGN_CASE(1); + STATIC_DIM_ASSIGN_CASE(2); + STATIC_DIM_ASSIGN_CASE(3); + STATIC_DIM_ASSIGN_CASE(4); + STATIC_DIM_ASSIGN_CASE(5); + STATIC_DIM_ASSIGN_CASE(6); + STATIC_DIM_ASSIGN_CASE(7); + STATIC_DIM_ASSIGN_CASE(8); + STATIC_DIM_ASSIGN_CASE(9); + default: + PADDLE_THROW("Invalid rank %d", n); + } +#undef STATIC_DIM_ASSIGN_CASE +} + /** * \brief A dynamically sized dimension. * @@ -33,8 +56,13 @@ class DDim { DDim() : rank_(1) { dim_[0] = 0; } - DDim(const int* d, int n); - DDim(const int64_t* d, int n); + DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } template /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT @@ -81,19 +109,11 @@ class DDim { DDim operator*(const DDim& d) const; - // Make DDim act like std::vector - using iterator = int64_t*; - using const_iterator = const int64_t*; - - int64_t* data() { return dim_.data(); } - const int64_t* data() const { return dim_.data(); } + inline const int64_t* Get() const { return dim_.Get(); } - iterator begin() { return data(); } - const_iterator begin() const { return data(); } - iterator end() { return data() + rank_; } - const_iterator end() const { return data() + rank_; } + inline int64_t* GetMutable() { return dim_.GetMutable(); } - int size() const { return rank_; } + inline int size() const { return rank_; } private: template diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 3ae60a3119e1b..21d91167a4475 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -54,10 +54,6 @@ class Dim : public Array { HOSTDEVICE Dim() = default; - HOSTDEVICE 
int64_t* data() { return this->GetMutable(); } - - HOSTDEVICE const int64_t* data() const { return this->Get(); } - HOST std::string to_string() const; }; @@ -283,5 +279,10 @@ HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { return result; } +template +inline void static_dim_assign(const T1* in, T2* out) { + UnrollAssign::Run(in, out); +} + } // namespace framework } // namespace paddle From ce4a26ddad08a9d640f1ec3ddae254d0d0abd004 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 19 Dec 2018 12:23:11 +0000 Subject: [PATCH 019/103] clean code try to fix mac compile bug? test=develop --- paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/var_type_traits.cc | 53 +++++++++++++++++- paddle/fluid/framework/var_type_traits.h | 55 +------------------ .../fluid/framework/var_type_traits_test.cc | 30 +++++++--- 4 files changed, 77 insertions(+), 66 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b6372a2ef5934..d0beb8361c206 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -83,10 +83,7 @@ cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) if (WITH_GPU) - target_link_libraries(var_type_traits cudnn) - if (NOT WIN32) - target_link_libraries(var_type_traits nccl) - endif() + target_link_libraries(var_type_traits dynload_cuda) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 0171df6f7389d..c9f9f8d6c6537 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -17,9 +17,58 @@ namespace paddle { namespace framework { -const char* ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } +// Besides registering variable type id, 
it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { -const std::type_index& ToTypeIndex(int var_id) { +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) { + using Type = + typename std::tuple_element::type; + constexpr int kId = VarTypeTrait::kId; + if (!std::is_same::value) { + m->emplace(kId, std::type_index(typeid(Type))); + } + VarIdToTypeIndexMapInitializerImpl::Init(m); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + static void Init(std::unordered_map *m) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + public: + static const std::type_index &ToTypeIndex(int var_id) { + static const VarIdToTypeIndexMapHolder instance; + auto it = instance.var_type_map_.find(var_id); + PADDLE_ENFORCE(it != instance.var_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + } + std::unordered_map var_type_map_; +}; + +} // namespace detail + +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +const std::type_index &ToTypeIndex(int var_id) { return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 88f917e74fc14..c5e0d4707efe2 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -40,6 +40,9 @@ namespace paddle { namespace framework { +const char *ToTypeName(int var_id); +const std::type_index &ToTypeIndex(int var_id); + namespace detail { template std::type_index (for 
example, get var names according to id) -namespace detail { - -template -struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) { - using Type = - typename std::tuple_element::type; - constexpr int kId = VarTypeTrait::kId; - if (!std::is_same::value) { - m->emplace(kId, std::type_index(typeid(Type))); - } - VarIdToTypeIndexMapInitializerImpl::Init(m); - } -}; - -template -struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) {} -}; - -// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> -// std::type_index map -using VarIdToTypeIndexMapInitializer = - VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, - VarTypeRegistry::kRegisteredTypeNum == - 0>; - -struct VarIdToTypeIndexMapHolder { - public: - static const std::type_index &ToTypeIndex(int var_id) { - static const VarIdToTypeIndexMapHolder instance; - auto it = instance.var_type_map_.find(var_id); - PADDLE_ENFORCE(it != instance.var_type_map_.end(), - "VarId %d is not registered.", var_id); - return it->second; - } - - private: - VarIdToTypeIndexMapHolder() { - VarIdToTypeIndexMapInitializer::Init(&var_type_map_); - } - std::unordered_map var_type_map_; -}; - -} // namespace detail - -const char *ToTypeName(int var_id); -const std::type_index &ToTypeIndex(int var_id); - template inline constexpr bool IsRegisteredVarType() { return VarTypeRegistry::IsRegistered(); diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 09fab719c1674..f46608233ab90 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -15,32 +15,46 @@ #include "paddle/fluid/framework/var_type_traits.h" #include #include +#include namespace paddle { namespace framework { template struct TypeIndexChecker { - static void Check() { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { using Type = 
typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; if (!std::is_same::value) { - EXPECT_TRUE(ToTypeIndex(VarTypeTrait::kId) == typeid(Type)); - EXPECT_TRUE(std::string(ToTypeName(VarTypeTrait::kId)) == - typeid(Type).name()); + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); } - TypeIndexChecker::Check(); + TypeIndexChecker::Check(var_id_set, + type_index_set); } }; template struct TypeIndexChecker { - static void Check() {} + template + static void Check(SetType1 *, SetType2 *) {} }; -TEST(var_type_traits, check_type_index) { +TEST(var_type_traits, check_no_duplicate_registry) { constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; - TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check(); + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); } template From 454db6662e15234df8f0765c098d171e75d5ec1a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 20 Dec 2018 00:56:05 +0800 Subject: [PATCH 020/103] Accelerate lstm --- paddle/fluid/operators/math/concat_and_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 760a065c1081d..930d851696efd 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -180,7 +180,7 @@ class ConcatFunctor { } // Wait() must be called because `inputs_data` may be destructed before // kernel ends - 
context.Wait(); + /* context.Wait(); */ } }; @@ -258,7 +258,7 @@ class SplitFunctor { } // Wait() must be called because `outputs_data` may be destructed before // kernel ends - context.Wait(); + /* context.Wait(); */ } }; From 83ac85158a736b337e5983668da0ad136e46fe64 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 2018 01:58:48 +0000 Subject: [PATCH 021/103] polish code test=develop --- paddle/fluid/framework/ddim.cc | 26 ++--- paddle/fluid/framework/ddim.h | 103 +++++++----------- .../fluid/operators/detail/strided_memcpy.h | 4 +- 3 files changed, 48 insertions(+), 85 deletions(-) diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index f7fee04c1e273..033d780faad26 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -18,13 +18,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template -Dim make_dim(const int64_t* d) { - Dim ret; - fix_dim_assign(d, ret.GetMutable()); - return ret; -} - DDim make_ddim(std::initializer_list dims) { return DDim(dims.begin(), dims.size()); } @@ -69,8 +62,7 @@ struct DDimPlusVisitor { DDim DDim::operator+(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); - DDim ret; - ret.rank_ = rank_; + DDim ret(rank_); ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); return ret; } @@ -90,8 +82,7 @@ struct DDimMulVisitor { DDim DDim::operator*(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); - DDim ret; - ret.rank_ = rank_; + DDim ret(rank_); ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); return ret; } @@ -118,7 +109,7 @@ std::vector vectorize2int(const DDim& ddim) { struct ProductVisitor { template - int64_t operator()(const Dim& dim) { + inline int64_t operator()(const Dim& dim) { return product(dim); } }; @@ -130,8 +121,7 @@ int64_t product(const DDim& ddim) { DDim slice_ddim(const DDim& dim, int begin, int end) { PADDLE_ENFORCE(begin >= 0, "Begin index can't be less than zero in ddim slice."); - DDim ret; - ret.rank_ = end - 
begin; + DDim ret(end - begin); dynamic_dim_assign(dim.Get() + begin, ret.GetMutable(), ret.rank_); return ret; } @@ -166,8 +156,7 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim stride(const DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); + DDim strides(ddim.size()); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; @@ -175,9 +164,8 @@ DDim stride(const DDim& ddim) { return strides; } -DDim stride_numel(const framework::DDim& ddim) { - DDim strides; - strides.rank_ = ddim.size(); +DDim stride_numel(const DDim& ddim) { + DDim strides(ddim.size()); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index e65d451cdef11..36ad90a2ae48b 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -22,27 +22,31 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ + case (rank): { \ + constexpr auto kRank = (rank); \ + return (callback); \ + } + +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW("Invalid rank %d", rank); \ + } + template inline void dynamic_dim_assign(const T1* in, T2* out, int n) { -#define STATIC_DIM_ASSIGN_CASE(rank) \ - case rank: \ - static_dim_assign(in, out); \ - return - switch (n) { - STATIC_DIM_ASSIGN_CASE(0); - STATIC_DIM_ASSIGN_CASE(1); - STATIC_DIM_ASSIGN_CASE(2); - STATIC_DIM_ASSIGN_CASE(3); - STATIC_DIM_ASSIGN_CASE(4); - STATIC_DIM_ASSIGN_CASE(5); - STATIC_DIM_ASSIGN_CASE(6); - STATIC_DIM_ASSIGN_CASE(7); - STATIC_DIM_ASSIGN_CASE(8); - STATIC_DIM_ASSIGN_CASE(9); - default: - PADDLE_THROW("Invalid rank %d", n); - } -#undef STATIC_DIM_ASSIGN_CASE + PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); } /** @@ -84,22 +88,26 @@ class DDim { inline int64_t operator[](int idx) const { return dim_[idx]; } inline int64_t& at(int idx) { - PADDLE_ENFORCE(idx >= 0 && idx < rank_); + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); return dim_[idx]; } inline int64_t at(int idx) const { - PADDLE_ENFORCE(idx >= 0 && idx < rank_); + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); return dim_[idx]; } template typename std::result_of&)>::type apply_visitor( - Visitor&& visitor); + Visitor&& visitor) { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } template typename std::result_of&)>::type apply_visitor( - Visitor&& visitor) const; + 
Visitor&& visitor) const { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } bool operator==(const DDim& d) const; @@ -128,55 +136,22 @@ class DDim { return *reinterpret_cast*>(p); } + // Construct DDim with given rank + // Only used in friend functions + explicit DDim(int rank) : rank_(rank) { + PADDLE_ENFORCE(rank_ >= 0 && rank_ < kMaxRank, "Invalid rank %d", rank); + } + friend DDim slice_ddim(const DDim& dim, int begin, int end); friend DDim stride(const DDim& ddim); friend DDim stride_numel(const DDim& ddim); + private: Dim dim_; int rank_; }; -#define PADDLE_VISIT_DDIM(rank) \ - case rank: \ - return visitor(UnsafeCast()) - -template -typename std::result_of&)>::type DDim::apply_visitor( - Visitor&& visitor) { - switch (rank_) { - PADDLE_VISIT_DDIM(0); - PADDLE_VISIT_DDIM(1); - PADDLE_VISIT_DDIM(2); - PADDLE_VISIT_DDIM(3); - PADDLE_VISIT_DDIM(4); - PADDLE_VISIT_DDIM(5); - PADDLE_VISIT_DDIM(6); - PADDLE_VISIT_DDIM(7); - PADDLE_VISIT_DDIM(8); - PADDLE_VISIT_DDIM(9); - default: - PADDLE_THROW("Invalid rank %d", rank_); - } -} - -template -typename std::result_of&)>::type DDim::apply_visitor( - Visitor&& visitor) const { - switch (rank_) { - PADDLE_VISIT_DDIM(0); - PADDLE_VISIT_DDIM(1); - PADDLE_VISIT_DDIM(2); - PADDLE_VISIT_DDIM(3); - PADDLE_VISIT_DDIM(4); - PADDLE_VISIT_DDIM(5); - PADDLE_VISIT_DDIM(6); - PADDLE_VISIT_DDIM(7); - PADDLE_VISIT_DDIM(8); - PADDLE_VISIT_DDIM(9); - default: - PADDLE_THROW("Invalid rank %d", rank_); - } -} +#undef PADDLE_VISIT_DDIM_BASE #undef PADDLE_VISIT_DDIM /** diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index fc223ce55931e..94419d1f9a4ba 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -98,8 +98,8 @@ struct StridedCopyDimVisitor { template void operator()(const framework::Dim& dst_dim) const { StridedMemcpyFunctor functor; - functor(dev_ctx_, src_, src_stride_.data(), dst_dim.data(), - 
dst_stride_.data(), dst_); + functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(), + dst_); } const platform::DeviceContext& dev_ctx_; From 13429c3e9f92877ca8c282e1cae2d752a506b7ac Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 2018 02:56:11 +0000 Subject: [PATCH 022/103] clean code, remove void registration test why MAC CI fail again test=develop --- paddle/fluid/framework/var_type_traits.cc | 58 ++++++++++++++----- paddle/fluid/framework/var_type_traits.h | 33 ++++++----- .../fluid/framework/var_type_traits_test.cc | 33 +++++++---- 3 files changed, 83 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index c9f9f8d6c6537..690c4895c1df3 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace framework { @@ -23,54 +24,83 @@ namespace detail { template struct VarIdToTypeIndexMapInitializerImpl { - static void Init(std::unordered_map *m) { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { using Type = typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); constexpr int kId = VarTypeTrait::kId; - if (!std::is_same::value) { - m->emplace(kId, std::type_index(typeid(Type))); - } + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); VarIdToTypeIndexMapInitializerImpl::Init(m); + kStart + 1 == kEnd>::Init(id_to_type, + type_to_id); } }; template struct VarIdToTypeIndexMapInitializerImpl { 
- static void Init(std::unordered_map *m) {} + template + static void Init(MapType1 *, MapType2 *) {} }; // VarIdToTypeIndexMapInitializer is designed to initialize var_id -> -// std::type_index map +// std::type_index map and std::type_index -> var_id map using VarIdToTypeIndexMapInitializer = VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, VarTypeRegistry::kRegisteredTypeNum == 0>; struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + public: static const std::type_index &ToTypeIndex(int var_id) { - static const VarIdToTypeIndexMapHolder instance; - auto it = instance.var_type_map_.find(var_id); - PADDLE_ENFORCE(it != instance.var_type_map_.end(), + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), "VarId %d is not registered.", var_id); return it->second; } + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + private: VarIdToTypeIndexMapHolder() { - VarIdToTypeIndexMapInitializer::Init(&var_type_map_); + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; } - std::unordered_map var_type_map_; + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; }; } // namespace detail -const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } - const std::type_index &ToTypeIndex(int var_id) { return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); } +const char *ToTypeName(int var_id) { return ToTypeIndex(var_id).name(); } + +int ToTypeId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + } // namespace framework } // namespace paddle 
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index c5e0d4707efe2..a58414c3d4e56 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -42,6 +42,7 @@ namespace framework { const char *ToTypeName(int var_id); const std::type_index &ToTypeIndex(int var_id); +int ToTypeId(const std::type_index &type); namespace detail { @@ -75,10 +76,10 @@ struct VarTypeRegistryImpl { using ArgTuple = std::tuple; // TypePos() returns the position in which T is inside Args... - // If T is not inside Args... or T is void, return -1 + // If T is not inside Args..., return -1 template static constexpr int TypePos() { - return std::is_same::value ? -1 : TypePosFinder::kPos; + return TypePosFinder::kPos; } // IsRegistered() returns whether T is registered inside RegistryImpl @@ -90,19 +91,22 @@ struct VarTypeRegistryImpl { } // namespace detail -#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ - template <> \ - struct VarTypeTrait { \ - static_assert(VarTypeRegistry::IsRegistered(), \ - "Must be registered type"); \ - using Type = type; \ - static constexpr int kId = proto_id; \ +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ } /** * The following codes are designed to register variable types. * Only registered types can be stored in Variable. * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. */ // Users should add other variable types below. 
@@ -110,10 +114,9 @@ struct VarTypeRegistryImpl { class Scope; using VarTypeRegistry = detail::VarTypeRegistryImpl< - LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, - platform::PlaceList, ReaderHolder, Tensor, std::string, Scope *, + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, std::map, operators::reader::LoDTensorBlockingQueueHolder, - int, float, #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 ncclUniqueId, platform::Communicator, @@ -123,13 +126,11 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< operators::AlgorithmsCache, operators::CudnnRNNCache, #endif - void>; // void indicates end of registration, add other types before void + int, float>; template struct VarTypeTrait { - static_assert(std::is_same::value || - VarTypeRegistry::IsRegistered(), - "Must be registered type"); + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); using Type = T; // Default id generation static constexpr int kId = VarTypeRegistry::TypePos() + diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index f46608233ab90..4dad4cb27b772 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/var_type_traits.h" #include #include +#include #include namespace paddle { @@ -29,15 +30,27 @@ struct TypeIndexChecker { static_assert(std::is_same::Type, Type>::value, "Type must be the same"); constexpr auto kId = VarTypeTrait::kId; - if (!std::is_same::value) { - std::type_index actual_type(typeid(Type)); - EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); - EXPECT_EQ(ToTypeIndex(kId), actual_type); - EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT - EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT - var_id_set->insert(kId); - 
type_index_set->insert(std::type_index(typeid(Type))); + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + // For some reasons, comparing std::type_index using EXPECT_EQ would fail + // in MAC CI + bool is_same_type_index = (ToTypeIndex(kId) == actual_type); + if (!is_same_type_index) { + std::string s1 = ToTypeName(kId); + std::string s2 = actual_type.name(); + PADDLE_THROW("Step %d: type %s is not the same as %s, var_id %d", kPos, + s1.c_str(), s2.c_str(), kId); } + EXPECT_TRUE(is_same_type_index); + EXPECT_TRUE(ToTypeId(actual_type) == kId); // NOLINT + is_same_type_index = (ToTypeIndex(ToTypeId(actual_type)) == actual_type); + EXPECT_TRUE(is_same_type_index); + EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); TypeIndexChecker::Check(var_id_set, type_index_set); } @@ -75,13 +88,11 @@ TEST(var_type_traits, check_proto_type_id) { } TEST(var_type_traits, test_registry) { - using Registry = - detail::VarTypeRegistryImpl; + using Registry = detail::VarTypeRegistryImpl; ASSERT_TRUE(Registry::TypePos() == 0); ASSERT_TRUE(Registry::TypePos() == 1); ASSERT_TRUE(Registry::TypePos() == 2); ASSERT_TRUE(Registry::TypePos() == 3); - ASSERT_TRUE(Registry::TypePos() == -1); ASSERT_TRUE(Registry::TypePos() == -1); } From 89b9d86d9d676a756357341fa9d8b0d1efec2f48 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 2018 10:43:44 +0000 Subject: [PATCH 023/103] fix windows compile bug test=develop --- paddle/fluid/framework/ddim.cc | 16 +++++++++++----- paddle/fluid/framework/ddim.h | 6 ------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 033d780faad26..95078093e5905 100644 --- a/paddle/fluid/framework/ddim.cc +++ 
b/paddle/fluid/framework/ddim.cc @@ -62,7 +62,8 @@ struct DDimPlusVisitor { DDim DDim::operator+(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); - DDim ret(rank_); + DDim ret; + ret.rank_ = rank_; ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); return ret; } @@ -82,7 +83,8 @@ struct DDimMulVisitor { DDim DDim::operator*(const DDim& d) const { PADDLE_ENFORCE(rank_ == d.rank_); - DDim ret(rank_); + DDim ret; + ret.rank_ = rank_; ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); return ret; } @@ -121,7 +123,9 @@ int64_t product(const DDim& ddim) { DDim slice_ddim(const DDim& dim, int begin, int end) { PADDLE_ENFORCE(begin >= 0, "Begin index can't be less than zero in ddim slice."); - DDim ret(end - begin); + int len = end - begin; + DDim ret; + ret.rank_ = len; dynamic_dim_assign(dim.Get() + begin, ret.GetMutable(), ret.rank_); return ret; } @@ -156,7 +160,8 @@ DDim flatten_to_2d(const DDim& src, int num_col_dims) { DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } DDim stride(const DDim& ddim) { - DDim strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = 1; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i + 1]; @@ -165,7 +170,8 @@ DDim stride(const DDim& ddim) { } DDim stride_numel(const DDim& ddim) { - DDim strides(ddim.size()); + DDim strides; + strides.rank_ = ddim.size(); strides[ddim.size() - 1] = ddim[ddim.size() - 1]; for (int i = ddim.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * ddim[i]; diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 36ad90a2ae48b..0d7b12120525b 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -136,12 +136,6 @@ class DDim { return *reinterpret_cast*>(p); } - // Construct DDim with given rank - // Only used in friend functions - explicit DDim(int rank) : rank_(rank) { - PADDLE_ENFORCE(rank_ >= 0 && rank_ < kMaxRank, "Invalid rank %d", rank); - } - friend 
DDim slice_ddim(const DDim& dim, int begin, int end); friend DDim stride(const DDim& ddim); friend DDim stride_numel(const DDim& ddim); From 7f6e513b1fa798745d7cb918bd7a56d66607aed3 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 20 Dec 2018 12:21:51 +0000 Subject: [PATCH 024/103] fix mac ci bug make forward declaration test=develop --- paddle/fluid/framework/var_type_traits.cc | 13 ++++++ paddle/fluid/framework/var_type_traits.h | 43 +++++++++++++++---- .../fluid/framework/var_type_traits_test.cc | 31 +++++++------ 3 files changed, 64 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 690c4895c1df3..c3c5bab23b92a 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,7 +13,20 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index a58414c3d4e56..b51b4933e6c9e 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -20,23 +20,48 @@ #include #include #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/reader.h" -#include 
"paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA +#include #ifndef _WIN32 #include -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#include -#include "paddle/fluid/operators/conv_cudnn_op_cache.h" -#include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { +template +class AlgorithmsCache; + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 4dad4cb27b772..1c7d9f2abed20 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -12,12 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/var_type_traits.h" #include #include #include #include +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + namespace paddle { namespace framework { @@ -32,19 +45,9 @@ struct TypeIndexChecker { constexpr auto kId = VarTypeTrait::kId; std::type_index actual_type(typeid(Type)); EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); - // For some reasons, comparing std::type_index using EXPECT_EQ would fail - // in MAC CI - bool is_same_type_index = (ToTypeIndex(kId) == actual_type); - if (!is_same_type_index) { - std::string s1 = ToTypeName(kId); - std::string s2 = actual_type.name(); - PADDLE_THROW("Step %d: type %s is not the same as %s, var_id %d", kPos, - s1.c_str(), s2.c_str(), kId); - } - EXPECT_TRUE(is_same_type_index); - EXPECT_TRUE(ToTypeId(actual_type) == kId); // NOLINT - is_same_type_index = (ToTypeIndex(ToTypeId(actual_type)) == actual_type); - EXPECT_TRUE(is_same_type_index); + EXPECT_EQ(ToTypeIndex(kId), actual_type); + EXPECT_EQ(ToTypeId(actual_type), kId); + EXPECT_EQ(ToTypeIndex(ToTypeId(actual_type)), actual_type); EXPECT_EQ(ToTypeId(ToTypeIndex(kId)), kId); EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT From 600f6d8272630a946a69d3d3a040f744ccd76151 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 21 Dec 2018 02:53:05 +0000 Subject: [PATCH 025/103] polish code test=develop --- paddle/fluid/framework/ddim.cc | 30 ++++----- paddle/fluid/framework/ddim.h | 14 ++-- paddle/fluid/framework/dim.h | 116 
++++++++++++++++----------------- 3 files changed, 79 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 95078093e5905..37544e97eb655 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -42,7 +42,8 @@ struct DDimEqualityVisitor { }; bool DDim::operator==(const DDim& d) const { - return rank_ == d.rank_ && this->apply_visitor(DDimEqualityVisitor(d.Get())); + return size() == d.size() && + this->apply_visitor(DDimEqualityVisitor(d.Get())); } bool DDim::operator!=(const DDim& d) const { return !(*this == d); } @@ -61,7 +62,7 @@ struct DDimPlusVisitor { }; DDim DDim::operator+(const DDim& d) const { - PADDLE_ENFORCE(rank_ == d.rank_); + PADDLE_ENFORCE(size() == d.size()); DDim ret; ret.rank_ = rank_; ret.apply_visitor(DDimPlusVisitor(Get(), d.Get())); @@ -82,7 +83,7 @@ struct DDimMulVisitor { }; DDim DDim::operator*(const DDim& d) const { - PADDLE_ENFORCE(rank_ == d.rank_); + PADDLE_ENFORCE(size() == d.size()); DDim ret; ret.rank_ = rank_; ret.apply_visitor(DDimMulVisitor(Get(), d.Get())); @@ -121,13 +122,11 @@ int64_t product(const DDim& ddim) { } DDim slice_ddim(const DDim& dim, int begin, int end) { - PADDLE_ENFORCE(begin >= 0, - "Begin index can't be less than zero in ddim slice."); - int len = end - begin; - DDim ret; - ret.rank_ = len; - dynamic_dim_assign(dim.Get() + begin, ret.GetMutable(), ret.rank_); - return ret; + PADDLE_ENFORCE(begin >= 0 && end <= dim.size(), + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", + begin, end, dim.size()); + // Constructor of DDim would check whether end - begin is valid + return DDim(dim.Get() + begin, end - begin); } int arity(const DDim& d) { return d.size(); } @@ -138,8 +137,8 @@ struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} - template - void operator()(const T& t) { + template + void operator()(const Dim& t) { os << t; } }; @@ -152,12 +151,11 @@ std::ostream& 
operator<<(std::ostream& os, const DDim& ddim) { } DDim flatten_to_2d(const DDim& src, int num_col_dims) { - int rank = src.size(); - return make_ddim({product(slice_ddim(src, 0, num_col_dims)), - product(slice_ddim(src, num_col_dims, rank))}); + return DDim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, src.size()))}); } -DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); } +DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } DDim stride(const DDim& ddim) { DDim strides; diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 0d7b12120525b..452072a58762d 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -124,16 +124,16 @@ class DDim { inline int size() const { return rank_; } private: - template - inline Dim& UnsafeCast() { - return const_cast&>(const_cast(this)->UnsafeCast()); + template + inline Dim& UnsafeCast() { + return const_cast&>(const_cast(this)->UnsafeCast()); } - template - inline const Dim& UnsafeCast() const { - static_assert(M >= 0 && M <= kMaxRank, "Invalid rank"); + template + inline const Dim& UnsafeCast() const { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); auto* p = static_cast(&dim_); - return *reinterpret_cast*>(p); + return *reinterpret_cast*>(p); } friend DDim slice_ddim(const DDim& dim, int begin, int end); diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 21d91167a4475..88aee8379d835 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -28,17 +28,17 @@ namespace paddle { namespace framework { // Statically sized, statically indexed dimension -template -class Dim : public Array { +template +class Dim : public Array { public: - static_assert(N >= 0, "N must be not less than 0"); + static_assert(D >= 0, "D must be not less than 0"); - static constexpr int kRank = N; - using BaseClass = Array; + static constexpr int kRank = D; + using BaseClass 
= Array; - inline Dim(int64_t head, const Dim& tail) { + inline Dim(int64_t head, const Dim& tail) { (*this)[0] = head; - new (this->GetMutable() + 1) Dim(tail); + new (this->GetMutable() + 1) Dim(tail); } template @@ -47,7 +47,7 @@ class Dim : public Array { /** Construct a Dim from a linear index and size. Uses Fortran order * indexing. */ - HOSTDEVICE Dim(int64_t idx, const Dim& size); + HOSTDEVICE Dim(int64_t idx, const Dim& size); /** Construct a Dim with each dimension set to the given index */ HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } @@ -77,42 +77,42 @@ struct FortranOrderIndexingConstructorFunctor { }; } // namespace detail -template -HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { - detail::FortranOrderIndexingConstructorFunctor<0, N, N == 0>::Run( +template +HOSTDEVICE Dim::Dim(int64_t idx, const Dim& size) { + detail::FortranOrderIndexingConstructorFunctor<0, D, D == 0>::Run( size.Get(), &idx, this->GetMutable()); } -template -HOSTDEVICE inline int64_t get(const Dim& dim) { +template +HOSTDEVICE inline int64_t get(const Dim& dim) { return dim[idx]; } -template -HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT +template +HOSTDEVICE inline int64_t& get(Dim& dim) { // NOLINT return dim[idx]; } -template -HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { +template +HOSTDEVICE inline int64_t get(const Dim& dim, int idx) { return dim[idx]; } -template -HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT +template +HOSTDEVICE inline int64_t& get(Dim& dim, int idx) { // NOLINT return dim[idx]; } // Dot product of two dims -template -HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { - return UnrollProduct::Run(a.Get(), b.Get()); +template +HOSTDEVICE inline int64_t linearize(const Dim& a, const Dim& b) { + return UnrollProduct::Run(a.Get(), b.Get()); } // Product of a Dim -template -HOSTDEVICE inline int64_t product(const Dim& a) { - return UnrollProduct::Run(a.Get()); +template +HOSTDEVICE inline int64_t 
product(const Dim& a) { + return UnrollProduct::Run(a.Get()); } // Is 0 <= idx_i < size_i for all i? @@ -135,9 +135,9 @@ struct ContainedFunctor { }; } // namespace detail -template -HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { - return detail::ContainedFunctor<0, N, N == 0>::Run(idx.Get(), size.Get()); +template +HOSTDEVICE inline bool contained(const Dim& idx, const Dim& size) { + return detail::ContainedFunctor<0, D, D == 0>::Run(idx.Get(), size.Get()); } /** @@ -160,40 +160,40 @@ struct ExPrefixMulFunctor { }; } // namespace detail -template -HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { - Dim ret; - detail::ExPrefixMulFunctor<0, N, N == 0>::Run(src.Get(), ret.GetMutable()); +template +HOSTDEVICE inline Dim ex_prefix_mul(const Dim& src) { + Dim ret; + detail::ExPrefixMulFunctor<0, D, D == 0>::Run(src.Get(), ret.GetMutable()); return ret; } /** * Add two dimensions together */ -template -HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { - Dim ret; - UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); +template +HOSTDEVICE inline Dim dim_plus(const Dim& a, const Dim& b) { + Dim ret; + UnrollAdd::Run(a.Get(), b.Get(), ret.GetMutable()); return ret; } -template -HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE inline Dim operator+(const Dim& lhs, const Dim& rhs) { return dim_plus(lhs, rhs); } /** * Multiply two dimensions together */ -template -HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { - Dim ret; - UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); +template +HOSTDEVICE inline Dim dim_mult(const Dim& a, const Dim& b) { + Dim ret; + UnrollMul::Run(a.Get(), b.Get(), ret.GetMutable()); return ret; } -template -HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { +template +HOSTDEVICE Dim operator*(const Dim& lhs, const Dim& rhs) { return dim_mult(lhs, rhs); } @@ -224,10 +224,10 @@ struct NormalizeStridesFunctor { }; } // namespace detail -template -HOSTDEVICE Dim 
normalize_strides(const Dim& size, const Dim& stride) { - Dim ret; - detail::NormalizeStridesFunctor<0, N, N == 0>::Run(size.Get(), stride.Get(), +template +HOSTDEVICE Dim normalize_strides(const Dim& size, const Dim& stride) { + Dim ret; + detail::NormalizeStridesFunctor<0, D, D == 0>::Run(size.Get(), stride.Get(), ret.GetMutable()); return ret; } @@ -245,10 +245,10 @@ HOSTDEVICE inline Dim make_dim(Args... idxes) { } // Allows us to output a Dim -template -inline std::ostream& operator<<(std::ostream& os, const Dim& d) { +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { os << d[0]; - for (int i = 1; i < N; ++i) { + for (int i = 1; i < D; ++i) { os << ", " << d[i]; } return os; @@ -258,23 +258,23 @@ inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { return os; } -template -HOST std::string Dim::to_string() const { +template +HOST std::string Dim::to_string() const { std::stringstream stream; stream << *this; return stream.str(); } -template -HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { - Dim result; +template +HOSTDEVICE Dim linear_to_dimension(int linear_index, const Dim& extents) { + Dim result; - for (int i = 0; i < N - 1; ++i) { + for (int i = 0; i < D - 1; ++i) { result[i] = linear_index % extents[i]; linear_index /= extents[i]; } - result[N - 1] = linear_index; + result[D - 1] = linear_index; return result; } From 6fabbd8fb801a1b9aeea20821515deed04949faa Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 14:02:51 +0800 Subject: [PATCH 026/103] Polish code and remove spin lock test=develop --- .../scope_buffered_ssa_graph_executor.cc | 8 ++- paddle/fluid/framework/rw_lock.h | 10 +-- paddle/fluid/framework/spin_lock.h | 71 ------------------- 3 files changed, 13 insertions(+), 76 deletions(-) delete mode 100644 paddle/fluid/framework/spin_lock.h diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc 
b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ea783c609081c..22bf0d308b2de 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -74,12 +74,18 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + for (auto &scope : local_scopes_) { auto &local_scope = *scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } + + drop_scope_counter_ = 0; } if (eptr) { diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index 75e6bef9bf33d..f8aa87519a2fc 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -69,12 +69,13 @@ class AutoWRLock { public: explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + ~AutoWRLock() { UnLock(); } + + private: inline void Lock() { lock_->WRLock(); } inline void UnLock() { lock_->UNLock(); } - ~AutoWRLock() { UnLock(); } - private: RWLock* lock_; }; @@ -83,12 +84,13 @@ class AutoRDLock { public: explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + ~AutoRDLock() { UnLock(); } + + private: inline void Lock() { lock_->RDLock(); } inline void UnLock() { lock_->UNLock(); } - ~AutoRDLock() { UnLock(); } - private: RWLock* lock_; }; diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h deleted file mode 100644 index 11a763d655abb..0000000000000 --- a/paddle/fluid/framework/spin_lock.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct SpinLock { - SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); } - - ~SpinLock() { pthread_spin_destroy(&lock_); } - - void Lock() { - PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed"); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0, - "release spin lock failed"); - } - - private: - pthread_spinlock_t lock_; -}; -#else -// FIXME(minqiyang): use mutex here to do fake spin lock -struct SpinLock { - void Lock() { mutex_.lock(); } - - void Unlock() { mutex_.lock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoSpinLock { - public: - explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) { - lock_->Lock(); - } - - ~SpinLockGuard() { lock_->Unlock(); } - - private: - SpinLock* lock_; -}; - -} // namespace framework -} // namespace paddle From 0a4b6fc0561c1b3f1b5610b2d161c837dc4b8a0e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 14:12:24 +0800 Subject: [PATCH 027/103] Remove unnessesary code test=develop --- CMakeLists.txt | 2 +- cmake/external/robin_map.cmake | 31 ------ paddle/fluid/framework/CMakeLists.txt | 2 +- .../framework/details/execution_strategy.h | 2 +- .../scope_buffered_ssa_graph_executor.cc | 11 +- paddle/fluid/framework/ir/graph.cc | 65 +++-------- paddle/fluid/framework/rw_lock.h | 101 ++++++++++++------ paddle/fluid/framework/scope.cc | 
51 ++++----- paddle/fluid/framework/scope.h | 29 +---- paddle/fluid/framework/spin_lock.h | 71 ------------ .../fluid/operators/math/concat_and_split.cu | 4 +- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/fluid/profiler.py | 1 - 13 files changed, 117 insertions(+), 255 deletions(-) delete mode 100644 cmake/external/robin_map.cmake delete mode 100644 paddle/fluid/framework/spin_lock.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fda5d460d54e..c31f51a3f7371 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,7 +294,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake deleted file mode 100644 index ddaf59536cb2c..0000000000000 --- a/cmake/external/robin_map.cmake +++ /dev/null @@ -1,31 +0,0 @@ -include(ExternalProject) - -set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map) -set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include) - -include_directories(${ROBIN_MAP_INCLUDE_DIR}) - -ExternalProject_Add( - extern_robin_map - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "/~https://github.com/Tessil/robin-map.git" - GIT_TAG "v0.5.0" - PREFIX ${ROBIN_MAP_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) - -if(${CMAKE_VERSION} VERSION_LESS "3.3.0") - set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c) - file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") - add_library(robin_map STATIC ${dummyfile}) -else() - add_library(robin_map INTERFACE) -endif() - -add_dependencies(robin_map extern_robin_map) - -LIST(APPEND externl_project_dependencies robin_map) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 10a637af445c4..412bc9cbe88b8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ 
b/paddle/fluid/framework/CMakeLists.txt @@ -83,7 +83,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash) +cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312..15c496130c2b6 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{1}; + size_t num_iteration_per_drop_scope_{100}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ea783c609081c..57f6fc66c57e2 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -64,24 +64,21 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); - ++drop_scope_counter_; + drop_scope_counter_ += 1; - if (!fetch_tensors.empty()) { + if (!fetch_tensors.empty() || + drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + drop_scope_counter_ = 0; // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - } - - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - drop_scope_counter_ = 0; for (auto &scope : local_scopes_) { auto &local_scope = 
*scope->Var(details::kLocalExecScopeName)->GetMutable(); scope->DeleteScope(local_scope); } } - if (eptr) { std::rethrow_exception(eptr); } else { diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8e67f8f6104cd..8670dcfed7e40 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -20,10 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/var_desc.h" -DEFINE_bool(enforce_when_check_program, true, - "Checking whether the program is correct or not. We will log " - "errors rather than throwing exceptions if this flag turned off"); - namespace paddle { namespace framework { namespace ir { @@ -48,56 +44,27 @@ void CheckProgram(const ProgramDesc &program) { break; case _INT(OpRole::kBackward): case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator %s after optimize operator." 
- << op->Type(); - } - } + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator %s after optimize operator.", + op->Type()); break; case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) != - visit.end()) { - LOG(ERROR) << "Cannot add backward|loss operator before " - << "forward|loss operator %s." << op->Type(); - } - - if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) { - LOG(ERROR) << "Cannot add forward|loss operator %s after optimize " - "operator." - << op->Type(); - } - } + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator %s.", + op->Type()); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add forward|loss operator %s after optimize operator.", + op->Type()); break; case _INT(OpRole::kOptimize): case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - if (!FLAGS_enforce_when_check_program) { - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - } else { - if (visit.find(_INT(OpRole::kBackward)) == visit.end()) { - LOG(ERROR) << "Optimize operators %s must follow backward operator." 
- << op->Type(); - } - } + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators %s must follow backward operator.", + op->Type()); break; case _INT(OpRole::kLRSched): case _INT(OpRole::kDist): diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index 75e6bef9bf33d..dbf00f3a79f7d 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -16,9 +16,7 @@ limitations under the License. */ #if !defined(_WIN32) #include -#else -#include // NOLINT -#endif // !_WIN32 +#endif // !_WIN32 #include "paddle/fluid/platform/enforce.h" @@ -31,17 +29,17 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } - inline void RDLock() { + void RDLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); } - inline void WRLock() { + void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); } - inline void UNLock() { + void UNLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); } @@ -53,44 +51,81 @@ struct RWLock { // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive // In windows, rw_lock seems like a hack. Use empty object and do nothing. 
struct RWLock { - // FIXME(minqiyang): use mutex here to do fake lock - inline void RDLock() { mutex_.lock(); } - - inline void WRLock() { mutex_.lock(); } - - inline void UNLock() { mutex_.unlock(); } - - private: - std::mutex mutex_; + void RDLock() {} + void WRLock() {} + void UNLock() {} }; #endif -class AutoWRLock { +class RWLockGuard { public: - explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } - - inline void Lock() { lock_->WRLock(); } - - inline void UnLock() { lock_->UNLock(); } - - ~AutoWRLock() { UnLock(); } - - private: - RWLock* lock_; -}; + enum Status { kUnLock, kWRLock, kRDLock }; + + RWLockGuard(RWLock* rw_lock, Status init_status) + : lock_(rw_lock), status_(Status::kUnLock) { + switch (init_status) { + case Status::kRDLock: { + RDLock(); + break; + } + case Status::kWRLock: { + WRLock(); + break; + } + case Status::kUnLock: { + break; + } + } + } -class AutoRDLock { - public: - explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + void WRLock() { + switch (status_) { + case Status::kUnLock: { + lock_->WRLock(); + status_ = Status::kWRLock; + break; + } + case Status::kWRLock: { + break; + } + case Status::kRDLock: { + PADDLE_THROW( + "Please unlock read lock first before invoking write lock."); + break; + } + } + } - inline void Lock() { lock_->RDLock(); } + void RDLock() { + switch (status_) { + case Status::kUnLock: { + lock_->RDLock(); + status_ = Status::kRDLock; + break; + } + case Status::kRDLock: { + break; + } + case Status::kWRLock: { + PADDLE_THROW( + "Please unlock write lock first before invoking read lock."); + break; + } + } + } - inline void UnLock() { lock_->UNLock(); } + void UnLock() { + if (status_ != Status::kUnLock) { + lock_->UNLock(); + status_ = Status::kUnLock; + } + } - ~AutoRDLock() { UnLock(); } + ~RWLockGuard() { UnLock(); } private: RWLock* lock_; + Status status_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 
4f79d9826099e..6fa5e99f9f3a7 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,15 +47,9 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_KIDS_READER_LOCK -#define SCOPE_KIDS_WRITER_LOCK -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK +#define SCOPE_LOCK_GUARD #else -#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); -#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); -#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); -#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); #endif namespace paddle { @@ -73,69 +67,64 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - Scope* child = new Scope(this); - { - SCOPE_KIDS_WRITER_LOCK - kids_.push_back(child); - } - return *child; + SCOPE_LOCK_GUARD + kids_.push_back(new Scope(this)); + return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD return VarInternal(name); } Variable* Scope::Var(std::string* name) { + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } - SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_VARS_READER_LOCK + SCOPE_LOCK_GUARD return FindScopeInternal(var); } void Scope::DropKids() { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD for (Scope* s 
: kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_KIDS_READER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { + SCOPE_LOCK_GUARD std::vector known_vars; - { - SCOPE_VARS_READER_LOCK - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); - } + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_KIDS_WRITER_LOCK + SCOPE_LOCK_GUARD auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -149,8 +138,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { + SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); - SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -162,12 +151,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_VARS_WRITER_LOCK + SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 77ef18414d09c..aded1f771cedb 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,19 +14,12 @@ limitations under the License. 
*/ #pragma once -extern "C" { -#include -} - -#include #include -#include +#include // NOLINT #include #include -#include #include -#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -38,14 +31,6 @@ bool IsFastEagerDeletionModeEnabled(); class Scope; -namespace inner { -struct KeyHasher { - std::size_t operator()(const std::string& key) const { - return XXH32(key.c_str(), key.size(), 1); - } -}; -} // namespace inner - /** * @brief Scope that manage all variables. * @@ -110,14 +95,7 @@ class Scope { std::string Rename(const std::string& origin_name) const; protected: - mutable std::unordered_map, - inner::KeyHasher> - vars_; - // mutable tsl::robin_map< - // std::string, std::unique_ptr, std::hash, - // std::equal_to, - // std::allocator>>, true> - // vars_; + mutable std::unordered_map> vars_; private: // Call Scope::NewScope for a sub-scope. @@ -146,8 +124,7 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable RWLock kids_lock_; - mutable RWLock vars_lock_; + mutable std::mutex mutex_; }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h deleted file mode 100644 index 11a763d655abb..0000000000000 --- a/paddle/fluid/framework/spin_lock.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#if !defined(_WIN32) -#include -#else -#include // NOLINT -#endif // !_WIN32 - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -#if !defined(_WIN32) -struct SpinLock { - SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); } - - ~SpinLock() { pthread_spin_destroy(&lock_); } - - void Lock() { - PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed"); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0, - "release spin lock failed"); - } - - private: - pthread_spinlock_t lock_; -}; -#else -// FIXME(minqiyang): use mutex here to do fake spin lock -struct SpinLock { - void Lock() { mutex_.lock(); } - - void Unlock() { mutex_.lock(); } - - private: - std::mutex mutex_; -}; -#endif - -class AutoSpinLock { - public: - explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) { - lock_->Lock(); - } - - ~SpinLockGuard() { lock_->Unlock(); } - - private: - SpinLock* lock_; -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 930d851696efd..760a065c1081d 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -180,7 +180,7 @@ class ConcatFunctor { } // Wait() must be called because `inputs_data` may be destructed before // kernel ends - /* context.Wait(); */ + context.Wait(); } }; @@ -258,7 +258,7 @@ class SplitFunctor { } // Wait() must be called because `outputs_data` may be destructed before // kernel ends - /* context.Wait(); */ + context.Wait(); } }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c108c8275669f..88a2a5276ab52 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -822,7 +822,7 @@ All parameter, weight, gradient are variables in Paddle. 
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 1. + because the temp variable's shape maybe the same between two iterations. Default 100. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 78f7a6ac085f6..e05885f5f5bfc 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None): config_file = 'nvprof_config_file' with open(config_file, 'wb') as fp: fp.writelines([six.b("%s\n" % item) for item in config]) - #Comment this for nvprof core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() From bc6640156600e88e20813a0539ff1cbc7dd9ac3a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 16:08:06 +0800 Subject: [PATCH 028/103] Polish code test=develop --- paddle/fluid/platform/enforce.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d1dd09f2064f1..78e8fbc51d389 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,19 +260,19 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ - } \ +#define __PADDLE_UNARY_COMPARE(COND, ...) 
\ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); /* NOLINT */ \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); /* NOLINT */ \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ From 41b81293ab708829459f2314c3c7ec0f14abf506 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 16:13:16 +0800 Subject: [PATCH 029/103] Polish code test=develop --- paddle/fluid/platform/enforce.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 78e8fbc51d389..5fed6b804f9ce 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -260,24 +260,24 @@ inline void throw_on_error(T e) { #define PADDLE_JUDGE -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); /* NOLINT */ \ - } \ - } while (0) +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ + } \ + } while (0) // NOLINT #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); /* NOLINT */ \ + __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (false) + } while (0) // NOLINT #else #define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); From 4af97c6946435e5129e94cf507fc30f798d09e9e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 17:07:03 +0800 Subject: [PATCH 030/103] Polish code --- paddle/fluid/platform/enforce.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 5fed6b804f9ce..eee8173ba59f2 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -266,7 +266,7 @@ inline void throw_on_error(T e) { if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ } \ - } while (0) // NOLINT + } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ @@ -277,7 +277,7 @@ inline void throw_on_error(T e) { throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ } \ - } while (0) // NOLINT + } while (0) #else #define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); From 8149a07a418029dcb87280e74a598d8c719e7789 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 17:11:42 +0800 Subject: [PATCH 031/103] Fix wait stream two times bug test=develop --- paddle/fluid/framework/details/execution_strategy.h | 2 +- .../details/scope_buffered_ssa_graph_executor.cc | 12 +++++------- .../details/scope_buffered_ssa_graph_executor.h | 8 ++++++++ paddle/fluid/pybind/pybind.cc | 2 +- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 37b07e5736312..15c496130c2b6 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{1}; + size_t 
num_iteration_per_drop_scope_{100}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 22bf0d308b2de..00b8136dc2ea0 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -66,17 +66,15 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); ++drop_scope_counter_; + bool stream_end = false; if (!fetch_tensors.empty()) { - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } + WaitComputationalStreams(); + stream_end = true; } if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { - // Wait All computational streams - for (auto p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (!stream_end) { + WaitComputationalStreams(); } for (auto &scope : local_scopes_) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 5e87e0bf50b51..0f6340213daee 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -47,6 +47,14 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList Run(const std::vector& fetch_tensors) override; + private: + inline void WaitComputationalStreams() { + // Wait All computational streams + for (auto p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + } + private: size_t drop_scope_counter_{0}; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a63c71aad2596..d590c3a3c6b2a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ 
b/paddle/fluid/pybind/pybind.cc @@ -815,7 +815,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. Default 1. + because the temp variable's shape maybe the same between two iterations. Default 100. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor From 099186cd41f8aba32ef8f70afd507ee344f3e75c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:01:59 +0800 Subject: [PATCH 032/103] Support one argument PADDLE_ENFORCE test=develop --- paddle/fluid/platform/enforce.h | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index eee8173ba59f2..ec4d0bf910127 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,21 +258,33 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define PADDLE_JUDGE - -#define __PADDLE_UNARY_COMPARE(COND, ...) \ - do { \ - auto __cond = COND; \ - if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - ::paddle::platform::throw_on_error(__cond, ##__VA_ARGS__); \ - } \ +#define PADDLE_THROW_ERROR(COND, ...) 
\ + PADDLE_THROW_I(__VA_ARGS__, \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND)) + +#define PADDLE_THROW_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __PADDLE_UNARY_COMPARE(COND, ...) \ + do { \ + auto __cond = COND; \ + if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ + PADDLE_THROW_ERROR(COND, __VA_ARGS__); \ + } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(COND, ...) \ do { \ try { \ - __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); \ + __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ } catch (...) { \ throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ __FILE__, __LINE__); \ @@ -280,7 +292,7 @@ inline void throw_on_error(T e) { } while (0) #else -#define PADDLE_ENFORCE(COND, ...) __PADDLE_UNARY_COMPARE(COND, ##__VA_ARGS__); +#define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #define PADDLE_THROW_EOF() \ From 5a5c577529bdfe60f584bd490f3dedc6aa991fa6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:03:12 +0800 Subject: [PATCH 033/103] Polish code test=develop --- paddle/fluid/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index ec4d0bf910127..efead293037a4 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -276,7 +276,7 @@ inline void throw_on_error(T e) { do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - PADDLE_THROW_ERROR(COND, __VA_ARGS__); \ + PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) From e4719eb4625e695fc1fcc786444c1a9c8d78fc57 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 21 Dec 2018 20:42:29 +0800 Subject: [PATCH 034/103] Fix bug in Windows VC 2010 test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 2 +- paddle/fluid/platform/enforce.h | 35 ++++++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 0a18882e8199c..adcf69445485b 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,7 +50,7 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, + PADDLE_ENFORCE(std::is_same::value, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index efead293037a4..dd83686b9d274 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,30 +258,30 @@ 
inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define PADDLE_THROW_ERROR(COND, ...) \ - PADDLE_THROW_I(__VA_ARGS__, \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND)) - -#define PADDLE_THROW_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; +#define __PADDLE_THROW_ERROR(COND, ...) \ + __PADDLE_THROW_ERROR_I( \ + __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ + ::paddle::platform::throw_on_error(COND)) + +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; #define __PADDLE_UNARY_COMPARE(COND, ...) \ do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ + __PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) #ifndef REPLACE_ENFORCE_GLOG -#define PADDLE_ENFORCE(COND, ...) \ +#define __PADDLE_ENFORCE_I(COND, ...) \ do { \ try { \ __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); \ @@ -292,9 +292,12 @@ inline void throw_on_error(T e) { } while (0) #else -#define PADDLE_ENFORCE(COND, ...) 
__PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); +#define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG +#define __PADDLE_ENFORCE(args) __PADDLE_ENFORCE_I args +#define PADDLE_ENFORCE(...) __PADDLE_ENFORCE((__VA_ARGS__)) + #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ From cf6188a823e2c3c55cc3e93053339d4c7d560d41 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Sun, 23 Dec 2018 14:34:56 +0800 Subject: [PATCH 035/103] add a linux timer --- paddle/fluid/platform/timer.h | 87 +++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 paddle/fluid/platform/timer.h diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h new file mode 100644 index 0000000000000..592d8c8e9da9b --- /dev/null +++ b/paddle/fluid/platform/timer.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace platform { + +// A Standard Timer implementation for debugging + +class Timer { + public: + Timer() { + reset(); + } + + inline void reset() { + _start.tv_sec = 0; + _start.tv_usec = 0; + + _count = 0; + _elapsed = 0; + _paused = true; + } + + inline void start() { + reset(); + resume(); + } + + inline void pause() { + if (_paused) { + return; + } + _elapsed += tickus(); + ++_count; + _paused = true; + } + + inline void resume() { + gettimeofday(&_start, NULL); + _paused = false; + } + + inline int count() const { + return _count; + } + + inline double elapsed_us() const { + return static_cast(_elapsed); + } + inline double elapsed_ms() const { + return _elapsed / 1000.0; + } + inline double elapsed_sec() const { + return _elapsed / 1000000.0; + } + + private: + struct timeval _start; + struct timeval _now; + + int32_t _count; + int64_t _elapsed; + bool _paused; + + inline int64_t tickus() { + gettimeofday(&_now, NULL); + return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L + + (_now.tv_usec - _start.tv_usec); + } +}; From 0cf1461ccc17672aa93acb32883c56830f0dfa29 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 12:44:11 +0800 Subject: [PATCH 036/103] Avoid comma in macro test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index adcf69445485b..c96dd63516f36 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,8 +50,8 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float 
data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); From e811e06555d0a458fb885a4956bb5128d1bc37b6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 12:48:52 +0800 Subject: [PATCH 037/103] Avoid comma in macro test=develop --- paddle/fluid/operators/lrn_mkldnn_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index c96dd63516f36..4e4f977fcc742 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -50,7 +50,7 @@ template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - bool is_float_type = std::is_same::value; + const bool is_float_type = std::is_same::value; PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); @@ -132,8 +132,8 @@ template class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(std::is_same::value, - "MKLDNN LRN must use float data."); + const bool is_float_type = std::is_same::value; + PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); PADDLE_ENFORCE( From 7d1533216dd6776ce17a857b082c25d5d5cccf49 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 13:02:13 +0800 Subject: [PATCH 038/103] Fix syntax error in unit test test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 
cb88333d15703..1fc5a00858c1c 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -69,9 +69,9 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); @@ -80,8 +80,8 @@ void TestWord2vecPrediction(const std::string& model_path) { i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], - result[i]); + PADDLE_ENFORCE_EQ(static_cast(outputs.front().data.data())[i], + result[i]); } } From b1d0a14c144c71f0f912d1e8ec0d0b4170546c12 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 13:06:11 +0800 Subject: [PATCH 039/103] Change the ut back test=develop --- paddle/fluid/inference/analysis/analyzer_tester.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 1fc5a00858c1c..f84e1ab6b827b 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -78,10 +78,10 @@ void TestWord2vecPrediction(const std::string& model_path) { // The outputs' buffers are in CPU memory. 
for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); i++) { - LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE_EQ(static_cast(outputs.front().data.data())[i], - result[i]); + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); } } From d434fcbaa6e403801fba3f775a86182326378cdd Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 24 Dec 2018 15:32:58 +0800 Subject: [PATCH 040/103] add TrainFilesWithTimer in async_executor --- paddle/fluid/framework/async_executor.cc | 9 +++- .../fluid/framework/executor_thread_worker.cc | 44 +++++++++++++++++++ .../fluid/framework/executor_thread_worker.h | 2 + 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ee3c5e01f87ee..1d9678a1ba140 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { - threads.push_back( - std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + if (debug) { + threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, + workers[thidx].get())); + } else { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } } for (auto& th : threads) { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 2eb9e564f8780..c26e6bf4790b8 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -180,6 +180,7 @@ void ExecutorThreadWorker::SetDevice() { return; #else static unsigned concurrency_cap = 
std::thread::hardware_concurrency(); + LOG(WARNING) << "concurrency capacity " << concurrency_cap; int thread_id = this->thread_id_; if (static_cast(thread_id) < concurrency_cap) { @@ -238,6 +239,49 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } +void ExecutorThreadWorker::TrainFilesWithTimer() { + platform::SetNumThreads(1); + SetDevice(); + thread_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (int i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + while ((cur_batch = thread_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (int i = 0; i < ops_.size(); ++i) { + timeline.Start(); + ops_[i]->Run(*thread_scope_, place_); + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + ++batch_cnt; + thread_scope_->DropKids(); + if (batch_cnt > 0 && batch_cnt % 1000 == 0) { + for (int i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%d][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + } + timeline.Start(); + } +} + void ExecutorThreadWorker::TrainFiles() { platform::SetNumThreads(1); diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 30b81ad88035e..524922b0322e5 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,6 +155,8 @@ class ExecutorThreadWorker { void SetDataFeed(const std::shared_ptr& datafeed); // A 
multi-thread training function virtual void TrainFiles(); + // with timer log + virtual void TrainFilesWithTimer(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB From 2dee8f6cd5c2404f4df033f1d32f78efb9413564 Mon Sep 17 00:00:00 2001 From: dongdaxiang Date: Mon, 24 Dec 2018 15:32:58 +0800 Subject: [PATCH 041/103] add TrainFilesWithTimer in async_executor --- paddle/fluid/framework/async_executor.cc | 9 ++- .../fluid/framework/executor_thread_worker.cc | 45 +++++++++++ .../fluid/framework/executor_thread_worker.h | 2 + paddle/fluid/platform/CMakeLists.txt | 2 + paddle/fluid/platform/timer.cc | 63 +++++++++++++++ paddle/fluid/platform/timer.h | 79 +++++-------------- 6 files changed, 138 insertions(+), 62 deletions(-) create mode 100644 paddle/fluid/platform/timer.cc diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index ee3c5e01f87ee..1d9678a1ba140 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // start executing ops in multiple threads for (int thidx = 0; thidx < actual_thread_num; ++thidx) { - threads.push_back( - std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + if (debug) { + threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, + workers[thidx].get())); + } else { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } } for (auto& th : threads) { diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 2eb9e564f8780..1e8f6c6182aab 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -29,6 +29,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { namespace framework { @@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() { return; #else static unsigned concurrency_cap = std::thread::hardware_concurrency(); + LOG(WARNING) << "concurrency capacity " << concurrency_cap; int thread_id = this->thread_id_; if (static_cast(thread_id) < concurrency_cap) { @@ -238,6 +240,49 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } +void ExecutorThreadWorker::TrainFilesWithTimer() { + platform::SetNumThreads(1); + SetDevice(); + thread_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + while ((cur_batch = thread_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + timeline.Start(); + ops_[i]->Run(*thread_scope_, place_); + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + ++batch_cnt; + thread_scope_->DropKids(); + if (batch_cnt > 0 && batch_cnt % 1000 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + } + timeline.Start(); + } +} + void ExecutorThreadWorker::TrainFiles() { platform::SetNumThreads(1); 
diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 30b81ad88035e..524922b0322e5 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -155,6 +155,8 @@ class ExecutorThreadWorker { void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function virtual void TrainFiles(); + // with timer log + virtual void TrainFilesWithTimer(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); #ifdef PADDLE_WITH_PSLIB diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index d1dff16ddd859..5197d5d01d8b9 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -84,6 +84,8 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +cc_library(timer SRCS timer.cc) + cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/fluid/platform/timer.cc b/paddle/fluid/platform/timer.cc new file mode 100644 index 0000000000000..75d4e5cbf90bd --- /dev/null +++ b/paddle/fluid/platform/timer.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace platform { + +void Timer::Reset() { + _start.tv_sec = 0; + _start.tv_usec = 0; + + _count = 0; + _elapsed = 0; + _paused = true; +} + +void Timer::Start() { + Reset(); + Resume(); +} + +void Timer::Pause() { + if (_paused) { + return; + } + _elapsed += Tickus(); + ++_count; + _paused = true; +} + +void Timer::Resume() { + gettimeofday(&_start, NULL); + _paused = false; +} + +int Timer::Count() { return _count; } + +double Timer::ElapsedUS() { return static_cast(_elapsed); } + +double Timer::ElapsedMS() { return _elapsed / 1000.0; } + +double Timer::ElapsedSec() { return _elapsed / 1000000.0; } + +int64_t Timer::Tickus() { + gettimeofday(&_now, NULL); + return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L + + (_now.tv_usec - _start.tv_usec); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h index 592d8c8e9da9b..35bd83a33dea5 100644 --- a/paddle/fluid/platform/timer.h +++ b/paddle/fluid/platform/timer.h @@ -14,74 +14,33 @@ limitations under the License. 
*/ #pragma once #include -#include -#include -#include +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace platform { // A Standard Timer implementation for debugging - class Timer { public: - Timer() { - reset(); - } - - inline void reset() { - _start.tv_sec = 0; - _start.tv_usec = 0; - - _count = 0; - _elapsed = 0; - _paused = true; - } - - inline void start() { - reset(); - resume(); - } - - inline void pause() { - if (_paused) { - return; - } - _elapsed += tickus(); - ++_count; - _paused = true; - } - - inline void resume() { - gettimeofday(&_start, NULL); - _paused = false; - } - - inline int count() const { - return _count; - } - - inline double elapsed_us() const { - return static_cast(_elapsed); - } - inline double elapsed_ms() const { - return _elapsed / 1000.0; - } - inline double elapsed_sec() const { - return _elapsed / 1000000.0; - } + Timer() { Reset(); } + void Reset(); + void Start(); + void Pause(); + void Resume(); + int Count(); + double ElapsedUS(); + double ElapsedMS(); + double ElapsedSec(); private: - struct timeval _start; - struct timeval _now; + struct timeval _start; + struct timeval _now; + int _count; + int _elapsed; + bool _paused; - int32_t _count; - int64_t _elapsed; - bool _paused; - - inline int64_t tickus() { - gettimeofday(&_now, NULL); - return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L + - (_now.tv_usec - _start.tv_usec); - } + int64_t Tickus(); }; + +} // namespace platform +} // namespace paddle From 68b86d666521178f1b994c6c86a5539e35f66a52 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 16:46:01 +0800 Subject: [PATCH 042/103] Change default value to align with the original react test=develop --- paddle/fluid/framework/details/execution_strategy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c2b6..37b07e5736312 100644 --- 
a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; From 45acfbd0118ffaa2661148904667235e3c9b134b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 17:56:04 +0800 Subject: [PATCH 043/103] 1. Add specific condition for one or no arg in PADDLE_ENFORCE 2. Add unit test for new enforce feature test=develop --- paddle/fluid/platform/enforce.h | 13 ++++++++----- paddle/fluid/platform/enforce_test.cc | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index dd83686b9d274..7eb4be2137e1f 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -258,7 +258,12 @@ inline void throw_on_error(T e) { #define PADDLE_THROW(...) \ throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__) -#define __PADDLE_THROW_ERROR(COND, ...) \ +#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; + +#define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ + ::paddle::platform::throw_on_error(COND, "%s", std::string(ARG)); + +#define __PADDLE_THROW_ON_ERROR(COND, ...) \ __PADDLE_THROW_ERROR_I( \ __VA_ARGS__, ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ @@ -268,15 +273,13 @@ inline void throw_on_error(T e) { ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ ::paddle::platform::throw_on_error(COND, __VA_ARGS__), \ - ::paddle::platform::throw_on_error(COND)) - -#define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) 
X_; + __THROW_ON_ERROR_ONE_ARG(COND, __VA_ARGS__)) #define __PADDLE_UNARY_COMPARE(COND, ...) \ do { \ auto __cond = COND; \ if (UNLIKELY(::paddle::platform::is_error(__cond))) { \ - __PADDLE_THROW_ERROR(__cond, __VA_ARGS__); \ + __PADDLE_THROW_ON_ERROR(__cond, __VA_ARGS__); \ } \ } while (0) diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d52182965552e..1091badae54a8 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -37,6 +37,25 @@ TEST(ENFORCE, FAILED) { HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all")); } EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok at all"); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_TRUE( + HasPrefix(StringPiece(error.what()), "Enforce is not ok at all")); + } + EXPECT_TRUE(caught_exception); + + caught_exception = false; + try { + PADDLE_ENFORCE(false); + } catch (paddle::platform::EnforceNotMet error) { + caught_exception = true; + EXPECT_NE(std::string(error.what()).find(" at "), 0); + } + EXPECT_TRUE(caught_exception); } TEST(ENFORCE, NO_ARG_OK) { From e02f67eff704648b31de86efeef4f620c3af03a1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Mon, 24 Dec 2018 10:02:08 +0000 Subject: [PATCH 044/103] rewrite unsafe_cast test=develop --- paddle/fluid/framework/ddim.cc | 4 ---- paddle/fluid/framework/ddim.h | 4 +++- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index 37544e97eb655..e7a6df57e5381 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -131,8 +131,6 @@ DDim slice_ddim(const DDim& dim, int begin, int end) { int arity(const DDim& d) { return d.size(); } -/// \cond HIDDEN - struct DDimPrinter { std::ostream& os; explicit DDimPrinter(std::ostream& os_) : os(os_) {} @@ -143,8 +141,6 @@ struct DDimPrinter { } 
}; -/// \endcond - std::ostream& operator<<(std::ostream& os, const DDim& ddim) { ddim.apply_visitor(DDimPrinter(os)); return os; diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 452072a58762d..295d09bbcac54 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -126,7 +126,9 @@ class DDim { private: template inline Dim& UnsafeCast() { - return const_cast&>(const_cast(this)->UnsafeCast()); + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); } template From 010f657b336944556d190d9054c328a7dc6e87c9 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 18:31:54 +0800 Subject: [PATCH 045/103] Polish code test=develop --- paddle/fluid/operators/detail/safe_ref.h | 2 +- paddle/fluid/platform/enforce.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index a800d5df0a7cb..8660bc219c12f 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, args...); + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7eb4be2137e1f..e9b98aee1fc45 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -298,7 +298,7 @@ inline void throw_on_error(T e) { #define __PADDLE_ENFORCE_I(COND, ...) __PADDLE_UNARY_COMPARE(COND, __VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG -#define __PADDLE_ENFORCE(args) __PADDLE_ENFORCE_I args +#define __PADDLE_ENFORCE(__args) __PADDLE_ENFORCE_I __args #define PADDLE_ENFORCE(...) 
__PADDLE_ENFORCE((__VA_ARGS__)) #define PADDLE_THROW_EOF() \ From 52b4821a6eab9fc496de2e132ef0744c1e573ca4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 24 Dec 2018 19:24:02 +0800 Subject: [PATCH 046/103] Fix Sprintf problem test=develop --- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/string/printf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index e9b98aee1fc45..0668053950788 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -261,7 +261,7 @@ inline void throw_on_error(T e) { #define __PADDLE_THROW_ERROR_I(_, _9, _8, _7, _6, _5, _4, _3, _2, X_, ...) X_; #define __THROW_ON_ERROR_ONE_ARG(COND, ARG) \ - ::paddle::platform::throw_on_error(COND, "%s", std::string(ARG)); + ::paddle::platform::throw_on_error(COND, ::paddle::string::Sprintf(ARG)); #define __PADDLE_THROW_ON_ERROR(COND, ...) \ __PADDLE_THROW_ERROR_I( \ diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index a2eec6e3c48dd..0b94b60018aac 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -87,7 +87,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { template std::string Sprintf(const Args&... 
args) { std::ostringstream oss; - Fprintf(oss, ""); + Fprintf(oss, "%s", args...); return oss.str(); } From 68d91cd59455ece3146bb857467e71a04f8bfb97 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 25 Dec 2018 02:29:25 +0000 Subject: [PATCH 047/103] add copy ctor test=develop --- paddle/fluid/framework/ddim.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 295d09bbcac54..123e227dc0423 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -60,6 +60,8 @@ class DDim { DDim() : rank_(1) { dim_[0] = 0; } + DDim(const DDim& ddim) { this->CopyFrom(ddim); } + DDim(const int* d, int n) : rank_(n) { dynamic_dim_assign(d, dim_.GetMutable(), n); } @@ -138,6 +140,12 @@ class DDim { return *reinterpret_cast*>(p); } + inline void CopyFrom(const DDim& ddim) { + rank_ = ddim.rank_; + PADDLE_VISIT_DDIM(rank_, + (void)(UnsafeCast() = ddim.UnsafeCast())); + } + friend DDim slice_ddim(const DDim& dim, int begin, int end); friend DDim stride(const DDim& ddim); friend DDim stride_numel(const DDim& ddim); From 3a2afbf02e9bcc3d0a690564b8ea811b6cb10685 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Tue, 25 Dec 2018 04:24:44 +0000 Subject: [PATCH 048/103] polish code test=develop --- paddle/fluid/framework/operator.h | 12 ------------ paddle/fluid/framework/var_type.h | 10 +++++----- .../fluid/framework/var_type_inference_test.cc | 2 +- paddle/fluid/framework/var_type_traits.h | 6 +++--- paddle/fluid/framework/var_type_traits_test.cc | 17 +++++++++++++++++ paddle/fluid/framework/variable.h | 10 ++++++---- paddle/fluid/operators/affine_grid_op.cc | 4 ++-- paddle/fluid/operators/conv_op.cc | 4 ++-- paddle/fluid/operators/grid_sampler_op.cc | 4 ++-- paddle/fluid/operators/pool_op.cc | 4 ++-- paddle/fluid/operators/softmax_op.cc | 4 ++-- paddle/fluid/operators/warpctc_op.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 13 +++++++++++++ 13 files changed, 56 insertions(+), 36 deletions(-) diff 
--git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 4492470e2ad4c..39190d07b4ccd 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -310,18 +310,6 @@ class ExecutionContext { const RuntimeContext& ctx_; }; -inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); -#ifdef PADDLE_WITH_CUDA - if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; - } -#endif - return use_cudnn; -} - template <> const Tensor* ExecutionContext::Input(const std::string& name) const; diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index f1cbaf3fdc22c..73be446f71f19 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -46,19 +46,19 @@ inline proto::VarType::Type ToVarType(int type) { template inline void VisitVarType(const framework::Variable& var, Visitor visitor) { switch (var.Type()) { - case proto::VarType_Type_LOD_TENSOR: + case proto::VarType::LOD_TENSOR: visitor(var.Get()); return; - case proto::VarType_Type_LOD_RANK_TABLE: + case proto::VarType::LOD_RANK_TABLE: visitor(var.Get()); return; - case proto::VarType_Type_LOD_TENSOR_ARRAY: + case proto::VarType::LOD_TENSOR_ARRAY: visitor(var.Get()); return; - case proto::VarType_Type_SELECTED_ROWS: + case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; - case proto::VarType_Type_READER: + case proto::VarType::READER: visitor(var.Get()); return; default: diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 7842168f60388..2a75394fca719 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -108,7 +108,7 @@ TEST(InferVarType, sum_op_without_infer_var_type) { 
op->InferVarType(prog.MutableBlock(0)); - ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, + ASSERT_EQ(proto::VarType::LOD_TENSOR, prog.MutableBlock(0)->Var("test2_out")->GetType()); } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b51b4933e6c9e..1b535219c1510 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -136,8 +136,6 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. -class Scope; - using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, @@ -171,6 +169,8 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); /** End of variable type registration */ diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 1c7d9f2abed20..00840d634d802 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -88,6 +88,23 @@ TEST(var_type_traits, check_proto_type_id) { ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + 
+ ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); } TEST(var_type_traits, test_registry) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 8aa68942ad16f..b9d07da822cf1 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -67,7 +67,6 @@ class Variable { private: struct Placeholder { - explicit Placeholder(int type) : type_(type) {} virtual ~Placeholder() = default; inline int Type() const { return type_; } @@ -75,6 +74,11 @@ class Variable { inline void* Ptr() { return ptr_; } protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + void* ptr_; int type_; }; @@ -86,9 +90,7 @@ class Variable { static_assert( IsRegisteredVarType(), "Not registered type. 
Please register T inside var_type_traits.h"); - PlaceholderImpl() : Placeholder(VarTypeTrait::kId) { - this->ptr_ = &obj_; - } + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } private: T obj_; diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 0c048738522ca..1de59a5165c83 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -74,7 +74,7 @@ class AffineGridOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -184,7 +184,7 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index c76bde99f4a0a..8e0d2824953a3 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -84,7 +84,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library = framework::LibraryType::kCUDNN; } #endif @@ -369,7 +369,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git 
a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index be53a62cc9ccf..14a2524bd8f4a 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -59,7 +59,7 @@ class GridSampleOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -155,7 +155,7 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 6781cdf9f3448..5399ae556e7f3 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -92,7 +92,7 @@ framework::OpKernelType PoolOp::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -122,7 +122,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index ad37967f0ac3c..bc889a5a042a2 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -50,7 +50,7 @@ 
class SoftmaxOp : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif @@ -157,7 +157,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::DataLayout layout_ = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index add03bad13dfd..e2ae7caae1ebe 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -51,7 +51,7 @@ class WarpCTCOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (framework::CanCUDNNBeUsed(ctx)) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 74b0942379014..61a25064d1799 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -450,6 +451,18 @@ class ScopedActivationDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); }; +inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + return use_cudnn; +} + #if CUDNN_VERSION >= 7001 class ScopedCTCLossDescriptor { public: From 8ec3d863b0eb932cf6921f1e860537baa4d1028f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Tue, 25 Dec 2018 15:50:24 +0800 Subject: [PATCH 049/103] Fix throw_on_error direct call bug test=develop --- paddle/fluid/operators/distributed/proto_encoder_helper.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h index d2b0eb6ca6de1..27ca1f4edc04f 100644 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -84,7 +84,9 @@ class ProtoEncodeHelper { ~ProtoEncodeHelper() { #define REPLACE_ENFORCE_GLOG 1 // Make sure callers didn't do operations that went over max_size promised - paddle::platform::throw_on_error(p_ <= limit_); + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_); + } #undef REPLACE_ENFORCE_GLOG } From ce3782c193947fc3241528d3ede2e5e22f4dacd9 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 25 Dec 2018 11:10:46 +0000 Subject: [PATCH 050/103] add affine_channel fuse. fix conv+elemenwise fuse bug. 
--- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/conv_affine_channel_fuse_pass.cc | 222 ++++++++++++++++++ .../ir/conv_affine_channel_fuse_pass.h | 49 ++++ .../framework/ir/graph_pattern_detector.cc | 76 ++++++ .../framework/ir/graph_pattern_detector.h | 32 +++ paddle/fluid/inference/api/analysis_config.cc | 2 +- .../fluid/inference/api/paddle_pass_builder.h | 4 +- paddle/fluid/operators/conv_fusion_op.cu.cc | 4 +- 8 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b7f7e2ee8ef59..6d795e1e2d540 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -45,6 +45,7 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 0000000000000..a7bfb8cf1ee09 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + 
ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + 
scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + + AddStatis(found_conv_ac_count); + return graph; +} + +std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, 
conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph.get(), + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_conv_ac_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 0000000000000..ad966e11e6222 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e5167c0..6ef3417901f44 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,13 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } +// only support "identity" and "relu" now. 
+/* std::unordered_set conv_act_set({"identity", "sigmoid", "relu", "relu6", "relux", "tanh", "band_pass"}); +*/ +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1236,6 +1240,78 @@ PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { return elementwise_add_out; } +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = 
pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index eaedd9d08e0fa..61a5300344971 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -734,6 +734,38 @@ struct ConvElementwiseadd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + 
PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dcefdd92f5157..8a0ddfbab4cdb 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -110,7 +110,7 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; // Append after the infer_clean pass. - pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 40ca0d287ccde..d327f2bcec3ea 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,9 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", + "conv_eltwiseadd_affine_channel_fuse_pass", "conv_bn_fuse_pass", // "conv_elementwise_add_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", // diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b999e..d63e0fa030cd7 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -161,9 +161,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - if ((activation == "identity") && - (algo != 
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && - (!residual)) { + if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. // But test in some case, the speed is slower, change to use From d4931a2abc6648bd652e0444972e41735f45dcf0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 25 Dec 2018 11:36:26 +0000 Subject: [PATCH 051/103] support more input fake data --- .../fluid/inference/tests/api/tester_helper.h | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce05f..ef7e2198c5db4 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -132,7 +132,8 @@ std::unordered_map GetFuseStatis(PaddlePredictor *predictor, void SetFakeImageInput(std::vector> *inputs, const std::string &dirname, bool is_combined = true, std::string model_filename = "model", - std::string params_filename = "params") { + std::string params_filename = "params", + const std::vector *feed_names = nullptr) { // Set fake_image_data PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); std::vector> feed_target_shapes = GetFeedTargetShapes( @@ -146,26 +147,32 @@ void SetFakeImageInput(std::vector> *inputs, os << "}\n"; } LOG(INFO) << os.str(); - - int dim1 = feed_target_shapes[0][1]; - int dim2 = feed_target_shapes[0][2]; - int dim3 = feed_target_shapes[0][3]; - - PaddleTensor input; - std::vector shape({FLAGS_batch_size, dim1, dim2, dim3}); - input.shape = shape; - input.dtype = PaddleDType::FLOAT32; - - // fill input data, for profile easily, do not use random data here. 
- size_t size = FLAGS_batch_size * dim1 * dim2 * dim3; - input.data.Resize(size * sizeof(float)); - float *input_data = static_cast(input.data.data()); - for (size_t i = 0; i < size; i++) { - *(input_data + i) = static_cast(i) / size; + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), 1, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. 
+ for (size_t j = 0; j < len; ++j) { + *(input_data + j) = static_cast(j) / len; + } } - - std::vector input_slots; - input_slots.assign({input}); (*inputs).emplace_back(input_slots); } From d46a140dd94406c669acedb78353131bfe89a115 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 25 Dec 2018 11:58:09 +0000 Subject: [PATCH 052/103] add seq pool inference test test=develop --- .../fluid/inference/tests/api/CMakeLists.txt | 4 + .../tests/api/analyzer_seq_pool1_tester.cc | 117 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 95bbc74a5961e..9aa9db031cd46 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -108,6 +108,10 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") +# seq_pool1 +inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1 +"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz") + # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 0000000000000..2ae840fd11f62 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->param_file = FLAGS_infer_model + "/params"; + cfg->prog_file = FLAGS_infer_model + "/model"; + cfg->use_gpu = false; + cfg->device = 0; + cfg->enable_ir_optim = true; + cfg->specify_input_name = true; + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + std::vector feed_names = { + "slot10000_embed", "slot10001_embed", "slot10004_embed", + "slot10005_embed", "slot10008_embed", "slot10009_embed", + "slot10012_embed", "slot10013_embed", "slot10108_embed", + "slot13324_embed", "slot13325_embed", "slot13326_embed", + "slot13327_embed", "slot13328_embed", "slot13329_embed", + "slot13330_embed", "slot13331_embed", "slot15501_embed", + "slot15502_embed", "slot15503_embed", "slot15504_embed", + "slot15505_embed", "slot15506_embed", "slot15507_embed", + "slot15508_embed", "slot15516_embed", "slot15519_embed", + "slot15523_embed", "slot15531_embed", "slot15533_embed", + "slot15548_embed", "slot15564_embed", "slot15565_embed", + "slot15566_embed", "slot15570_embed", "slot15571_embed", + "slot15572_embed", "slot15573_embed", "slot15574_embed", + "slot15575_embed", "slot15576_embed", "slot15577_embed", + "slot15579_embed", "slot15581_embed", "slot15582_embed", + "slot15583_embed", "slot15584_embed", 
"slot5016_embed", + "slot5021_embed", "slot6002_embed", "slot6003_embed", + "slot6004_embed", "slot6005_embed", "slot6006_embed", + "slot6007_embed", "slot6008_embed", "slot6009_embed", + "slot6011_embed", "slot6014_embed", "slot6015_embed", + "slot6023_embed", "slot6024_embed", "slot6025_embed", + "slot6027_embed", "slot6029_embed", "slot6031_embed", + "slot6034_embed", "slot6035_embed", "slot6036_embed", + "slot6037_embed", "slot6039_embed", "slot6048_embed", + "slot6050_embed", "slot6058_embed", "slot6059_embed", + "slot6060_embed", "slot6066_embed", "slot6067_embed", + "slot6068_embed", "slot6069_embed", "slot6070_embed", + "slot6071_embed", "slot6072_embed", "slot6073_embed", + "slot6182_embed", "slot6183_embed", "slot6184_embed", + "slot6185_embed", "slot6186_embed", "slot6188_embed", + "slot6189_embed", "slot6190_embed", "slot6201_embed", + "slot6202_embed", "slot6203_embed", "slot6247_embed", + "slot6248_embed", "slot6250_embed", "slot6251_embed", + "slot6807_embed", "slot6808_embed", "slot6809_embed", + "slot6810_embed", "slot6811_embed", "slot6812_embed", + "slot6813_embed", "slot6814_embed", "slot6815_embed", + "slot6816_embed", "slot6817_embed", "slot6818_embed", + "slot6819_embed", "slot6820_embed", "slot6822_embed", + "slot6823_embed", "slot6826_embed", "slot7002_embed", + "slot7003_embed", "slot7004_embed", "slot7005_embed", + "slot7006_embed", "slot7008_embed", "slot7009_embed", + "slot7010_embed", "slot7011_embed", "slot7013_embed", + "slot7014_embed", "slot7015_embed", "slot7016_embed", + "slot7017_embed", "slot7019_embed", "slot7100_embed", + "slot7506_embed", "slot7507_embed", "slot7514_embed", + "slot7515_embed", "slot7516_embed"}; + SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params", + &feed_names); +} + +// Easy for profiling independently. 
+void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 314); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle From 3ea2f415dcf2829d0f8af9a24793024292416a15 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 26 Dec 2018 10:06:09 +0800 Subject: [PATCH 053/103] fix ci error. test=develop --- paddle/fluid/operators/distributed_ops/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 3c0b7ff24f9cf..a8bb597cbd592 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -33,7 +33,7 @@ register_operators(EXCLUDES gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS}) if(WITH_GPU AND NOT WIN32) set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} nccl_common) - op_library(gen_nccl_id_op ${DISTRIBUTE_DEPS} nccl_common) + op_library(gen_nccl_id_op DEPS ${DISTRIBUTE_DEPS} nccl_common) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) From 179acc60b3859545bec0c77009ac3e63eb9dd4ca Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 03:20:28 +0000 Subject: [PATCH 054/103] fix conflict with develop test=develop --- paddle/fluid/framework/var_type_traits.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) 
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 1b535219c1510..cc68cf2ab8e1b 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -155,13 +155,24 @@ template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); using Type = T; - // Default id generation + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ static constexpr int kId = VarTypeRegistry::TypePos() + static_cast(proto::VarType::TUPLE) * 2; }; // Users should set some of variable type ids to be what is defined in -// framework.proto here +// framework.proto below REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); From 956cf92145842f1e7ff760434074b42479fe704b Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 26 Dec 2018 05:54:51 +0000 Subject: [PATCH 055/103] Fix conv_elementwise_add2_act pass test=develop --- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 25 +++++++++++-------- .../framework/ir/graph_pattern_detector.cc | 12 ++++----- paddle/fluid/operators/conv_fusion_op.cu.cc | 4 +-- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 23f343f631628..c6121777e8d2c 100644 --- 
a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -40,18 +40,20 @@ framework::proto::OpDesc PrepareOpDesc( const std::string& output) { auto proto = base_desc; framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); desc.SetInput("Bias", {bias}); desc.SetInput("ResidualData", {bias1}); desc.SetAttr("activation", activation); desc.SetOutput("Output", {output}); desc.SetAttr("is_test", true); - + desc.SetAttr("use_cudnn", false); + desc.Flush(); return *desc.Proto(); } std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( std::unique_ptr graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; FusePassBase::Init(pattern_name, graph.get()); GraphPatternDetector gpd; @@ -76,22 +78,23 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( framework::OpDesc new_op_desc(new_op_proto, nullptr); // Create a new node for the fused op. - graph->CreateOpNode(&new_op_desc); + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); // Link inputs and outputs. PADDLE_ENFORCE(subgraph.count(x)); auto* conv_in_node = subgraph.at(x); - IR_NODE_LINK_TO(conv_in_node, conv_op); // Input - IR_NODE_LINK_TO(conv_filter, conv_op); // Filter - IR_NODE_LINK_TO(conv_op, conv_out); // Output - IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); + GraphSafeRemoveNodes( + graph.get(), + {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; gpd(graph.get(), handler); return graph; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 13d752e5167c0..73d1a3da8fc92 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,9 +1101,7 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); +std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); @@ -1169,13 +1167,13 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { ->AsInput(); auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) ->assert_is_op_output("elementwise_add") - ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_input("elementwise_add", "Y") ->AsIntermediate(); auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) ->assert_is_op("elementwise_add"); auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) - ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_op_input("elementwise_add", "X") ->AsInput(); auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) ->assert_is_op_output("elementwise_add") @@ -1203,8 +1201,8 @@ PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) .LinksTo({elementwise_add_out}); - 
elementwise_add_op_1->LinksFrom( - {elementwise_add_out, elementwise_add_in_y_1}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); return act_out; } diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 3235ad52b999e..acceadab16493 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -22,7 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -204,7 +204,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); From a6aa8ea7719f6664e5218bb13d3d1db691e4225f Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 26 Dec 2018 05:58:23 +0000 Subject: [PATCH 056/103] faster rcnn input is presistable. (fix it in paddle-trt) test=develop --- .../framework/ir/graph_pattern_detector.cc | 6 ----- .../ir_passes/tensorrt_subgraph_pass.cc | 22 +++++++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6ef3417901f44..a826dfb275ca3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1101,12 +1101,6 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -// only support "identity" and "relu" now. 
-/* -std::unordered_set conv_act_set({"identity", "sigmoid", "relu", - "relu6", "relux", "tanh", - "band_pass"}); -*/ std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 9c42b83e7add3..5886868be0a90 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include #include #include + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" namespace paddle { namespace inference { @@ -197,10 +199,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, std::vector ExtractParameters( const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. 
+ std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + std::vector parameters; for (const auto &node : nodes) { if (!node->IsVar()) continue; - if (node->Var()->Persistable()) { + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { parameters.push_back(node->Name()); } } From 1e7f83e60a952a888ef2365e1a1a24384476e223 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 16:00:20 +0800 Subject: [PATCH 057/103] add cuda dso support for windows test=develop --- cmake/cuda.cmake | 3 +++ cmake/cudnn.cmake | 1 + cmake/external/cub.cmake | 2 +- cmake/external/dlpack.cmake | 2 +- .../fluid/framework/details/all_reduce_op_handle.cc | 2 +- paddle/fluid/platform/dynload/cudnn.cc | 4 ++++ paddle/fluid/platform/dynload/dynamic_loader.cc | 12 ++++++++++++ 7 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 414e92eb27f56..5be7be64137be 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -139,10 +139,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -150,6 +152,7 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # 
CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") endif() include_directories(${CUDA_INCLUDE_DIRS}) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index fb899e3d7cd42..fff1980637d02 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -89,6 +89,7 @@ if(CUDNN_FOUND) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c94849cf4b967..f06728de91e45 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -32,4 +32,4 @@ endif() add_dependencies(cub extern_cub) -LIST(APPEND externl_project_dependencies cub) +LIST(APPEND external_project_dependencies cub) diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 94d8fcc668556..4587475d7902a 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -28,4 +28,4 @@ endif() add_dependencies(dlpack extern_dlpack) -LIST(APPEND externl_project_dependencies dlpack) +LIST(APPEND external_project_dependencies dlpack) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 9eaff1f560147..de7c845884d49 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -50,7 +50,7 @@ void AllReduceOpHandle::RunImpl() { // FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, // this is a distributed or inter-process call, find a better way. 
-#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (NoDummyInputSize() == 1 && local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { #else diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index f3cd3b2bbedef..91d9a1ef01344 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -38,6 +38,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R6 +CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_R7 CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 990e44cd211c0..15d516836652e 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -53,6 +53,12 @@ namespace platform { namespace dynload { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +#endif + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -165,6 +171,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -173,6 +181,8 @@ void* GetCublasDsoHandle() { void* GetCUDNNDsoHandle() { #if 
defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false); #endif @@ -193,6 +203,8 @@ void* GetCUPTIDsoHandle() { void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif From 01c00b07dd5739d6bc9f3a33eebe27d2d32e6d24 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 16:05:19 +0800 Subject: [PATCH 058/103] fix test issues on windows test=develop --- cmake/simd.cmake | 73 ++++++++++++------------- paddle/fluid/framework/CMakeLists.txt | 32 ++++------- paddle/fluid/framework/mixed_vector.h | 10 ++-- paddle/fluid/framework/op_registry.h | 3 +- paddle/fluid/inference/tests/test.cmake | 8 ++- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/cum_op.h | 2 + paddle/fluid/operators/huber_loss_op.h | 8 ++- paddle/fluid/platform/float16_test.cc | 1 + paddle/fluid/platform/float16_test.cu | 1 + 10 files changed, 69 insertions(+), 71 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 86096d4feaace..566dc75fda019 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,46 +57,43 @@ int main() return 0; }" SSE3_FOUND) -# disable AVX by default on windows -if(NOT WIN32) - # Check AVX - set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) - set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 
4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; - }" AVX_FOUND) +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) - # Check AVX 2 - set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) - set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; - }" AVX2_FOUND) +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) - # Check AVX512F - set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) - set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) - CHECK_CXX_SOURCE_RUNS(" - #include - int main() - { - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; - }" AVX512F_FOUND) -endif(NOT WIN32) +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/framework/CMakeLists.txt 
b/paddle/fluid/framework/CMakeLists.txt index 867970717b386..d7fbc4466f81e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -7,27 +7,17 @@ function(windows_symbolic TARGET) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) - get_filename_component(src ${src} NAME_WE) - if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) - message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") - endif() - -#only copy the xx.cu to.xx.cu when the content are modified - set(copy_flag 1) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR) - if (SOURCE_STR STREQUAL TARGET_STR) - set(copy_flag 0) - endif() - endif() - if (copy_flag) - add_custom_command(OUTPUT .${src}.cu - COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu - COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu" - COMMENT "create hidden file of ${src}.cu") - endif(copy_flag) - add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) endforeach() endfunction() diff --git 
a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6940250c3f966..c3a044d22cf04 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -215,8 +215,8 @@ class Vector { auto stream = dev_ctx->stream(); void *src = gpu_->ptr(); void *dst = cpu_.data(); - memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, - gpu_->size(), stream); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_->size(), stream); dev_ctx->Wait(); } @@ -261,8 +261,8 @@ class Vector { auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); - memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, - gpu_->size(), stream); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_->size(), stream); } void ImmutableCPU() const { @@ -284,7 +284,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } mutable std::vector cpu_; - mutable memory::AllocationPtr gpu_; + mutable paddle::memory::AllocationPtr gpu_; mutable int flag_; mutable std::mutex mtx_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 6d39bb3c524b4..2c1648c81fc99 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,7 +23,8 @@ limitations under the License. 
*/ #include #include -#include "glog/logging.h" // For VLOG() +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index ab3a30ce6bba1..29f0f034a2aab 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -3,14 +3,16 @@ set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") function (inference_download install_dir url filename) message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "finish downloading ${filename}") endfunction() function (inference_download_and_uncompress install_dir url filename) inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} + WORKING_DIRECTORY ${install_dir} + ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 4a14eb941cd98..ee154207754f2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -46,7 +46,7 @@ endif() register_operators(EXCLUDES py_func_op warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) # warpctc_op needs cudnn 7 above -if (WITH_GPU AND NOT WIN32) +if (WITH_GPU) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 
7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h index 999fdcff90784..7c0fda4169b5e 100644 --- a/paddle/fluid/operators/cum_op.h +++ b/paddle/fluid/operators/cum_op.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9871..666500ef26ea2 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -104,15 +104,19 @@ class HuberLossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); + // MSVC not treat it well when partial template arguments were specified x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + out_grad * + residual.unaryExpr(HuberLossBackward(delta, static_cast(-1.0))); } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); + // MSVC not treat it well when partial template arguments were specified y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + out_grad * + residual.unaryExpr(HuberLossBackward(delta, static_cast(1.0))); } } }; diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index 27e930e6e0a76..3a937dfaec3ac 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index e2b7ca9b03809..b1b51d804e02f 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include From 71636e677d456b4e9f63b6890d094bb1449cd552 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 26 Dec 2018 08:31:51 +0000 Subject: [PATCH 059/103] add min_subgraph_size attr to tensorrt config test=develop --- paddle/fluid/inference/analysis/argument.h | 1 + paddle/fluid/inference/analysis/ir_pass_manager.cc | 2 ++ .../analysis/ir_passes/tensorrt_subgraph_pass.cc | 6 ++++-- paddle/fluid/inference/api/analysis_config.cc | 8 ++++++-- paddle/fluid/inference/api/analysis_predictor.cc | 1 + paddle/fluid/inference/api/paddle_analysis_config.h | 13 ++++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 83d411eecf6d7..2db5705d0944b 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -127,6 +127,7 @@ struct Argument { std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); // The program transformed by IR analysis phase. 
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 51bca8039d453..b8c9426ed3b62 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -75,6 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); } // graph_ = pass->Apply(std::move(graph_)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 5886868be0a90..ad10010e42be9 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -38,7 +38,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( auto teller = Get("tensorrt_node_teller"); - SubGraphFuser fuser(graph.get(), teller, 2 /*min subgraph size*/); + SubGraphFuser fuser(graph.get(), teller, + Get("min_subgraph_size") /*min subgraph size*/); fuser(); for (auto *node : graph->Nodes()) { @@ -233,4 +234,5 @@ REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") - .RequirePassAttr("workspace_size"); + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 8a0ddfbab4cdb..6d6e799fdec9c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -57,6 +57,7 @@ 
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; if (use_gpu) { @@ -89,6 +90,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; model_from_memory_ = other.model_from_memory_; pass_builder_ = std::move(other.pass_builder_); @@ -105,11 +107,13 @@ void contrib::AnalysisConfig::EnableMKLDNN() { } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, - int max_batch_size) { + int max_batch_size, + int min_subgraph_size) { use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - // Append after the infer_clean pass. + tensorrt_min_subgraph_size_ = min_subgraph_size; + // Append after the conv+affine_channel fuse pass. 
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3937884ce4a5a..3f8feaaa1e9f9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -328,6 +328,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); } if (config_.use_mkldnn_) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index f05b9832da55f..e7ccea6587a25 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -49,7 +49,7 @@ struct AnalysisConfig : public NativeConfig { bool use_feed_fetch_ops{true}; void EnableTensorRtEngine(int workspace_size = 1 << 20, - int max_batch_size = 1); + int max_batch_size = 1, int min_subgraph_size = 3); bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); @@ -69,8 +69,19 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. 
int tensorrt_max_batchsize_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; std::unique_ptr pass_builder_; bool model_from_memory_{false}; }; From 2388d0e7d6277bfbb41a6f17324bb3a0e5df1c9c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 16:45:57 +0800 Subject: [PATCH 060/103] Revert "cherry-pick the #12759" test=develop This reverts commit 7f6d8acecb0c1d61dad645c581cd8cef9d554841. --- paddle/fluid/framework/op_proto_maker.cc | 4 -- paddle/fluid/framework/op_proto_maker.h | 1 - paddle/fluid/framework/operator.cc | 71 +++++-------------- paddle/fluid/pybind/const_value.cc | 3 - python/paddle/fluid/framework.py | 5 -- .../tests/unittests/test_operator_desc.py | 2 +- 6 files changed, 18 insertions(+), 68 deletions(-) diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 2311614c335a5..ca31303f77c4a 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -82,10 +82,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - AddAttr>(OpCreationCallstackAttrName(), - "Callstack for Op Creatation.") - .SetDefault({}); - Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 0a0f8f4655bc3..4c59c73d8779e 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -47,7 +47,6 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } - static const char 
*OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac2828136b451..f48e403cef810 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,15 +16,10 @@ limitations under the License. */ #include #include -#include -#include -#include -#include "gflags/gflags.h" -#include "glog/logging.h" + #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" @@ -162,59 +157,27 @@ RuntimeContext::RuntimeContext(const VariableNameMap& innames, } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - try { - if (VLOG_IS_ON(4)) { - VLOG(4) << place << " " << DebugStringEx(&scope); - } - if (platform::is_gpu_place(place)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); #endif - } - - // The profile has a process-wide mutex, results in serious performance - // issue - // in concurrency scenerio. Here use an `if` to fix this issue. - // Please not remove the `if`, ask @Superjomn if there are any concern. 
- if (platform::IsProfileEnabled()) { - platform::DeviceContextPool& pool = - platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - } else { - RunImpl(scope, place); - } - - if (VLOG_IS_ON(3)) { - VLOG(3) << place << " " << DebugStringEx(&scope); - } - } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw exception; - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + } - if (callstack.empty()) { - throw exception; - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); - throw exception; - } catch (...) { - std::rethrow_exception(std::current_exception()); + // The profile has a process-wide mutex, results in serious performance issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. 
+ if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + } else { + RunImpl(scope, place); } + VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index f8ded9f94ecaf..06d8b65fb1480 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -49,9 +49,6 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); - op_proto_and_checker_maker.def( - "kOpCreationCallstackAttrName", - framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4ae4d..de30ed2fc5858 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -20,7 +20,6 @@ import re import six import sys -import traceback import numpy as np @@ -605,10 +604,6 @@ def __init__(self, if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] - callstack_var_name = op_maker.kOpCreationCallstackAttrName() - op_attrs[callstack_var_name] = list( - reversed(traceback.format_stack()))[1:] - if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 37b9a9188ab44..4153394c1da77 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ def test_op_desc_creation(self): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope", 
"op_callstack" + "op_namescope" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) From e49276e731716a1f9f796d102f82ebf58effb22b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 26 Dec 2018 17:53:08 +0800 Subject: [PATCH 061/103] restore the huber_loss_op test=develop --- paddle/fluid/operators/huber_loss_op.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 666500ef26ea2..9efda3dfc9871 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -104,19 +104,15 @@ class HuberLossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - // MSVC not treat it well when partial template arguments were specified x_grad.device(place) = - out_grad * - residual.unaryExpr(HuberLossBackward(delta, static_cast(-1.0))); + out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); - // MSVC not treat it well when partial template arguments were specified y_grad.device(place) = - out_grad * - residual.unaryExpr(HuberLossBackward(delta, static_cast(1.0))); + out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); } } }; From 02e17396c24f0deb11826e37a579a69dc41ca382 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Wed, 26 Dec 2018 11:33:35 +0000 Subject: [PATCH 062/103] fix comments test=develop --- paddle/fluid/inference/api/paddle_pass_builder.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index d327f2bcec3ea..1062ac5f58b90 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ 
b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,13 +118,13 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", - "conv_bn_fuse_pass", // - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } From 3e917a934af212ab3ff3b2704666fb283cb3ed11 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 08:03:13 +0000 Subject: [PATCH 063/103] add scope_pool add module cleanup test=develop --- paddle/contrib/float16/float16_transpiler.py | 2 +- paddle/fluid/framework/CMakeLists.txt | 1 + paddle/fluid/framework/scope_pool.cc | 54 +++++++++++++++++++ paddle/fluid/framework/scope_pool.h | 46 ++++++++++++++++ paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 17 +++++- python/paddle/fluid/__init__.py | 2 +- python/paddle/fluid/executor.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 6 +-- .../fluid/transpiler/inference_transpiler.py | 2 +- 10 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/scope_pool.cc create mode 100644 paddle/fluid/framework/scope_pool.h diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 8d95dc0591e1d..500f64bed9898 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -60,7 +60,7 @@ def transpile(self, program, place, scope=None): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not 
isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") self.scope = scope diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe88b8..514eeb53476f3 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -84,6 +84,7 @@ cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(scope SRCS scope.cc DEPS glog threadpool) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 0000000000000..5cb241a7a341d --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h new file mode 100644 index 0000000000000..a8b468699abe1 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); + + std::unordered_set scopes_; + std::mutex mtx_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb8bcb190bda5..72b0f216d3aaf 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 88a2a5276ab52..81d63aace04a4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" @@ -117,6 +118,9 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + m.add_object("_cleanup", + py::capsule([]() { ScopePool::Instance().Clear(); })); + py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", @@ -454,7 +458,7 @@ All parameter, weight, gradient are variables in Paddle. }, py::return_value_policy::copy); - py::class_(m, "Scope", R"DOC( + py::class_(m, "_Scope", R"DOC( Scope is an association of a name to Variable. All variables belong to Scope. Variables in a parent scope can be retrieved from local scope. @@ -474,17 +478,26 @@ All parameter, weight, gradient are variables in Paddle. param.set(param_array, place) )DOC") + .def("_remove_from_pool", + [](Scope &self) { ScopePool::Instance().Remove(&self); }) .def("var", [](Scope &self, const std::string &name) -> Variable * { return self.Var(name); }, py::return_value_policy::reference) .def("find_var", &Scope::FindVar, py::return_value_policy::reference) - .def(py::init<>()) .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); }, py::return_value_policy::reference) .def("drop_kids", &Scope::DropKids); + m.def("Scope", + []() -> Scope * { + auto *s = new Scope(); + ScopePool::Instance().Insert(std::unique_ptr(s)); + return s; + }, + py::return_value_policy::reference); + //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. 
m.def("get_all_op_protos", []() -> std::vector { diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8f3660ca387ba..e0078e53141ac 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -46,7 +46,7 @@ from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope +from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index f2886090d75f8..5a9e908b61eee 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -191,7 +191,7 @@ def _fetch_var(name, scope=None, return_numpy=True): assert isinstance(name, str) if scope is None: scope = global_scope() - assert isinstance(scope, core.Scope) + assert isinstance(scope, core._Scope) var = scope.find_var(name) assert var is not None, ( diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 943ad3ed22480..655378f7f8c18 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -26,7 +26,7 @@ def dummy_func_with_no_input(): - return float(1.0) + return np.array([0], dtype='float32') def dummy_func_with_no_output(x): @@ -105,7 +105,7 @@ def simple_fc_net(img, label, use_py_func_op): name='test_tmp_var', dtype='float32', shape=[1]) fluid.layers.py_func( func=dummy_func_with_no_input, x=None, out=dummy_var) - + loss += dummy_var fluid.layers.py_func(func=dummy_func_with_no_output, x=loss, out=None) loss = fluid.layers.mean(loss) @@ -174,7 +174,7 
@@ def test_loss_diff(self): self.assertAlmostEqual(max_diff, 0, delta=1e-3) -class TestPyFuncOpUseParallelExecutor(unittest.TestCase): +class TestPyFuncOpUseParallelExecutor(TestPyFuncOpUseExecutor): def setUp(self): self.use_parallel_executor = True diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index ccf7af334d091..cc7f5ec90c26c 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -57,7 +57,7 @@ def transpile(self, program, place, scope=None): raise TypeError("place should be as CPUPlace/CUDAPlace type") if scope is None: scope = global_scope() - if not isinstance(scope, core.Scope): + if not isinstance(scope, core._Scope): raise TypeError("scope should be as Scope type or None") use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False)) From 10a6bc9675848c6ab0a30b7dc47f9d5c8788b0d1 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 11:53:29 +0000 Subject: [PATCH 064/103] modify API.spec test=develop --- paddle/fluid/API.spec | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e3b44499258d2..3970d9a731750 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -447,11 +447,7 @@ paddle.fluid.unique_name.switch ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.unique_name.guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.recordio_writer.convert_reader_to_recordio_file ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)) paddle.fluid.recordio_writer.convert_reader_to_recordio_files ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 
1000, None)) -paddle.fluid.Scope.__init__ __init__(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None -paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable -paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope -paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) From ee83ce75bf46b9c3da8c3f9689d1f3811aafe577 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 26 Dec 2018 07:29:40 +0000 Subject: [PATCH 065/103] try to fix py35 compile error test=develop --- paddle/fluid/framework/ddim.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 123e227dc0423..f0a42f0f3633b 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -60,7 +60,9 @@ class DDim { DDim() : rank_(1) { dim_[0] = 0; } - DDim(const DDim& ddim) { this->CopyFrom(ddim); } + DDim(const DDim& ddim) : rank_(ddim.rank_) { + dynamic_dim_assign(ddim.Get(), dim_.GetMutable(), rank_); + } DDim(const int* d, int n) : rank_(n) { dynamic_dim_assign(d, dim_.GetMutable(), n); @@ -140,13 +142,10 @@ class DDim { return *reinterpret_cast*>(p); } - inline void CopyFrom(const DDim& ddim) { - rank_ = ddim.rank_; - PADDLE_VISIT_DDIM(rank_, - (void)(UnsafeCast() = ddim.UnsafeCast())); + inline DDim& CopyFrom(const DDim& ddim) { + PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); } - friend DDim slice_ddim(const DDim& dim, int begin, int end); 
friend DDim stride(const DDim& ddim); friend DDim stride_numel(const DDim& ddim); From 05f1b65da34a9daa3b8edc218505fa7b74ca3069 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 26 Dec 2018 18:53:28 +0800 Subject: [PATCH 066/103] simplify prepere_input in analyzer_test test=develop --- paddle/fluid/inference/api/helper.h | 10 ++++++++ .../tests/api/analyzer_lac_tester.cc | 4 +--- .../tests/api/analyzer_mm_dnn_tester.cc | 12 ++++------ .../tests/api/analyzer_ner_tester.cc | 11 ++++----- .../tests/api/analyzer_seq_conv1_tester.cc | 24 ++++++------------- 5 files changed, 26 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 9a393a61c4b45..7830e85956774 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -113,6 +113,16 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + template static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, const std::vector> &data) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142801382b4fd..2213971c1764b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,10 +98,8 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); PaddleTensor input_tensor; input_tensor.name = "word"; - input_tensor.shape.assign({static_cast(one_batch.data.size()), 1}); - input_tensor.lod.assign({one_batch.lod}); input_tensor.dtype = PaddleDType::INT64; - TensorAssignData(&input_tensor, {one_batch.data}); + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); 
PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); input_slots->assign({input_tensor}); } diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 8aaab6d6649e1..98335fe4f885c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -80,15 +80,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_query_tensor.name = "left"; lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); - int size1 = one_batch.lod1[one_batch.lod1.size() - 1]; // token batch size - int size2 = one_batch.lod2[one_batch.lod2.size() - 1]; // token batch size - lod_query_tensor.shape.assign({size1, 1}); - lod_query_tensor.lod.assign({one_batch.lod1}); - lod_title_tensor.shape.assign({size2, 1}); - lod_title_tensor.lod.assign({one_batch.lod2}); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all); + TensorAssignData(&lod_query_tensor, one_batch.query_data_all, + one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title_data_all, + one_batch.lod2); // Set inputs. 
input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f19a2ed59ef2f..54298fdab28d5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -78,14 +78,11 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); - int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size - lod_word_tensor.shape.assign({size, 1}); - lod_word_tensor.lod.assign({one_batch.lod}); - lod_mention_tensor.shape.assign({size, 1}); - lod_mention_tensor.lod.assign({one_batch.lod}); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all); + TensorAssignData(&lod_word_tensor, one_batch.word_data_all, + one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all, + one_batch.lod); // Set inputs. 
input_slots->assign({lod_word_tensor, lod_mention_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index f5082cd60f1ae..49f6059715d7b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -109,24 +109,14 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, title3_tensor.name = "title3"; l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); - int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; - title1_tensor.shape.assign({title1_size, 1}); - title1_tensor.lod.assign({one_batch.title1_lod}); - int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; - title2_tensor.shape.assign({title2_size, 1}); - title2_tensor.lod.assign({one_batch.title2_lod}); - int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; - title3_tensor.shape.assign({title3_size, 1}); - title3_tensor.lod.assign({one_batch.title3_lod}); - int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; - l1_tensor.shape.assign({l1_size, 1}); - l1_tensor.lod.assign({one_batch.l1_lod}); - // assign data - TensorAssignData(&title1_tensor, one_batch.title1); - TensorAssignData(&title2_tensor, one_batch.title2); - TensorAssignData(&title3_tensor, one_batch.title3); - TensorAssignData(&l1_tensor, one_batch.l1); + TensorAssignData(&title1_tensor, one_batch.title1, + one_batch.title1_lod); + TensorAssignData(&title2_tensor, one_batch.title2, + one_batch.title2_lod); + TensorAssignData(&title3_tensor, one_batch.title3, + one_batch.title3_lod); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. 
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); for (auto &tensor : *input_slots) { From ecae157edf352ad73c8e60a90ced540fe0e48ff3 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 26 Dec 2018 21:31:45 +0800 Subject: [PATCH 067/103] simplify some data record in analyzer_tester test=develop --- .../tests/api/analyzer_mm_dnn_tester.cc | 35 +++------- .../tests/api/analyzer_ner_tester.cc | 33 +++------- .../tests/api/analyzer_seq_conv1_tester.cc | 64 ++++--------------- .../fluid/inference/tests/api/tester_helper.h | 12 ++++ 4 files changed, 45 insertions(+), 99 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 98335fe4f885c..9d3c751943052 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> query_data_all, title_data_all; + std::vector> query, title; std::vector lod1, lod2; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,22 +31,9 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= query_data_all.size()) { - data.query_data_all.assign(query_data_all.begin() + batch_iter, - query_data_all.begin() + batch_end); - data.title_data_all.assign(title_data_all.begin() + batch_iter, - title_data_all.begin() + batch_end); - // Prepare LoDs - data.lod1.push_back(0); - data.lod2.push_back(0); - CHECK(!data.query_data_all.empty()); - CHECK(!data.title_data_all.empty()); - CHECK_EQ(data.query_data_all.size(), data.title_data_all.size()); - for (size_t j = 0; j < data.query_data_all.size(); j++) { - // calculate lod - data.lod1.push_back(data.lod1.back() + data.query_data_all[j].size()); - data.lod2.push_back(data.lod2.back() + data.title_data_all[j].size()); - } + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -67,8 +52,8 @@ struct DataRecord { // load title data std::vector title_data; split_to_int64(data[1], ' ', &title_data); - query_data_all.push_back(std::move(query_data)); - title_data_all.push_back(std::move(title_data)); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); } num_samples = num_lines; } @@ -81,10 +66,8 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_title_tensor.name = "right"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&lod_query_tensor, one_batch.query_data_all, - one_batch.lod1); - TensorAssignData(&lod_title_tensor, one_batch.title_data_all, - one_batch.lod2); + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); // Set inputs. 
input_slots->assign({lod_query_tensor, lod_title_tensor}); for (auto &tensor : *input_slots) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 54298fdab28d5..f8635968cebc4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -19,11 +19,9 @@ namespace inference { using contrib::AnalysisConfig; struct DataRecord { - std::vector> word_data_all, mention_data_all; + std::vector> word, mention; std::vector lod; // two inputs have the same lod info. - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,20 +31,10 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= word_data_all.size()) { - data.word_data_all.assign(word_data_all.begin() + batch_iter, - word_data_all.begin() + batch_end); - data.mention_data_all.assign(mention_data_all.begin() + batch_iter, - mention_data_all.begin() + batch_end); - // Prepare LoDs - data.lod.push_back(0); - CHECK(!data.word_data_all.empty()); - CHECK(!data.mention_data_all.empty()); - CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size()); - for (size_t j = 0; j < data.word_data_all.size(); j++) { - // calculate lod - data.lod.push_back(data.lod.back() + data.word_data_all[j].size()); - } + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); } batch_iter += batch_size; return data; @@ -65,8 +53,8 @@ struct DataRecord { // load mention data std::vector mention_data; split_to_int64(data[3], ' ', &mention_data); - word_data_all.push_back(std::move(word_data)); - mention_data_all.push_back(std::move(mention_data)); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); } num_samples = num_lines; } @@ -79,9 +67,8 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, lod_mention_tensor.name = "mention"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&lod_word_tensor, one_batch.word_data_all, - one_batch.lod); - TensorAssignData(&lod_mention_tensor, one_batch.mention_data_all, + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, one_batch.lod); // Set inputs. 
input_slots->assign({lod_word_tensor, lod_mention_tensor}); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 49f6059715d7b..e6d6cd2960b39 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -18,12 +18,9 @@ namespace paddle { namespace inference { struct DataRecord { - std::vector> title1_all, title2_all, title3_all, l1_all; std::vector> title1, title2, title3, l1; - std::vector title1_lod, title2_lod, title3_lod, l1_lod; - size_t batch_iter{0}; - size_t batch_size{1}; - size_t num_samples; // total number of samples + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples DataRecord() = default; explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { @@ -33,41 +30,11 @@ struct DataRecord { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. 
- if (batch_end <= title1_all.size()) { - data.title1_all.assign(title1_all.begin() + batch_iter, - title1_all.begin() + batch_end); - data.title2_all.assign(title2_all.begin() + batch_iter, - title2_all.begin() + batch_end); - data.title3_all.assign(title3_all.begin() + batch_iter, - title3_all.begin() + batch_end); - data.l1_all.assign(l1_all.begin() + batch_iter, - l1_all.begin() + batch_end); - // Prepare LoDs - data.title1_lod.push_back(0); - data.title2_lod.push_back(0); - data.title3_lod.push_back(0); - data.l1_lod.push_back(0); - CHECK(!data.title1_all.empty()); - CHECK(!data.title2_all.empty()); - CHECK(!data.title3_all.empty()); - CHECK(!data.l1_all.empty()); - CHECK_EQ(data.title1_all.size(), data.title2_all.size()); - CHECK_EQ(data.title1_all.size(), data.title3_all.size()); - CHECK_EQ(data.title1_all.size(), data.l1_all.size()); - for (size_t j = 0; j < data.title1_all.size(); j++) { - data.title1.push_back(data.title1_all[j]); - data.title2.push_back(data.title2_all[j]); - data.title3.push_back(data.title3_all[j]); - data.l1.push_back(data.l1_all[j]); - // calculate lod - data.title1_lod.push_back(data.title1_lod.back() + - data.title1_all[j].size()); - data.title2_lod.push_back(data.title2_lod.back() + - data.title2_all[j].size()); - data.title3_lod.push_back(data.title3_lod.back() + - data.title3_all[j].size()); - data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size()); - } + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); } batch_iter += batch_size; return data; @@ -92,10 +59,10 @@ struct DataRecord { // load l1 data std::vector l1_data; split_to_int64(data[3], ' ', &l1_data); - title1_all.push_back(std::move(title1_data)); - 
title2_all.push_back(std::move(title2_data)); - title3_all.push_back(std::move(title3_data)); - l1_all.push_back(std::move(l1_data)); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); } num_samples = num_lines; } @@ -110,12 +77,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, l1_tensor.name = "l1"; auto one_batch = data->NextBatch(); // assign data - TensorAssignData(&title1_tensor, one_batch.title1, - one_batch.title1_lod); - TensorAssignData(&title2_tensor, one_batch.title2, - one_batch.title2_lod); - TensorAssignData(&title3_tensor, one_batch.title3, - one_batch.title3_lod); + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); // Set inputs. 
input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index b0c8f395ce05f..144027589cc8e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -169,6 +169,18 @@ void SetFakeImageInput(std::vector> *inputs, (*inputs).emplace_back(input_slots); } +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, From ce7e503cbe10dee0f3cad2145bec4559ab89f00f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Tue, 25 Dec 2018 14:40:55 +0800 Subject: [PATCH 068/103] refactor to avoid scope. 
test=develop --- paddle/fluid/framework/operator.cc | 60 +++++- paddle/fluid/framework/operator.h | 10 + paddle/fluid/imperative/layer.cc | 188 ++++++++---------- paddle/fluid/imperative/layer.h | 45 +++-- paddle/fluid/imperative/tracer.h | 120 ++++++++--- paddle/fluid/operators/fill_constant_op.cc | 35 ++++ paddle/fluid/pybind/pybind.cc | 12 +- python/paddle/fluid/framework.py | 37 ++-- python/paddle/fluid/imperative/base.py | 3 +- python/paddle/fluid/layer_helper.py | 21 +- python/paddle/fluid/layers/nn.py | 2 + .../fluid/tests/unittests/test_imperative.py | 13 +- 12 files changed, 347 insertions(+), 199 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2e7006ed95389..38675d2cac0bd 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -180,6 +180,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } +void OperatorBase::Run(const RuntimeContext& ctx, + const platform::Place& place) { + RunImpl(ctx, place); +} + bool OperatorBase::HasInputs(const std::string& name) const { return inputs_.find(name) != inputs_.end(); } @@ -954,6 +959,51 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } +void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const { + Scope scope; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. 
+ auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(expected_kernel_key.place_); + } + + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); + this->InferShape(&infer_shape_ctx); + kernel_iter->second(ExecutionContext(*this, scope, *dev_ctx, ctx)); +} + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -1041,12 +1091,9 @@ Scope* OperatorWithKernel::PrepareData( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - auto& scope = ctx.scope(); int data_type = -1; - std::string last_input_name; for (auto& input : this->inputs_) { - for (auto& ipt_name : input.second) { - auto* var = scope.FindVar(ipt_name); + for (const Variable* var : ctx.MultiInputVar(input.first)) { if (var != nullptr) { const Tensor* t 
= nullptr; if (var->IsType()) { @@ -1062,10 +1109,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, - "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", - Type(), last_input_name, data_type, ipt_name, tmp); + "DataType of Paddle Op %s must be the same. Get (%d) != (%d)", + Type(), data_type, tmp); data_type = tmp; - last_input_name = ipt_name; } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index bad9716e8b409..446d27efa04e1 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -81,6 +81,10 @@ class RuntimeContext { RuntimeContext(const VariableNameMap& innames, const VariableNameMap& outnames, const Scope& scope); + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + VariableValueMap inputs; VariableValueMap outputs; }; @@ -101,6 +105,7 @@ class OperatorBase { /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); + void Run(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. virtual void Stop() {} @@ -167,6 +172,9 @@ class OperatorBase { void CheckAllInputOutputSet() const; virtual void RunImpl(const Scope& scope, const platform::Place& place) const = 0; + + virtual void RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const {} }; class ExecutionContext { @@ -458,6 +466,8 @@ class OperatorWithKernel : public OperatorBase { // same. 
proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + void RunImpl(const RuntimeContext& ctx, + const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 342cb68ab2bf8..239ff029dba8d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -31,6 +31,11 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); + + VLOG(3) << "apply var grad " << src_tensor->data()[0] << " " + << src_tensor->data()[1] << " " + << src_tensor->data()[2]; + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", dst_tensor->numel(), src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); @@ -38,16 +43,28 @@ void AddTo(Variable* src, Variable* dst) { for (size_t i = 0; i < src_tensor->numel(); ++i) { dst_data[i] += src_data[i]; } + + VLOG(3) << "apply var dst grad " << dst_tensor->data()[0] << " " + << dst_tensor->data()[1] << " " + << dst_tensor->data()[2]; } class Autograd { public: - explicit Autograd(framework::Scope* scope) : scope_(scope) {} + Autograd() {} void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); // TODO(panyx0718): Only create for vars that "require_grad" - (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + LOG(ERROR) << reinterpret_cast(var->grads_) << " vs " + << reinterpret_cast( + var->pre_op_ + ->output_vars_[var->pre_op_out_name_] + [var->pre_op_out_idx_] + ->grads_); + var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] + ->grads_->GetMutable() + ->ShareDataWith(var->grads_->Get()); std::deque ready; ready.push_back(var->pre_op_); @@ -57,18 +74,23 
@@ class Autograd { while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector input_grads = ready_op->ApplyGrad(scope_); - - for (size_t i = 0; i < input_grads.size(); ++i) { - if (!input_grads[i]) continue; - OpBase* pre_op = ready_op->pre_ops_->at(i); - if (!pre_op) continue; - - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); + std::map> input_grads = + ready_op->ApplyGrad(); + VLOG(3) << "after apply grad"; + + for (auto it : input_grads) { + const std::vector& ingrads = it.second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + OpBase* pre_op = (*ready_op->pre_ops_)[it.first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } } } } @@ -85,26 +107,25 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (OpBase* pre_op : *(candidate->pre_ops_)) { - if (!pre_op) continue; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); + for (auto it : *(candidate->pre_ops_)) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; } - ret[pre_op] += 1; } } - return ret; } - - framework::Scope* scope_; }; -framework::Variable* CreateVariable(const std::string& name, - const framework::DDim& dim, float val, - framework::Scope* scope, - bool random_name = true) { +void CreateVariable(const std::string& name, const framework::DDim& dim, + float val, bool random_name, framework::Variable* var) { + if (var->IsInitialized()) return; + std::string varname = name; if (random_name) { std::mt19937 rng; @@ -116,12 +137,9 @@ 
framework::Variable* CreateVariable(const std::string& name, } VLOG(3) << "creating var " << varname; - framework::Variable* var = scope->Var(varname); framework::LoDTensor* tensor = var->GetMutable(); - float* data = tensor->mutable_data(dim, platform::CPUPlace()); std::fill(data, data + tensor->numel(), val); - return var; } framework::LoDTensor& VarBase::Grad() { @@ -129,94 +147,56 @@ framework::LoDTensor& VarBase::Grad() { return *grads_->GetMutable(); } -void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { - VLOG(3) << "apply var grad " << var_desc_->Name() << " " - << grad->Get().data()[0]; - if (!grads_) { - grads_ = - CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), - var_->Get().dims(), 0.0, scope); +std::map> OpBase::ApplyGrad() { + if (!grad_op_desc_) { + VLOG(3) << "op with no grad: " << op_desc_->Type(); + return {}; } - AddTo(grad, grads_); - VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " - << grads_->Get().data()[0]; -} - -std::vector OpBase::ApplyGrad(framework::Scope* scope) { VLOG(3) << "op grad " << grad_op_desc_->Type(); - for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { - if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { - // grad op inputs can be forward inputs, so not in grad_to_var. - continue; - } - VLOG(3) << "op grad in var " << grad_invar; - block_->FindRecursiveOrCreateVar(grad_invar); - framework::Variable* var = scope->Var(grad_invar); - const std::string& invar = grad_to_var_->at(grad_invar); - for (VarBase* varbase : *output_vars_) { - // Use the accumulated grads_ by sharing the input with grads_. 
- if (varbase->var_desc_->Name() == invar) { - var->GetMutable()->ShareDataWith( - varbase->grads_->Get()); - break; - } + std::map> grad_outputs; + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + for (size_t i = 0; i < it.second.size(); ++i) { + outputs.push_back(new framework::Variable()); + outputs.back()->GetMutable(); + /* + auto& accum_grad_t = it.second[i]->Get(); + Variable* grad_var = outputs.back(); + float* data = grad_var->GetMutable() + ->mutable_data(accum_grad_t.dims(), platform::CPUPlace()); + std::fill(data, data + accum_grad_t.numel(), 0.0);*/ } } - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - VLOG(3) << "grad outvar " << outvar; - block_->FindRecursiveOrCreateVar(outvar); - framework::Variable* var = scope->Var(outvar); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block_->FindVar(outvar); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else { - LOG(ERROR) << "tracer doesn't support yet"; - } - } - } - grad_op_desc_->InferShape(*block_); + framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + + // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - - opbase->Run(*scope, platform::CPUPlace()); - - // `ret` matches exactly with `input_vars_` of forward op. 
- std::vector ret; - for (size_t i = 0; i < input_vars_->size(); ++i) { - bool found = false; - VarBase* origin_var = (*input_vars_)[i]; - for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { - Variable* var = scope->FindVar(outvar); - std::string orig_var = grad_to_var_->at(outvar); - if (origin_var->var_desc_->Name() != orig_var) { - continue; - } - VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; - origin_var->ApplyGrad(scope, var); - found = true; - ret.push_back(var); - // TODO(panyx0718): There might be another outvar with the same name. - // In that case, it doesn't matter the first one or the second one is - // used. - break; - } - if (!found) { - ret.push_back(nullptr); + opbase->Run(ctx, platform::CPUPlace()); + + for (auto it : grad_output_vars_) { + auto& outputs = grad_outputs[it.first]; + auto& origin_outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + framework::Variable* orig_grad = origin_outputs[i]; + AddTo(outputs[i], orig_grad); + VLOG(3) << "done add to " << grad_op_desc_->Outputs().at(it.first)[i]; } } - return ret; + return input_vars_; } -void VarBase::RunBackward(framework::Scope* scope) { - grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), - var_->Get().dims(), 1.0, scope, - false); +void VarBase::RunBackward() { + auto grads_t = grads_->GetMutable(); + float* data = grads_t->mutable_data(platform::CPUPlace()); + std::fill(data, data + grads_t->numel(), 1.0); + if (!pre_op_) return; - Autograd(scope).RunBackward(this); + Autograd().RunBackward(this); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 85a71ca83d21e..eb5fd553bdce0 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" 
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" @@ -33,18 +33,26 @@ class VarBase { : pre_op_(nullptr), pre_op_out_idx_(-1), var_desc_(nullptr), - var_(nullptr), - grads_(nullptr) {} - - virtual ~VarBase() {} - - void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + var_(new framework::Variable()), + grads_(new framework::Variable()) {} + + virtual ~VarBase() { + if (var_) { + delete var_; + var_ = nullptr; + } + if (grads_) { + delete grads_; + grads_ = nullptr; + } + } - void RunBackward(framework::Scope* scope); + void RunBackward(); framework::LoDTensor& Grad(); OpBase* pre_op_; + std::string pre_op_out_name_; int pre_op_out_idx_; framework::VarDesc* var_desc_; @@ -55,17 +63,12 @@ class VarBase { class OpBase { public: OpBase() - : input_vars_(new std::vector()), - output_vars_(new std::vector()), - pre_ops_(new std::vector()), - pre_ops_out_idx_(new std::vector()), + : pre_ops_(new std::map>()), + pre_ops_out_idx_(new std::map>()), op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete input_vars_; - delete output_vars_; - delete pre_ops_; delete pre_ops_out_idx_; @@ -73,16 +76,18 @@ class OpBase { if (grad_to_var_) delete grad_to_var_; } - std::vector ApplyGrad(framework::Scope* scope); + std::map> ApplyGrad(); - std::vector* input_vars_; - std::vector* output_vars_; - std::vector* pre_ops_; - std::vector* pre_ops_out_idx_; + std::map> input_vars_; + std::map> output_vars_; + std::map>* pre_ops_; + std::map>* pre_ops_out_idx_; framework::OpDesc* op_desc_; framework::OpDesc* grad_op_desc_; std::unordered_map* grad_to_var_; + std::map> grad_input_vars_; + std::map> grad_output_vars_; framework::BlockDesc* block_; }; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 97772dc110135..e7a60621cd559 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -41,6 +41,14 @@ void CreateGradOp(const framework::OpDesc& op_desc, 
*grad_op_desc = grad_op_descs[0].release(); } +void InitVar(framework::Variable* var, framework::Variable* grad_var) { + auto& var_t = var->Get(); + float* data = + grad_var->GetMutable()->mutable_data( + var_t.dims(), platform::CPUPlace()); + std::fill(data, data + var_t.numel(), 0.0); +} + class Tracer { public: explicit Tracer(framework::BlockDesc* root_block, @@ -53,10 +61,13 @@ class Tracer { virtual ~Tracer() { delete root_scope_; } - void Trace(OpBase* op, const std::vector& inputs, - const std::vector& outputs, + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, framework::BlockDesc* block) { - framework::Scope* scope = GetScope(block); + // framework::Scope* scope = GetScope(block); + std::map vars; + framework::OpDesc* op_desc = op->op_desc_; VLOG(3) << "tracer tracing " << op_desc->Type(); op_desc->InferShape(*block); @@ -64,48 +75,60 @@ class Tracer { std::unique_ptr op_base = framework::OpRegistry::CreateOp(*op_desc); - *op->input_vars_ = inputs; - for (VarBase* input : inputs) { - const std::string vname = input->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - input->var_ = var; - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); - if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { + auto& invars = invars_map[it.first]; + for (VarBase* inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", + op->op_desc_->Type(), inp->var_desc_->Name()); + + invars.push_back(inp->var_); + vars[inp->var_desc_->Name()] = inp; + if (inp->pre_op_) { + (*op->pre_ops_)[it.first].push_back(inp->pre_op_); + (*op->pre_ops_out_idx_)[it.first].push_back(inp->pre_op_out_idx_); } else { - LOG(ERROR) << "tracer doesn't support yet"; + (*op->pre_ops_)[it.first].push_back(nullptr); } + 
VLOG(3) << "input vname " << inp->var_desc_->Name() << " " + << inp->var_->Get().dims().size() + << reinterpret_cast(inp->var_); } - if (input->pre_op_) { - op->pre_ops_->push_back(input->pre_op_); - op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); - } else { - op->pre_ops_->push_back(nullptr); - } - VLOG(3) << "input vname " << vname << " " - << var->Get().dims().size(); } - *op->output_vars_ = outputs; - for (size_t i = 0; i < outputs.size(); ++i) { - const std::string vname = outputs[i]->var_desc_->Name(); - framework::Variable* var = scope->Var(vname); - if (!var->IsInitialized()) { - framework::VarDesc* var_desc = block->FindVar(vname); + op->output_vars_ = outputs; + for (auto it : op->output_vars_) { + auto& outvars = outvars_map[it.first]; + const std::vector& outputs = it.second; + for (size_t i = 0; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.push_back(out->var_); + vars[out->var_desc_->Name()] = out; + + framework::VarDesc* var_desc = block->FindVar(out->var_desc_->Name()); if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { - var->GetMutable(); + out->var_->GetMutable(); } else { LOG(ERROR) << "tracer doesn't support yet"; } + out->pre_op_ = op; + out->pre_op_out_name_ = it.first; + out->pre_op_out_idx_ = i; + + VLOG(3) << "output vname " << out->var_desc_->Name() << " " + << out->var_->Get().dims().size() << " " + << reinterpret_cast(out->var_) << " " + << out->var_->IsInitialized(); } - outputs[i]->var_ = var; - outputs[i]->pre_op_ = op; - outputs[i]->pre_op_out_idx_ = i; } VLOG(3) << "tracer running " << op_desc->Type(); - op_base->Run(*scope, platform::CPUPlace()); + framework::RuntimeContext ctx(invars_map, outvars_map); + op_base->Run(ctx, platform::CPUPlace()); + if (block == startup_block_) { op->grad_op_desc_ = nullptr; op->grad_to_var_ = nullptr; @@ -115,6 +138,39 @@ class Tracer { CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; op->grad_to_var_ = 
grad_to_var; + + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[it.first]; + for (const std::string& grad_invar : it.second) { + block->FindRecursiveOrCreateVar(grad_invar); + auto var_it = op->grad_to_var_->find(grad_invar); + if (var_it == op->grad_to_var_->end()) { + auto fwd_var_it = vars.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != vars.end()); + grad_in_vars.push_back(fwd_var_it->second->var_); + } else { + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + grad_in_vars.push_back(var->grads_); + } + } + } + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[it.first]; + for (const std::string& grad_outvar : it.second) { + block->FindRecursiveOrCreateVar(grad_outvar); + auto var_it = op->grad_to_var_->find(grad_outvar); + PADDLE_ENFORCE(var_it != op->grad_to_var_->end()); + VarBase* var = vars[var_it->second]; + if (!var->grads_->IsInitialized()) { + InitVar(var->var_, var->grads_); + } + LOG(ERROR) << grad_outvar << " map to " << var->var_desc_->Name(); + grad_out_vars.push_back(var->grads_); + } + } } op->block_ = block; } diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 38cb33e79048a..7b04c5d21f439 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -68,6 +68,41 @@ class FillConstantOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(dev_place); math::set_constant(dev_ctx, tensor, value); } + + void RunImpl(const framework::RuntimeContext &ctx, + const platform::Place &dev_place) const override { + auto data_type = + static_cast(Attr("dtype")); + auto value = Attr("value"); + auto force_cpu = Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + auto &out_var = *ctx.outputs.at("Out")[0]; + + if (out_var.IsType()) { + tensor = out_var.GetMutable(); + 
tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + auto cpu = platform::CPUPlace(); + tensor->mutable_data(cpu, data_type); + } else { + tensor->mutable_data(dev_place, data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + math::set_constant(dev_ctx, tensor, value); + } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 81d63aace04a4..2ffdc90d8477f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -124,9 +124,7 @@ PYBIND11_MODULE(core, m) { py::class_(m, "VarBase", R"DOC()DOC") .def(py::init<>()) .def("_run_backward", - [](imperative::VarBase &self, framework::Scope *scope) { - self.RunBackward(scope); - }) + [](imperative::VarBase &self) { self.RunBackward(); }) .def("_grad", &imperative::VarBase::Grad) .def_property( "desc", @@ -134,7 +132,13 @@ PYBIND11_MODULE(core, m) { [](imperative::VarBase &self, framework::VarDesc *var_desc) { self.var_desc_ = var_desc; }, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def_property("var", + [](const imperative::VarBase &self) { return self.var_; }, + [](imperative::VarBase &self, framework::Variable *var) { + self.var_ = var; + }, + py::return_value_policy::reference); py::class_(m, "OpBase", R"DOC()DOC") .def(py::init<>()) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index de30ed2fc5858..823b6d80be13b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -15,6 +15,7 @@ from __future__ import print_function import collections +from 
collections import defaultdict import contextlib import os import re @@ -369,13 +370,11 @@ def __init__(self, self._ivar.desc = self.desc def _numpy(self): - scope = _imperative_tracer().get_scope(self.block.desc) - tensor = core.get_variable_tensor(scope, self.desc.name()) + tensor = self._ivar.var.get_tensor() return np.array(tensor) def _backward(self): - scope = _imperative_tracer().get_scope(self.block.desc) - self._ivar._run_backward(scope) + self._ivar._run_backward() def _gradient(self): return np.array(self._ivar._grad()) @@ -692,20 +691,20 @@ def find_name(var_list, name): if _in_imperative_mode(): self.iop = core.OpBase() self.iop.desc = self.desc - self.inputs = [] + self.inputs = defaultdict(list) if inputs is not None: - for inp in inputs.values(): - if isinstance(inp, Variable): - self.inputs.append(inp) - elif isinstance(inp, list) or isinstance(inp, tuple): - self.inputs.extend(inp[:]) - self.outputs = [] + for k, v in six.iteritems(inputs): + if isinstance(v, Variable): + self.inputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.inputs[k].extend([var._ivar for var in v]) + self.outputs = defaultdict(list) if outputs is not None: - for out in outputs.values(): - if isinstance(out, Variable): - self.outputs.append(out) - elif isinstance(out, list) or isinstance(out, tuple): - self.outputs.extend(out[:]) + for k, v in six.iteritems(outputs): + if isinstance(v, Variable): + self.outputs[k].append(v._ivar) + elif isinstance(v, list) or isinstance(v, tuple): + self.outputs[k].extend([var._ivar for var in v]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1273,8 +1272,7 @@ def append_op(self, *args, **kwargs): op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, 
op.outputs, self.desc) self.ops.append(op) return op @@ -1325,8 +1323,7 @@ def _prepend_op(self, *args, **kwargs): op_desc = self.desc._prepend_op() op = Operator(self, op_desc, *args, **kwargs) if _in_imperative_mode(): - _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], - [v._ivar for v in op.outputs], self.desc) + _imperative_tracer().trace(op.iop, op.inputs, op.outputs, self.desc) self.ops.insert(0, op) return op diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py index aa48ef71aa610..61e243e288fad 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/imperative/base.py @@ -46,8 +46,7 @@ def to_variable(value, block=None): name=None, shape=value.shape, dtype=value.dtype) - scope = framework._imperative_tracer().get_scope(block.desc) - var = scope.var(py_var.name) + var = py_var._ivar.var tensor = var.get_tensor() tensor.set(value, core.CPUPlace()) return py_var diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 74b4a977db6b6..0a299bc2fbb3c 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -20,7 +20,7 @@ import sys import numpy as np -from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating +from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode from . 
import unique_name from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.imperative import base @@ -313,11 +313,20 @@ def create_parameter(self, param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - - self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True)) - return self.main_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr._to_kwargs()) + if _in_imperative_mode(): + self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) + return self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + else: + self.startup_program.global_block().create_parameter( + dtype=dtype, + shape=shape, + **attr._to_kwargs(with_initializer=True)) + return self.main_program.global_block().create_parameter( + dtype=dtype, shape=shape, **attr._to_kwargs()) def get_parameter(self, name): param = self.main_program.global_block().var(name) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285611..d83e2735ffe52 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -20,6 +20,7 @@ import numpy as np import six import os +import sys import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant @@ -9682,6 +9683,7 @@ def _build_once(self, inputs): shape=param_shape, dtype=self._dtype, is_bias=False) + sys.stderr.write('created w: %s\n' % self._w.name) def forward(self, inputs): tmp = self._helper.create_variable_for_type_inference(self._dtype) diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 0fe69d1bd4b1b..6368f9b44a6e2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ 
b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import contextlib import unittest import numpy as np @@ -38,7 +39,9 @@ def __init__(self): def forward(self, inputs): x = fluid.layers.relu(inputs[0]) self._x_for_debug = x - return [fluid.layers.elementwise_mul(x, x)] + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] class MLP(fluid.imperative.PyLayer): @@ -79,10 +82,12 @@ def test_layer_in_out(self): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[3], append_batch_size=False) - l = MyLayer() - x = l(inp)[0] + x = fluid.layers.relu(inp) + x_for_debug = x + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) param_grads = fluid.backward.append_backward( - x, parameter_list=[l._x_for_debug.name])[0] + x, parameter_list=[x_for_debug.name])[0] exe = fluid.Executor(fluid.CPUPlace()) static_out, static_grad = exe.run( From 61491ce250548122ec3abf3df0928c819906e091 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 16:29:55 +0800 Subject: [PATCH 069/103] clean test=develop --- paddle/fluid/framework/operator.cc | 14 +++++----- paddle/fluid/framework/operator.h | 10 ++++--- paddle/fluid/imperative/layer.cc | 32 ++++------------------ paddle/fluid/imperative/tracer.h | 29 ++------------------ paddle/fluid/operators/fill_constant_op.cc | 4 +-- paddle/fluid/pybind/imperative.cc | 4 +-- python/paddle/fluid/layers/nn.py | 2 -- 7 files changed, 24 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38675d2cac0bd..51b7f572c97e5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -182,7 +182,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { void OperatorBase::Run(const RuntimeContext& ctx, const platform::Place& place) 
{ - RunImpl(ctx, place); + RunImplPrepared(ctx, place); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -959,9 +959,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } -void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const { - Scope scope; +void OperatorWithKernel::RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const { + Scope dummy_scope; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -976,7 +976,7 @@ void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx)); + ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -999,9 +999,9 @@ void OperatorWithKernel::RunImpl(const RuntimeContext& ctx, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); + RuntimeInferShapeContext infer_shape_ctx(*this, dummy_scope, ctx); this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, scope, *dev_ctx, ctx)); + kernel_iter->second(ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); } void OperatorWithKernel::TransferInplaceVarsBack( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 446d27efa04e1..3605bf22fc779 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -173,8 +173,10 @@ class OperatorBase { virtual void RunImpl(const Scope& scope, const platform::Place& place) const = 0; - virtual void RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const {} + virtual void RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const { + PADDLE_THROW("%s doesn't 
support RunPreparedImpl", Type()); + } }; class ExecutionContext { @@ -466,8 +468,8 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; - void RunImpl(const RuntimeContext& ctx, - const platform::Place& place) const final; + void RunImplPrepared(const RuntimeContext& ctx, + const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 239ff029dba8d..7741865f9f699 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -31,11 +31,6 @@ using framework::Variable; void AddTo(Variable* src, Variable* dst) { framework::LoDTensor* dst_tensor = dst->GetMutable(); framework::LoDTensor* src_tensor = src->GetMutable(); - - VLOG(3) << "apply var grad " << src_tensor->data()[0] << " " - << src_tensor->data()[1] << " " - << src_tensor->data()[2]; - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", dst_tensor->numel(), src_tensor->numel()); float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); @@ -43,10 +38,6 @@ void AddTo(Variable* src, Variable* dst) { for (size_t i = 0; i < src_tensor->numel(); ++i) { dst_data[i] += src_data[i]; } - - VLOG(3) << "apply var dst grad " << dst_tensor->data()[0] << " " - << dst_tensor->data()[1] << " " - << dst_tensor->data()[2]; } class Autograd { @@ -55,16 +46,10 @@ class Autograd { void RunBackward(VarBase* var) { PADDLE_ENFORCE(var->pre_op_->op_desc_); - // TODO(panyx0718): Only create for vars that "require_grad" - LOG(ERROR) << reinterpret_cast(var->grads_) << " vs " - << reinterpret_cast( - var->pre_op_ - ->output_vars_[var->pre_op_out_name_] - [var->pre_op_out_idx_] - ->grads_); - var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] - 
->grads_->GetMutable() - ->ShareDataWith(var->grads_->Get()); + PADDLE_ENFORCE( + var->grads_ == + var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] + ->grads_); std::deque ready; ready.push_back(var->pre_op_); @@ -76,7 +61,6 @@ class Autograd { ready.pop_front(); std::map> input_grads = ready_op->ApplyGrad(); - VLOG(3) << "after apply grad"; for (auto it : input_grads) { const std::vector& ingrads = it.second; @@ -160,17 +144,12 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < it.second.size(); ++i) { outputs.push_back(new framework::Variable()); outputs.back()->GetMutable(); - /* - auto& accum_grad_t = it.second[i]->Get(); - Variable* grad_var = outputs.back(); - float* data = grad_var->GetMutable() - ->mutable_data(accum_grad_t.dims(), platform::CPUPlace()); - std::fill(data, data + accum_grad_t.numel(), 0.0);*/ } } framework::RuntimeContext ctx(grad_input_vars_, grad_outputs); + // No need to do static infer shape here. // grad_op_desc_->InferShape(*block_); grad_op_desc_->InferVarType(block_); @@ -184,7 +163,6 @@ std::map> OpBase::ApplyGrad() { for (size_t i = 0; i < outputs.size(); ++i) { framework::Variable* orig_grad = origin_outputs[i]; AddTo(outputs[i], orig_grad); - VLOG(3) << "done add to " << grad_op_desc_->Outputs().at(it.first)[i]; } } return input_vars_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index e7a60621cd559..6b2e97873759d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" @@ -53,19 +52,14 @@ class Tracer { public: explicit Tracer(framework::BlockDesc* root_block, framework::BlockDesc* startup_block) - : root_block_(root_block), startup_block_(startup_block) { - root_scope_ = new framework::Scope(); - 
scopes_[root_block_] = root_scope_; - scopes_[startup_block_] = root_scope_; - } + : root_block_(root_block), startup_block_(startup_block) {} - virtual ~Tracer() { delete root_scope_; } + virtual ~Tracer() {} void Trace(OpBase* op, const std::map>& inputs, const std::map>& outputs, framework::BlockDesc* block) { - // framework::Scope* scope = GetScope(block); std::map vars; framework::OpDesc* op_desc = op->op_desc_; @@ -94,8 +88,7 @@ class Tracer { (*op->pre_ops_)[it.first].push_back(nullptr); } VLOG(3) << "input vname " << inp->var_desc_->Name() << " " - << inp->var_->Get().dims().size() - << reinterpret_cast(inp->var_); + << inp->var_->IsInitialized(); } } @@ -119,8 +112,6 @@ class Tracer { out->pre_op_out_idx_ = i; VLOG(3) << "output vname " << out->var_desc_->Name() << " " - << out->var_->Get().dims().size() << " " - << reinterpret_cast(out->var_) << " " << out->var_->IsInitialized(); } } @@ -167,7 +158,6 @@ class Tracer { if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); } - LOG(ERROR) << grad_outvar << " map to " << var->var_desc_->Name(); grad_out_vars.push_back(var->grads_); } } @@ -175,22 +165,9 @@ class Tracer { op->block_ = block; } - framework::Scope* GetScope(framework::BlockDesc* block) { - if (scopes_.find(block) != scopes_.end()) { - return scopes_.at(block); - } - framework::BlockDesc* parent_block = block->ParentBlock(); - PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); - framework::Scope* scope = &scopes_[parent_block]->NewScope(); - scopes_[block] = scope; - return scope; - } - private: - std::map scopes_; framework::BlockDesc* root_block_; framework::BlockDesc* startup_block_; - framework::Scope* root_scope_; }; } // namespace imperative diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 7b04c5d21f439..d10fb1214c7f2 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -69,8 +69,8 @@ class 
FillConstantOp : public framework::OperatorBase { math::set_constant(dev_ctx, tensor, value); } - void RunImpl(const framework::RuntimeContext &ctx, - const platform::Place &dev_place) const override { + void RunImplPrepared(const framework::RuntimeContext &ctx, + const platform::Place &dev_place) const override { auto data_type = static_cast(Attr("dtype")); auto value = Attr("value"); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index be63fb877869b..7f9d937981ae8 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -28,9 +28,7 @@ void BindTracer(pybind11::module *m) { framework::BlockDesc *startup_block) { new (&self) imperative::Tracer(root_block, startup_block); }) - .def("trace", &imperative::Tracer::Trace) - .def("get_scope", &imperative::Tracer::GetScope, - pybind11::return_value_policy::reference); + .def("trace", &imperative::Tracer::Trace); } } // namespace pybind diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d83e2735ffe52..cc1fdbd285611 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -20,7 +20,6 @@ import numpy as np import six import os -import sys import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant @@ -9683,7 +9682,6 @@ def _build_once(self, inputs): shape=param_shape, dtype=self._dtype, is_bias=False) - sys.stderr.write('created w: %s\n' % self._w.name) def forward(self, inputs): tmp = self._helper.create_variable_for_type_inference(self._dtype) From 7b6bf9ddf23a70a0f67dcf412034d9cf8a02e5ef Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 19:17:37 +0800 Subject: [PATCH 070/103] make fill_constant kernel-based test=develop --- paddle/fluid/operators/fill_constant_op.cc | 113 +++++------------- paddle/fluid/operators/fill_constant_op.cu.cc | 20 ++++ paddle/fluid/operators/fill_constant_op.h | 64 ++++++++++ paddle/fluid/pybind/imperative.cc | 1 
- 4 files changed, 111 insertions(+), 87 deletions(-) create mode 100644 paddle/fluid/operators/fill_constant_op.cu.cc create mode 100644 paddle/fluid/operators/fill_constant_op.h diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index d10fb1214c7f2..6c7b9fa115500 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -12,103 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/fill_constant_op.h" namespace paddle { namespace operators { -class FillConstantInferShape : public framework::InferShapeBase { +class FillConstantOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto& shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } -}; - -class FillConstantOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - framework::Tensor *tensor = nullptr; - - auto &out_var = *scope.FindVar(Output("Out")); - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } 
else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); - } - - void RunImplPrepared(const framework::RuntimeContext &ctx, - const platform::Place &dev_place) const override { - auto data_type = - static_cast(Attr("dtype")); - auto value = Attr("value"); - auto force_cpu = Attr("force_cpu"); - - framework::Tensor *tensor = nullptr; - - auto &out_var = *ctx.outputs.at("Out")[0]; - - if (out_var.IsType()) { - tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else if (out_var.IsType()) { - tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); - } else { - PADDLE_THROW( - "fill constant op's output only" - "supports SelectedRows and LoDTensor"); - } - - if (force_cpu) { - auto cpu = platform::CPUPlace(); - tensor->mutable_data(cpu, data_type); - } else { - tensor->mutable_data(dev_place, data_type); - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - math::set_constant(dev_ctx, tensor, value); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + 
void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + auto& out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetDataType(data_type); + } }; class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -142,7 +79,11 @@ Fill up a variable with specified constant value. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, - ops::FillConstantInferShape, ops::FillConstantOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::FillConstantOpVarTypeInference); + +REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, + ops::FillConstantOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc new file mode 100644 index 0000000000000..fba5583505afe --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_constant_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h new file mode 100644 index 0000000000000..417c5b4da611c --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { +template +class FillConstantKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto value = ctx.Attr("value"); + auto force_cpu = ctx.Attr("force_cpu"); + + framework::Tensor *tensor = nullptr; + + framework::Variable *out_var = ctx.OutputVar("Out"); + + if (out_var->IsType()) { + tensor = out_var->GetMutable(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable()->mutable_value(); + tensor->Resize( + framework::make_ddim(ctx.Attr>("shape"))); + } else { + PADDLE_THROW( + "fill constant op's output only" + "supports SelectedRows and LoDTensor"); + } + + if (force_cpu) { + tensor->mutable_data(platform::CPUPlace(), data_type); + } else { + tensor->mutable_data(ctx.GetPlace(), data_type); + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + math::set_constant(dev_ctx, tensor, value); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7f9d937981ae8..819943508b04a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/imperative/tracer.h" namespace paddle { From 4e80e04f230cdd1c8e14eabfd204329b33867f8c Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 26 Dec 2018 19:22:32 +0800 Subject: [PATCH 071/103] fix test=develop --- paddle/fluid/framework/operator.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 51b7f572c97e5..ea3f4b7715bde 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -1104,8 +1103,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized", - ipt_name); + PADDLE_ENFORCE(t->IsInitialized(), "Input is not initialized"); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, From f52b514dcd2db6dcec5c817ac516baf5af4273eb Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 09:38:44 +0800 Subject: [PATCH 072/103] call kernel --- paddle/fluid/framework/operator.cc | 11 ++- paddle/fluid/framework/operator.h | 5 +- paddle/fluid/imperative/layer.cc | 30 +++++--- paddle/fluid/imperative/layer.h | 73 +++++++++++++++---- paddle/fluid/imperative/tracer.h | 29 +++++--- paddle/fluid/operators/fill_constant_op.cc | 3 +- python/paddle/fluid/layer_helper.py | 2 + .../fluid/tests/unittests/test_imperative.py | 9 +-- 8 files changed, 114 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ea3f4b7715bde..dc365a954d1c8 100644 --- 
a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -179,8 +179,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } -void OperatorBase::Run(const RuntimeContext& ctx, - const platform::Place& place) { +void OperatorBase::RunPrepared(const RuntimeContext& ctx, + const platform::Place& place) { RunImplPrepared(ctx, place); } @@ -1092,7 +1092,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { int data_type = -1; for (auto& input : this->inputs_) { - for (const Variable* var : ctx.MultiInputVar(input.first)) { + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; if (var != nullptr) { const Tensor* t = nullptr; if (var->IsType()) { @@ -1103,7 +1105,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input is not initialized"); + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", + input.first, i); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3605bf22fc779..a6bdc0bfa7337 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,7 +105,7 @@ class OperatorBase { /// Executor will call this interface function to Run an op. // The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); - void Run(const RuntimeContext& ctx, const platform::Place& place); + void RunPrepared(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. 
virtual void Stop() {} @@ -457,8 +457,9 @@ class OperatorWithKernel : public OperatorBase { void RuntimeInferShape(const Scope& scope, const platform::Place& place, const RuntimeContext& ctx) const override; - protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + protected: virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 7741865f9f699..0d850ee162bd8 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -45,12 +45,6 @@ class Autograd { Autograd() {} void RunBackward(VarBase* var) { - PADDLE_ENFORCE(var->pre_op_->op_desc_); - PADDLE_ENFORCE( - var->grads_ == - var->pre_op_->output_vars_[var->pre_op_out_name_][var->pre_op_out_idx_] - ->grads_); - std::deque ready; ready.push_back(var->pre_op_); @@ -66,7 +60,7 @@ class Autograd { const std::vector& ingrads = it.second; for (size_t i = 0; i < ingrads.size(); ++i) { if (!ingrads[i]) continue; - OpBase* pre_op = (*ready_op->pre_ops_)[it.first][i]; + OpBase* pre_op = ready_op->pre_ops_[it.first][i]; if (!pre_op) continue; dep_counts[pre_op] -= 1; @@ -91,7 +85,7 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (auto it : *(candidate->pre_ops_)) { + for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; if (visited.find(pre_op) == visited.end()) { @@ -138,11 +132,13 @@ std::map> OpBase::ApplyGrad() { } VLOG(3) << "op grad " << grad_op_desc_->Type(); + std::vector> tmp_vars; std::map> grad_outputs; for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; for (size_t i = 0; i < it.second.size(); ++i) { - outputs.push_back(new framework::Variable()); + tmp_vars.emplace_back(new framework::Variable()); + outputs.push_back(tmp_vars.back().get()); 
outputs.back()->GetMutable(); } } @@ -155,7 +151,15 @@ std::map> OpBase::ApplyGrad() { std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc_); - opbase->Run(ctx, platform::CPUPlace()); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); for (auto it : grad_output_vars_) { auto& outputs = grad_outputs[it.first]; @@ -169,11 +173,15 @@ std::map> OpBase::ApplyGrad() { } void VarBase::RunBackward() { + if (!pre_op_) return; + auto grads_t = grads_->GetMutable(); float* data = grads_t->mutable_data(platform::CPUPlace()); std::fill(data, data + grads_t->numel(), 1.0); - if (!pre_op_) return; + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); Autograd().RunBackward(this); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index eb5fd553bdce0..6225edea77585 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -25,6 +25,59 @@ namespace paddle { namespace imperative { +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx) + : op(op), ctx(ctx), func(func), dev_ctx(dev_ctx) {} + + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + framework::Scope dummy_scope; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. 
+ auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = op.GetExpectedKernelType( + framework::ExecutionContext(op, dummy_scope, *dev_ctx, ctx)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx); + } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; +}; class OpBase; class VarBase { @@ -62,30 +115,22 @@ class VarBase { class OpBase { public: - OpBase() - : pre_ops_(new std::map>()), - pre_ops_out_idx_(new std::map>()), - op_desc_(nullptr), - grad_op_desc_(nullptr) {} + OpBase() : op_desc_(nullptr), grad_op_desc_(nullptr) {} virtual ~OpBase() { - delete pre_ops_; - delete pre_ops_out_idx_; - if (grad_op_desc_) delete grad_op_desc_; - if (grad_to_var_) delete grad_to_var_; } std::map> ApplyGrad(); + framework::OpDesc* op_desc_; + framework::OpDesc* grad_op_desc_; + std::map> input_vars_; std::map> 
output_vars_; - std::map>* pre_ops_; - std::map>* pre_ops_out_idx_; - framework::OpDesc* op_desc_; + std::map> pre_ops_; + std::map> pre_ops_out_idx_; - framework::OpDesc* grad_op_desc_; - std::unordered_map* grad_to_var_; std::map> grad_input_vars_; std::map> grad_output_vars_; framework::BlockDesc* block_; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 6b2e97873759d..1f0c7b30b44dd 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -82,10 +82,10 @@ class Tracer { invars.push_back(inp->var_); vars[inp->var_desc_->Name()] = inp; if (inp->pre_op_) { - (*op->pre_ops_)[it.first].push_back(inp->pre_op_); - (*op->pre_ops_out_idx_)[it.first].push_back(inp->pre_op_out_idx_); + op->pre_ops_[it.first].push_back(inp->pre_op_); + op->pre_ops_out_idx_[it.first].push_back(inp->pre_op_out_idx_); } else { - (*op->pre_ops_)[it.first].push_back(nullptr); + op->pre_ops_[it.first].push_back(nullptr); } VLOG(3) << "input vname " << inp->var_desc_->Name() << " " << inp->var_->IsInitialized(); @@ -118,24 +118,33 @@ class Tracer { VLOG(3) << "tracer running " << op_desc->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); - op_base->Run(ctx, platform::CPUPlace()); + // op_base->RunPrepared(ctx, platform::CPUPlace()); + + // TODO(panyx0718): Cache p. 
+ framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + platform::CPUPlace place; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place); + p.op.RuntimeInferShape(scope, place, ctx); + p.func(framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx)); if (block == startup_block_) { op->grad_op_desc_ = nullptr; - op->grad_to_var_ = nullptr; } else { framework::OpDesc* grad_op_desc; auto grad_to_var = new std::unordered_map(); CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; for (auto it : grad_op_desc->Inputs()) { auto& grad_in_vars = op->grad_input_vars_[it.first]; for (const std::string& grad_invar : it.second) { block->FindRecursiveOrCreateVar(grad_invar); - auto var_it = op->grad_to_var_->find(grad_invar); - if (var_it == op->grad_to_var_->end()) { + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { auto fwd_var_it = vars.find(grad_invar); PADDLE_ENFORCE(fwd_var_it != vars.end()); grad_in_vars.push_back(fwd_var_it->second->var_); @@ -152,8 +161,8 @@ class Tracer { auto& grad_out_vars = op->grad_output_vars_[it.first]; for (const std::string& grad_outvar : it.second) { block->FindRecursiveOrCreateVar(grad_outvar); - auto var_it = op->grad_to_var_->find(grad_outvar); - PADDLE_ENFORCE(var_it != op->grad_to_var_->end()); + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end()); VarBase* var = vars[var_it->second]; if (!var->grads_->IsInitialized()) { InitVar(var->var_, var->grads_); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 6c7b9fa115500..73f38de08e3fc 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -86,4 +86,5 @@ REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, 
ops::FillConstantOpMaker, REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, - ops::FillConstantKernel); + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 0a299bc2fbb3c..8543cb847d3dc 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -316,6 +316,8 @@ def create_parameter(self, if _in_imperative_mode(): self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, **attr._to_kwargs()) + # In imperative mode, we want the returned parameter to be + # initialized so that it can be used imperatively. return self.startup_program.global_block().create_parameter( dtype=dtype, shape=shape, diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py index 6368f9b44a6e2..6b6ab227deab0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative.py +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import contextlib import unittest import numpy as np @@ -82,12 +81,10 @@ def test_layer_in_out(self): with new_program_scope(): inp = fluid.layers.data( name="inp", shape=[3], append_batch_size=False) - x = fluid.layers.relu(inp) - x_for_debug = x - x = fluid.layers.elementwise_mul(x, x) - x = fluid.layers.reduce_sum(x) + l = MyLayer() + x = l(inp)[0] param_grads = fluid.backward.append_backward( - x, parameter_list=[x_for_debug.name])[0] + x, parameter_list=[l._x_for_debug.name])[0] exe = fluid.Executor(fluid.CPUPlace()) static_out, static_grad = exe.run( From b91a7a9d3073e4e38f659f4353dbf4eb0215d816 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 09:41:18 +0800 Subject: [PATCH 073/103] clear operator changes test=develop --- paddle/fluid/framework/operator.cc | 50 ------------------------------ paddle/fluid/framework/operator.h | 8 ----- paddle/fluid/imperative/tracer.h | 1 - 3 files changed, 59 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index dc365a954d1c8..d67782319d7a0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -179,11 +179,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(3) << place << " " << DebugStringEx(&scope); } -void OperatorBase::RunPrepared(const RuntimeContext& ctx, - const platform::Place& place) { - RunImplPrepared(ctx, place); -} - bool OperatorBase::HasInputs(const std::string& name) const { return inputs_.find(name) != inputs_.end(); } @@ -958,51 +953,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } -void OperatorWithKernel::RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const { - Scope dummy_scope; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. 
- auto& all_op_kernels = AllOpKernels(); - auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); - } - - OpKernelMap& kernels = kernels_iter->second; - - auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - - auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = LibraryType::kPlain; - expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } -#endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } - - if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) { - dev_ctx = pool.Get(expected_kernel_key.place_); - } - - RuntimeInferShapeContext infer_shape_ctx(*this, dummy_scope, ctx); - this->InferShape(&infer_shape_ctx); - kernel_iter->second(ExecutionContext(*this, dummy_scope, *dev_ctx, ctx)); -} - void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index a6bdc0bfa7337..e2bedc60d273b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -105,7 +105,6 @@ class OperatorBase { /// Executor will call this interface function to Run an op. 
// The implementation should be written at RunImpl void Run(const Scope& scope, const platform::Place& place); - void RunPrepared(const RuntimeContext& ctx, const platform::Place& place); // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. virtual void Stop() {} @@ -172,11 +171,6 @@ class OperatorBase { void CheckAllInputOutputSet() const; virtual void RunImpl(const Scope& scope, const platform::Place& place) const = 0; - - virtual void RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const { - PADDLE_THROW("%s doesn't support RunPreparedImpl", Type()); - } }; class ExecutionContext { @@ -469,8 +463,6 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; - void RunImplPrepared(const RuntimeContext& ctx, - const platform::Place& place) const final; /** * Transfer data from scope to a transfered scope. If there is no data need to diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 1f0c7b30b44dd..c814da9853cc1 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -118,7 +118,6 @@ class Tracer { VLOG(3) << "tracer running " << op_desc->Type(); framework::RuntimeContext ctx(invars_map, outvars_map); - // op_base->RunPrepared(ctx, platform::CPUPlace()); // TODO(panyx0718): Cache p. framework::OperatorWithKernel* op_kernel = From c132c790111d7fadf212a72ec2cd35e03aed364f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 11:23:24 +0800 Subject: [PATCH 074/103] address comments and resolve conflicts. 
test=develop --- paddle/fluid/imperative/layer.cc | 20 -------------------- paddle/fluid/imperative/layer.h | 3 +-- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 0d850ee162bd8..26e7830265ea6 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -100,26 +100,6 @@ class Autograd { } }; -void CreateVariable(const std::string& name, const framework::DDim& dim, - float val, bool random_name, framework::Variable* var) { - if (var->IsInitialized()) return; - - std::string varname = name; - if (random_name) { - std::mt19937 rng; - rng.seed(std::random_device()()); - std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); - int id = dist6(rng); - varname = string::Sprintf("%s@%d", varname, id); - } - - VLOG(3) << "creating var " << varname; - framework::LoDTensor* tensor = var->GetMutable(); - float* data = tensor->mutable_data(dim, platform::CPUPlace()); - std::fill(data, data + tensor->numel(), val); -} - framework::LoDTensor& VarBase::Grad() { VLOG(3) << "get var grad " << var_desc_->Name(); return *grads_->GetMutable(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 6225edea77585..ae4e8e0f8a39d 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -36,7 +36,6 @@ class PreparedOp { static PreparedOp Prepare(const framework::RuntimeContext& ctx, const framework::OperatorWithKernel& op, const platform::Place& place) { - framework::Scope dummy_scope; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -52,7 +51,7 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = op.GetExpectedKernelType( - framework::ExecutionContext(op, dummy_scope, *dev_ctx, ctx)); + framework::ExecutionContext(op, framework::Scope(), *dev_ctx, ctx)); VLOG(3) << 
"expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); From 66ea718452584d8114e5adcebc15e48781ad93bf Mon Sep 17 00:00:00 2001 From: haowang101779990 <101779990@student.swin.edu.au> Date: Wed, 26 Dec 2018 21:29:01 -0800 Subject: [PATCH 075/103] en api improve format Dec 27 test=develop --- python/paddle/fluid/data_feeder.py | 3 +- python/paddle/fluid/framework.py | 4 +- python/paddle/fluid/layers/control_flow.py | 9 +- python/paddle/fluid/layers/detection.py | 120 ++--- python/paddle/fluid/layers/io.py | 11 +- python/paddle/fluid/layers/nn.py | 467 ++++++++++-------- python/paddle/fluid/layers/tensor.py | 11 +- python/paddle/fluid/metrics.py | 22 +- .../fluid/transpiler/distribute_transpiler.py | 23 +- 9 files changed, 379 insertions(+), 291 deletions(-) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index af02721eb72c1..c280ff21eec8d 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -272,8 +272,7 @@ def decorate_reader(self, dict: the result of conversion. Raises: - ValueError: If drop_last is False and the data batch which cannot - fit for devices. + ValueError: If drop_last is False and the data batch which cannot fit for devices. """ def __reader_creator__(): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3427fb0c4ae4d..2a31379d8bdf3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1646,8 +1646,8 @@ def to_string(self, throw_on_error, with_details=False): parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need to print. - Returns - (str): The debug string. + Returns: + str : The debug string. 
Raises: ValueError: If any of required fields is not set and throw_on_error is diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9d98e8333ba07..a7494aaceab42 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1452,6 +1452,7 @@ def __init__(self, name=None): def step_input(self, x): """ Mark a sequence as a dynamic RNN input. + Args: x(Variable): The input sequence. @@ -1505,6 +1506,7 @@ def static_input(self, x): """ Mark a variable as a RNN input. The input will not be scattered into time steps. + Args: x(Variable): The input variable. @@ -1629,13 +1631,11 @@ def memory(self, Args: init(Variable|None): The initialized variable. - shape(list|tuple): The memory shape. NOTE the shape does not contain - batch_size. + shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size. value(float): the initalized value. - need_reorder(bool): True if the initialized memory depends on the - input sample. + need_reorder(bool): True if the initialized memory depends on the input sample. dtype(str|numpy.dtype): The data type of the initialized memory. @@ -1714,6 +1714,7 @@ def update_memory(self, ex_mem, new_mem): """ Update the memory from ex_mem to new_mem. NOTE that the shape and data type of :code:`ex_mem` and :code:`new_mem` must be same. + Args: ex_mem(Variable): the memory variable. new_mem(Variable): the plain variable generated in RNN block. diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index ce731f39ea099..8aed97dc59b10 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -65,7 +65,7 @@ def rpn_target_assign(bbox_pred, rpn_negative_overlap=0.3, use_random=True): """ - ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. 
** + **Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection.** This layer can be, for given the Intersection-over-Union (IoU) overlap between anchors and ground truth boxes, to assign classification and @@ -135,19 +135,20 @@ def rpn_target_assign(bbox_pred, Examples: .. code-block:: python - bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], - append_batch_size=False, dtype='float32') - cls_logits = layers.data(name='cls_logits', shape=[100, 1], - append_batch_size=False, dtype='float32') - anchor_box = layers.data(name='anchor_box', shape=[20, 4], - append_batch_size=False, dtype='float32') - gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = - fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, - cls_logits=cls_logits, - anchor_box=anchor_box, - gt_boxes=gt_boxes) + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], + append_batch_size=False, dtype='float32') + cls_logits = layers.data(name='cls_logits', shape=[100, 1], + append_batch_size=False, dtype='float32') + anchor_box = layers.data(name='anchor_box', shape=[20, 4], + append_batch_size=False, dtype='float32') + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], + append_batch_size=False, dtype='float32') + loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + gt_boxes=gt_boxes) + """ helper = LayerHelper('rpn_target_assign', **locals()) @@ -1519,27 +1520,30 @@ def anchor_generator(input, Args: input(Variable): The input feature map, the format is NCHW. anchor_sizes(list|tuple|float): The anchor sizes of generated anchors, - given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor equals to 64**2. + given in absolute pixels e.g. [64., 128., 256., 512.]. 
+ For instance, the anchor size of 64 means the area of this anchor equals to 64**2. aspect_ratios(list|tuple|float): The height / width ratios of generated - anchors, e.g. [0.5, 1.0, 2.0]. + anchors, e.g. [0.5, 1.0, 2.0]. variance(list|tuple): The variances to be used in box regression deltas. - Default:[0.1, 0.1, 0.2, 0.2]. - stride(list|turple): The anchors stride across width and height, - e.g. [16.0, 16.0] + Default:[0.1, 0.1, 0.2, 0.2]. + stride(list|tuple): The anchors stride across width and height, e.g. [16.0, 16.0] offset(float): Prior boxes center offset. Default: 0.5 name(str): Name of the prior box op. Default: None. Returns: - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. - H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - Variances(Variable): The expanded variances of anchors - with a layout of [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_anchors is the box count of each position. - Each variance is in (xcenter, ycenter, w, h) format. + Anchors(Variable),Variances(Variable): + + two variables: + + - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4]. \ + H is the height of input, W is the width of input, \ + num_anchors is the box count of each position. \ + Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. + - Variances(Variable): The expanded variances of anchors \ + with a layout of [H, W, num_priors, 4]. \ + H is the height of input, W is the width of input \ + num_anchors is the box count of each position. \ + Each variance is in (xcenter, ycenter, w, h) format.
Examples: @@ -1748,35 +1752,35 @@ def generate_proposals(scores, eta=1.0, name=None): """ - ** Generate proposal Faster-RCNN ** - - This operation proposes RoIs according to each box with their probability to be a foreground object and - the box can be calculated by anchors. Bbox_deltais and scores to be an object are the output of RPN. Final proposals - could be used to train detection net. - - For generating proposals, this operation performs following steps: - - 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposals candidates. - 3. Clip boxes to image - 4. Remove predicted boxes with small area. - 5. Apply NMS to get final proposals as output. - - - Args: - scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and width of the feature map. - bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the differece between predicted box locatoin and anchor location. - im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale - between origin image size and the size of feature map. - anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized. - variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. - pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. - post_nms_top_n(float): Number of total bboxes to be kept per image after NMS. 1000 by default. - nms_thresh(float): Threshold in NMS, 0.5 by default. 
- min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. - eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + **Generate proposal Faster-RCNN** + + This operation proposes RoIs according to each box with their probability to be a foreground object and + the box can be calculated by anchors. Bbox_deltas and scores to be an object are the output of RPN. Final proposals + could be used to train detection net. + + For generating proposals, this operation performs following steps: + + 1. Transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) + 2. Calculate box locations as proposals candidates. + 3. Clip boxes to image + 4. Remove predicted boxes with small area. + 5. Apply NMS to get final proposals as output. + + Args: + scores(Variable): A 4-D Tensor with shape [N, A, H, W] represents the probability for each box to be an object. + N is batch size, A is number of anchors, H and W are height and width of the feature map. + bbox_deltas(Variable): A 4-D Tensor with shape [N, 4*A, H, W] represents the difference between predicted box location and anchor location. + im_info(Variable): A 2-D Tensor with shape [N, 3] represents origin image information for N batch. Info contains height, width and scale + between origin image size and the size of feature map. + anchors(Variable): A 4-D Tensor represents the anchors with a layout of [H, W, A, 4]. H and W are height and width of the feature map, + num_anchors is the box count of each position. Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. + variances(Variable): The expanded variances of anchors with a layout of [H, W, num_priors, 4]. Each variance is in (xcenter, ycenter, w, h) format. + pre_nms_top_n(float): Number of total bboxes to be kept per image before NMS. 6000 by default. + post_nms_top_n(float): Number of total bboxes to be kept per image after NMS.
1000 by default. + nms_thresh(float): Threshold in NMS, 0.5 by default. + min_size(float): Remove predicted boxes with either height or width < min_size. 0.1 by default. + eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. + """ helper = LayerHelper('generate_proposals', **locals()) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 42f4959a83fe1..9a29b2509357c 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -949,12 +949,11 @@ def shuffle(reader, buffer_size): is determined by argument buf_size. Args: - param reader: the original reader whose output will be shuffled. - type reader: callable - param buf_size: shuffle buffer size. - type buf_size: int - return: the new reader whose output is shuffled. - rtype: callable + reader(callable): the original reader whose output will be shuffled. + buf_size(int): shuffle buffer size. + + Returns: + callable: the new reader whose output is shuffled. """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285611..8f43c6f226bf5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -233,7 +233,7 @@ def fc(input, dimensions will be flatten to form the first dimension of the final matrix (height of the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to form the second dimension of the final matrix (width of the matrix). For example, suppose - `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. + `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. 
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. @@ -502,46 +502,48 @@ def lstm(input, If Device is GPU, This op will use cudnn LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections. - In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ - - $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ - - $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ - - $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ - - $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ - - $$ h_t = o_t \\odot tanh(c_t) $$ - - - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + .. math:: + + i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) + + f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) + + o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) + + \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) + + c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t} + + h_t &= o_t \odot tanh(c_t) + + - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix of weights from the input gate to the input) - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). - sigmoid is the logistic sigmoid function. - $i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as the cell output activation vector $h$. - - The $\odot$ is the element-wise product of the vectors. - - `tanh` is the activation functions. 
- - $\tilde{c_t}$ is also called candidate hidden state, + - The :math:`\odot` is the element-wise product of the vectors. + - :math:`tanh` is the activation functions. + - :math:`\\tilde{c_t}` is also called candidate hidden state, which is computed based on the current input and the previous hidden state. - Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication, X represensts a matrix multiplication Args: input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) - init_h(Variable): The initial hidden state of the LSTM + init_h(Variable): The initial hidden state of the LSTM This is a tensor with shape ( num_layers x batch_size x hidden_size) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) init_c(Variable): The initial cell state of the LSTM. This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + max_len (int): max length of LSTM. 
the first dim of input tensor CAN NOT be greater than max_len hidden_size (int): hidden size of the LSTM num_layers (int): total layers number of the LSTM dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps @@ -556,14 +558,18 @@ def lstm(input, Returns: - rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) - if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) - last_h(Tensor): the hidden state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) - last_c(Tensor): the cell state of the last step of LSTM - shape is ( num_layers x batch_size x hidden_size ) - if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + rnn_out(Tensor),last_h(Tensor),last_c(Tensor): + + Three tensors, rnn_out, last_h, last_c: + + - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \ + if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2) + - last_h is the hidden state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + - last_c(Tensor): the cell state of the last step of LSTM \ + shape is ( num_layers x batch_size x hidden_size ) \ + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) Examples: @@ -1220,6 +1226,8 @@ def dropout(x, probability) the outputs of some units to zero, while others are remain unchanged. + dropout op can be removed from the program to make the program more efficient. + Args: x (Variable): The input tensor variable. dropout_prob (float): Probability of setting units to zero. @@ -1230,22 +1238,24 @@ def dropout(x, units will be dropped. DO NOT use a fixed seed in training. name (str|None): A name for this layer(optional).
If set None, the layer will be named automatically. - dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] + dropout_implementation(string): ['downgrade_in_infer'(default)|'upscale_in_train'] + 1. downgrade_in_infer(default), downgrade the outcome at inference - train: out = input * mask - inference: out = input * dropout_prob - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) + + - train: out = input * mask + - inference: out = input * dropout_prob + + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) 2. upscale_in_train, upscale the outcome at training time - train: out = input * mask / ( 1.0 - dropout_prob ) - inference: out = input - (make is a tensor same shape with input, value is 0 or 1 - ratio of 0 is dropout_prob) - dropout op can be removed from the program. - the program will be efficient + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + (mask is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) + Returns: Variable: A tensor variable is the shape with `x`. @@ -1333,11 +1343,15 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal. - 2) when `soft_label == True`, and the 2nd dimension of - `input` and `label` are not equal. - 3) when `soft_label == False`, and the 2nd dimension of - `label` is not 1. + ValueError: + + 1. the 1st dimension of ``input`` and ``label`` are not equal. + + 2. when ``soft_label == True``, and the 2nd dimension of + ``input`` and ``label`` are not equal. + + 3. when ``soft_label == False``, and the 2nd dimension of + ``label`` is not 1. Examples: .. code-block:: python @@ -1457,8 +1471,8 @@ def chunk_eval(input, This function computes and outputs the precision, recall and F1-score of chunk detection. 
- For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + For some basics of chunking, please refer to + `Chunking with Support Vector Machines `_ . ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -1823,7 +1837,7 @@ def conv2d(input, of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d @@ -2276,7 +2290,7 @@ def sequence_slice(input, offset, length, name=None): .. code-block:: text - - Case: + - Case: Given the input Variable **input**: @@ -2292,7 +2306,8 @@ def sequence_slice(input, offset, length, name=None): out.lod = [[2, 1]], out.dims = (3, 2). - NOTE: The first dimension size of **input**, **offset** and **length** + Note: + The first dimension size of **input**, **offset** and **length** should be equal. The **offset** should start from 0. Args: @@ -3013,7 +3028,7 @@ def group_norm(input, """ **Group Normalization Layer** - Refer to `Group Normalization ` + Refer to `Group Normalization `_ . Args: input(Variable): The input tensor variable. 
@@ -3140,8 +3155,8 @@ def conv2d_transpose(input, H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\ W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\ - H_{out} \in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ - W_{out} \in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) + H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\ + W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) Args: input(Variable): The input image with [N, C, H, W] format. @@ -4673,7 +4688,7 @@ def ctc_greedy_decoder(input, blank, name=None): [0.5, 0.1, 0.3, 0.1]] input.lod = [[4, 4]] - + Computation: step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: @@ -4704,10 +4719,10 @@ def ctc_greedy_decoder(input, blank, name=None): name (str): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. - 'Lp' is the sum if all output sequences' length. If all the sequences - in result were empty, the result LoDTensor will be [-1] with - LoD [[]] and dims [1, 1]. + Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \ + 'Lp' is the sum if all output sequences' length. If all the sequences \ + in result were empty, the result LoDTensor will be [-1] with \ + LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -5060,7 +5075,7 @@ def hsigmoid(input, """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a - complete binary tree, or you can use is_custom to pass your own tree to + complete binary tree, or you can use is_custom to pass your own tree to implement hierarchical. Each leaf node represents a class(a word) and each internal node acts as a binary classifier. 
For each word there's a unique path from root to it's leaf node, hsigmoid calculate the cost for each @@ -5072,13 +5087,13 @@ def hsigmoid(input, `_ And if you want to use the costumed tree by set 'is_custom' as true you may need to do following things first: - 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict - 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. - 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code - means label of each binary classification, using 1 indicate true, 0 indicate false. - 4. now, each word should has its path and code along the path, you can pass a batch of path and code - related to the same batch of inputs. + 1. using your word dict to build a binary tree, each leaf node should be an word of your word dict + 2. build a dict to store word_id -> word's leaf to root path, we call it path_table. + 3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code. Code + means label of each binary classification, using 1 indicate true, 0 indicate false. + 4. now, each word should has its path and code along the path, you can pass a batch of path and code + related to the same batch of inputs. Args: input (Variable): The input tensor variable with shape @@ -5086,8 +5101,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set, - it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num + num_classes: (int), The number of classes, must not be less than 2. 
with default tree this has to be set, + it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num which indicates the num of classes using by binary classify. param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid @@ -5100,15 +5115,15 @@ def hsigmoid(input, is not set, the bias is initialized zero. Default: None. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. - path_table: (Variable|None) this variable can store each batch of samples' path to root, + path_table: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order - path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like - structure and each element in this array is indexes in parent nodes' Weight Matrix. - path_code: (Variable|None) this variable can store each batch of samples' code, + path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like + structure and each element in this array is indexes in parent nodes' Weight Matrix. + path_code: (Variable|None) this variable can store each batch of samples' code, each code consist with every code of parent nodes. it should be in leaf -> root order - is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is + is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is set you need to set path_table/path_code/num_classes, otherwise num_classes should be set - is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient + is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient of W and input will be sparse. 
Returns: @@ -5485,11 +5500,11 @@ def softmax_with_cross_entropy(logits, .. math:: - max_j = \\max_{i=0}^{K}{\\text{logit}_i} + max_j &= \\max_{i=0}^{K}{\\text{logit}_i} - log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j) - softmax_j = \\exp(logit_j - max_j - {log\\_max\\_sum}_j) + softmax_j &= \\exp(logit_j - max_j - {log\\_max\\_sum}_j) and then cross entropy loss is calculated by softmax and label. @@ -5515,11 +5530,11 @@ def softmax_with_cross_entropy(logits, along with the cross entropy loss. Default: False Returns: - Variable or Tuple of two Variables: Return the cross entropy loss if - `return_softmax` is False, otherwise the tuple - (loss, softmax), where the cross entropy loss is - a 2-D tensor with shape [N x 1], and softmax is a - 2-D tensor with shape [N x K]. + Variable or Tuple of two Variables: Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), where the cross entropy loss is \ + a 2-D tensor with shape [N x 1], and softmax is a \ + 2-D tensor with shape [N x K]. Examples: .. code-block:: python @@ -5792,21 +5807,27 @@ def squeeze(input, axes, name=None): the single dimensions will be removed from the shape. If an axis is selected with shape entry not equal to one, an error is raised. - Examples: - Case 1: - Given - X.shape = (1, 3, 1, 5) - and - axes = [0] - we get: - Out.shape = (3, 1, 5) - Case 2: - Given - X.shape = (1, 3, 1, 5) - and - axes = [] - we get: - Out.shape = (3, 5) + For example: + + .. code-block:: text + + Case 1: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [0] + we get: + Out.shape = (3, 1, 5) + + Case 2: + + Given + X.shape = (1, 3, 1, 5) + and + axes = [] + we get: + Out.shape = (3, 5) Args: input (Variable): The input variable to be squeezed. @@ -5842,6 +5863,9 @@ def unsqueeze(input, axes, name=None): Dimension indices in axes are as seen in the output tensor. For example: + + .. 
code-block:: text + Given a tensor such that tensor with shape [3, 4, 5], then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. @@ -6729,8 +6753,11 @@ def sequence_scatter(input, index, updates, name=None): the columns to update in each row of X. Here is an example: + Given the following input: + .. code-block:: text + input.data = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]] @@ -6743,7 +6770,9 @@ def sequence_scatter(input, index, updates, name=None): updates.lod = [[ 0, 3, 8, 12]] Then we have the output: + .. code-block:: text + out.data = [[1.3, 1.3, 1.4, 1.0, 1.0, 1.0], [1.0, 1.0, 1.4, 1.3, 1.2, 1.1], [1.0, 1.0, 1.3, 1.2, 1.4, 1.1]] @@ -6759,7 +6788,7 @@ def sequence_scatter(input, index, updates, name=None): name (str|None): The output variable name. Default None. Returns: - output (Variable): The output is a tensor with the same shape as input. + Variable: The output is a tensor with the same shape as input. Examples: @@ -6933,7 +6962,7 @@ def mean_iou(input, label, num_classes): .. math:: - IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}. + IOU = \\frac{true\_positive}{(true\_positive + false\_positive + false\_negative)}. The predictions are accumulated in a confusion matrix and mean-IOU is then calculated from it. @@ -6946,9 +6975,13 @@ def mean_iou(input, label, num_classes): num_classes (int): The possible number of labels. Returns: - mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1]. - out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class. - out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class. + mean_iou (Variable),out_wrong(Variable),out_correct(Variable): + + Three variables: + + - mean_iou : A Tensor representing the mean intersection-over-union with shape [1]. + - out_wrong: A Tensor with shape [num_classes]. 
The wrong numbers of each class. + - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class. Examples: @@ -7143,8 +7176,8 @@ def affine_grid(theta, out_shape, name=None): Args: theta (Variable): A batch of affine transform parameters with shape [N, 2, 3]. - out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. - out_shape can be a Variable or a list or tuple. + out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W]. + ``out_shape`` can be a Variable or a list or tuple. name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -7157,6 +7190,7 @@ def affine_grid(theta, out_shape, name=None): Examples: .. code-block:: python + theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32") out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32") data = fluid.layers.affine_grid(theta, out_shape) @@ -7192,9 +7226,10 @@ def affine_grid(theta, out_shape, name=None): def rank_loss(label, left, right, name=None): """ + **Rank loss layer for RankNet** - RankNet(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf) + `RankNet `_ is a pairwise ranking model with a training sample consisting of a pair of documents, A and B. Label P indicates whether A is ranked higher than B or not: @@ -7202,16 +7237,19 @@ def rank_loss(label, left, right, name=None): P = {0, 1} or {0, 0.5, 1}, where 0.5 means that there is no information about the rank of the input pair. - Rank loss layer takes three inputs: left (o_i), right (o_j) and - label (P_{i,j}). The inputs respectively represent RankNet's output scores + Rank loss layer takes three inputs: left ( :math:`o_i` ), right ( :math:`o_j` ) and + label ( :math:`P_{i,j}` ). The inputs respectively represent RankNet's output scores for documents A and B and the value of label P. 
The following equation computes rank loss C_{i,j} from the inputs: - $$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ - o_{i,j} = o_i - o_j \\ - \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} - $$ + .. math:: + + C_{i,j} &= -\\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\\\ + + o_{i,j} &= o_i - o_j \\\\ + + \\tilde{P_{i,j}} &= \\left \{0, 0.5, 1 \\right \} \ or \ \\left \{0, 1 \\right \} + Rank loss layer takes batch inputs with size batch_size (batch_size >= 1). @@ -7237,7 +7275,6 @@ def rank_loss(label, left, right, name=None): right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") out = fluid.layers.rank_loss(label, left, right) - """ helper = LayerHelper('rank_loss', **locals()) @@ -7269,7 +7306,7 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): .. math:: - rank\_loss &= max(0, -label * (left - right) + margin) + rank\_loss = max(0, -label * (left - right) + margin) Args: label (Variable): Indicates whether the left is ranked higher than the right or not. @@ -7278,12 +7315,17 @@ def margin_rank_loss(label, left, right, margin=0.1, name=None): margin (float): Indicates the given margin. name (str|None): A name for this layer (optional). If set None, the layer will be named automatically. + Returns: Variable: The ranking loss. + Raises: ValueError: Any of label, left, and right is not a Variable. + Examples: + .. code-block:: python + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") @@ -7587,7 +7629,8 @@ def prelu(x, mode, param_attr=None, name=None): """ Equation: - y = \max(0, x) + alpha * \min(0, x) + .. math:: + y = \max(0, x) + \\alpha * \min(0, x) Args: x (Variable): The input tensor. @@ -7653,8 +7696,8 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): .. 
code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) """ helper = LayerHelper('brelu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7683,8 +7726,8 @@ def leaky_relu(x, alpha=0.02, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.leaky_relu(x, alpha=0.01) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.leaky_relu(x, alpha=0.01) """ helper = LayerHelper('leaky_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7712,8 +7755,8 @@ def soft_relu(x, threshold=40.0, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") - y = fluid.layers.soft_relu(x, threshold=20.0) + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") + y = fluid.layers.soft_relu(x, threshold=20.0) """ helper = LayerHelper('soft_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7729,23 +7772,32 @@ def flatten(x, axis=1, name=None): """ **Flatten layer** Flattens the input tensor into a 2D matrix. + + For Example: + + .. code-block:: text - Examples: - Case 1: - Given - X.shape = (3, 100, 100, 4) - and - axis = 2 - We get: - Out.shape = (3 * 100, 4 * 100) - - Case 2: - Given - X.shape = (3, 100, 100, 4) - and - axis = 0 - We get: - Out.shape = (1, 3 * 100 * 100 * 4) + Case 1: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 2 + + We get: + Out.shape = (3 * 100, 4 * 100) + + Case 2: + + Given + X.shape = (3, 100, 100, 4) + + and + axis = 0 + + We get: + Out.shape = (1, 3 * 100 * 100 * 4) Args: x (Variable): A tensor of rank >= axis. 
@@ -7759,9 +7811,9 @@ def flatten(x, axis=1, name=None): will be named automatically. Returns: - Variable: A 2D tensor with the contents of the input tensor, with input - dimensions up to axis flattened to the outer dimension of - the output and remaining input dimensions flattened into the + Variable: A 2D tensor with the contents of the input tensor, with input \ + dimensions up to axis flattened to the outer dimension of \ + the output and remaining input dimensions flattened into the \ inner dimension of the output. Raises: @@ -7801,19 +7853,23 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - Examples: - Case 1: - Input: - X.lod = [[0, 3, 5]] - X.data = [[1], [2], [3], [4], [5]] - X.dims = [5, 1] - Attrs: - win_size = 2 - pad_value = 0 - Output: - Out.lod = [[0, 3, 5]] - Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] - Out.dims = [5, 2] + .. code-block:: text + + Case 1: + + Input: + X.lod = [[0, 3, 5]] + X.data = [[1], [2], [3], [4], [5]] + X.dims = [5, 1] + + Attrs: + win_size = 2 + pad_value = 0 + + Output: + Out.lod = [[0, 3, 5]] + Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]] + Out.dims = [5, 2] Args: input (Variable): The input variable which is a index sequence. @@ -8896,6 +8952,7 @@ def similarity_focus(input, axis, indexes, name=None): SimilarityFocus Operator Generate a similarity focus mask with the same shape of input using the following method: + 1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding to the axis according to the indexes. For example, if axis=1 and indexes=[a], it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X @@ -8969,14 +9026,16 @@ def similarity_focus(input, axis, indexes, name=None): indexes(list): Indicating the indexes of the selected dimension. 
Returns: - Variable: A tensor variable with the same shape and same type - as the input. + Variable: A tensor variable with the same shape and same type \ + as the input. Examples: .. code-block:: python + data = fluid.layers.data( name='data', shape=[2, 3, 2, 2], dtype='float32') x = fluid.layers.layer_norm(input=data, axis=1, indexes=[0]) + """ helper = LayerHelper('similarity_focus', **locals()) # check attrs @@ -9055,6 +9114,7 @@ def hash(input, hash_size, num_hash=1, name=None): Examples: .. code-block:: python + word_dict = paddle.dataset.imdb.word_dict() x = fluid.layers.data(shape[1], dtype='int32', lod_level=1) out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) @@ -9075,50 +9135,52 @@ def hash(input, hash_size, num_hash=1, name=None): def grid_sampler(x, grid, name=None): """ This operation samples input X by using bilinear interpolation based on - flow field grid, which is usually gennerated by affine_grid. The grid of + flow field grid, which is usually gennerated by :code:`affine_grid` . The grid of shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates with shape [N, H, W] each, where grid_x is indexing the 4th dimension (in width dimension) of input data x and grid_y is indexng the 3rd dimention (in height dimension), finally results is the bilinear interpolation value of 4 nearest corner points. - Step 1: - Get (x, y) grid coordinates and scale to [0, H-1/W-1]. + .. code-block:: text + + Step 1: + Get (x, y) grid coordinates and scale to [0, H-1/W-1]. - grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) - grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) + grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1) + grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) - Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear - interpolate point value by 4 nearest points. + Step 2: + Indices input data X with grid (x, y) in each [H, W] area, and bilinear + interpolate point value by 4 nearest points. 
- wn ------- y_n ------- en - | | | - | d_n | - | | | - x_w --d_w-- grid--d_e-- x_e - | | | - | d_s | - | | | - ws ------- y_s ------- wn + wn ------- y_n ------- en + | | | + | d_n | + | | | + x_w --d_w-- grid--d_e-- x_e + | | | + | d_s | + | | | + ws ------- y_s ------- wn - x_w = floor(x) // west side x coord - x_e = x_w + 1 // east side x coord - y_n = floor(y) // north side y coord - y_s = y_s + 1 // south side y coord + x_w = floor(x) // west side x coord + x_e = x_w + 1 // east side x coord + y_n = floor(y) // north side y coord + y_s = y_s + 1 // south side y coord - d_w = grid_x - x_w // distance to west side - d_e = x_e - grid_x // distance to east side - d_n = grid_y - y_n // distance to north side - d_s = y_s - grid_y // distance to south side + d_w = grid_x - x_w // distance to west side + d_e = x_e - grid_x // distance to east side + d_n = grid_y - y_n // distance to north side + d_s = y_s - grid_y // distance to south side - wn = X[:, :, y_n, x_w] // north-west point value - en = X[:, :, y_n, x_e] // north-east point value - ws = X[:, :, y_s, x_w] // south-east point value - es = X[:, :, y_s, x_w] // north-east point value + wn = X[:, :, y_n, x_w] // north-west point value + en = X[:, :, y_n, x_e] // north-east point value + ws = X[:, :, y_s, x_w] // south-east point value + es = X[:, :, y_s, x_w] // north-east point value - output = wn * d_e * d_s + en * d_w * d_s - + ws * d_e * d_n + es * d_w * d_n + output = wn * d_e * d_s + en * d_w * d_s + + ws * d_e * d_n + es * d_w * d_n Args: x(Variable): Input data of shape [N, C, H, W]. @@ -9126,16 +9188,18 @@ def grid_sampler(x, grid, name=None): name (str, default None): The name of this layer. Returns: - out(Variable): Output of shape [N, C, H, W] data samples input X + Variable: Output of shape [N, C, H, W] data samples input X using bilnear interpolation based on input grid. - Exmples: - .. code-block:: python + Examples: + + .. 
code-block:: python + + x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') + theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') + grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) + out = fluid.layers.grid_sampler(x=x, grid=grid) - x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') - theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') - grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) - out = fluid.layers.grid_sampler(x=x, grid=grid) """ helper = LayerHelper("grid_sampler", **locals()) @@ -9203,19 +9267,19 @@ def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** - This layer accepts an input 3D-Tensor of shape [N x M x P], and return an + This layer accepts an input 3D-Tensor of shape [N x M x P], and returns an output Tensor of shape [N x M x P] with positional encoding value. - Refer to `Attention Is All You Need`_ . + Refer to `Attention Is All You Need `_ . .. math:: - PE(pos, 2i) = \\sin{(pos / 10000^{2i / P})} \\\\ - PE(pos, 2i + 1) = \\cos{(pos / 10000^{2i / P})} \\\\ - Out(:, pos, i) = \\alpha * input(:, pos, i) + \\beta * PE(pos, i) + PE(pos, 2i) &= \\sin{(pos / 10000^{2i / P})} \\\\ + PE(pos, 2i + 1) &= \\cos{(pos / 10000^{2i / P})} \\\\ + Out(:, pos, i) &= \\alpha * input(:, pos, i) + \\beta * PE(pos, i) Where: - * PE(pos, 2i): the increment for the number at even position - * PE(pos, 2i + 1): the increment for the number at odd position + - :math:`PE(pos, 2i)` : the increment for the number at even position + - :math:`PE(pos, 2i + 1)` : the increment for the number at odd position Args: input (Variable): 3-D input tensor with shape [N x M x P] @@ -9230,6 +9294,7 @@ def add_position_encoding(input, alpha, beta, name=None): .. 
code-block:: python position_tensor = fluid.layers.add_position_encoding(input=tensor) + """ helper = LayerHelper('add_position_encoding', **locals()) dtype = helper.input_dtype() @@ -9262,13 +9327,13 @@ def bilinear_tensor_product(x, For example: .. math:: - out{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x * W_{i} * {y^\mathrm{T}}, i=0,1,...,size-1 In this formula: - :math:`x`: the first input contains M elements, shape is [batch_size, M]. - :math:`y`: the second input contains N elements, shape is [batch_size, N]. - :math:`W_{i}`: the i-th learned weight, shape is [M, N] - - :math:`out{i}`: the i-th element of out, shape is [batch_size, size]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size, size]. - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. Args: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 49a486cf0c3d1..4399d96626b85 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -393,9 +393,6 @@ def fill_constant_batch_size_like(input, It also sets *stop_gradient* to True. - >>> data = fluid.layers.fill_constant_batch_size_like( - >>> input=like, shape=[1], value=0, dtype='int64') - Args: input(${input_type}): ${input_comment}. @@ -411,6 +408,14 @@ def fill_constant_batch_size_like(input, Returns: ${out_comment}. + + Examples: + + .. 
code-block:: python + + data = fluid.layers.fill_constant_batch_size_like( + input=like, shape=[1], value=0, dtype='int64') + """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_variable_for_type_inference(dtype=dtype) diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 85af8fea13d5b..fd07ff0ba3d21 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -361,8 +361,8 @@ class ChunkEvaluator(MetricBase): Accumulate counter numbers output by chunk_eval from mini-batches and compute the precision recall and F1-score using the accumulated counter numbers. - For some basics of chunking, please refer to - 'Chunking with Support Vector Machines '. + For some basics of chunking, please refer to + `Chunking with Support Vector Machines `_ . ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. @@ -391,6 +391,7 @@ def __init__(self, name=None): def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): """ Update the states based on the layers.chunk_eval() ouputs. + Args: num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch. num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch. @@ -450,9 +451,9 @@ class EditDistance(MetricBase): distance, instance_error = distance_evaluator.eval() In the above example: - 'distance' is the average of the edit distance in a pass. - 'instance_error' is the instance error rate in a pass. + - 'distance' is the average of the edit distance in a pass. + - 'instance_error' is the instance error rate in a pass. """ @@ -567,12 +568,15 @@ class DetectionMAP(object): Calculate the detection mean average precision (mAP). The general steps are as follows: + 1. calculate the true positive and false positive according to the input - of detection and labels. 
+ of detection and labels. 2. calculate mAP value, support two versions: '11 point' and 'integral'. Please get more information from the following articles: + https://sanchom.wordpress.com/tag/average-precision/ + https://arxiv.org/abs/1512.02325 Args: @@ -613,10 +617,12 @@ class DetectionMAP(object): for data in batches: loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) - In the above example: + In the above example: + + - 'cur_map_v' is the mAP of current mini-batch. + - 'accum_map_v' is the accumulative mAP of one pass. - 'cur_map_v' is the mAP of current mini-batch. - 'accum_map_v' is the accumulative mAP of one pass. + """ def __init__(self, diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index d21ec42dccde8..c128843885fbc 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,14 +125,23 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - Args: - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:/~https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + .. py:attribute:: slice_var_up (bool) + + Do Tensor slice for pservers, default is True. + + .. py:attribute:: split_method (PSDispatcher) + + RoundRobin or HashName can be used. + Try to choose the best method to balance loads for pservers. + + .. py:attribute:: min_block_size (int) + + Minimum number of splitted elements in block. 
+ + According to : /~https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + want to change it, please be sure you have read the slice_variable function. + """ slice_var_up = True From 6dd623b1e3e2211cb2b06fd864df4ba38a39304d Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 27 Dec 2018 12:53:48 +0800 Subject: [PATCH 076/103] test=develop --- .../fluid/tests/unittests/test_py_reader_using_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index d94494e219c5f..abc30874f6174 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -209,6 +209,7 @@ def main(self, else: thread = threading.Thread( target=feed_data, args=(feed_queue, reader)) + thread.daemon = True thread.start() self.outputs = [] From fe8495a7583b503a094168aae38a22843c96a72d Mon Sep 17 00:00:00 2001 From: chengduo Date: Wed, 26 Dec 2018 23:42:35 -0600 Subject: [PATCH 077/103] [WIP] Refine MultiDevSSAGraph (#15040) * refine parallel_exe test=develop * rename shared_var_device * code refine * add test_weight_decay * remove Sort test=develop * Add SortForReduce test=develop * code refine test=develop * follow comment test=develop --- .../details/multi_devices_graph_pass.cc | 405 +++++++++--------- .../details/multi_devices_graph_pass.h | 19 +- paddle/fluid/framework/ir/graph.cc | 58 --- paddle/fluid/framework/parallel_executor.cc | 5 +- python/paddle/fluid/parallel_executor.py | 4 +- .../tests/unittests/test_weight_decay.py | 188 ++++++++ 6 files changed, 401 insertions(+), 278 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_weight_decay.py diff --git 
a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 7e320a08942e4..5b9a81811728b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -42,6 +42,12 @@ namespace { typedef std::vector GraphOps; const char kGraphOps[] = "ops"; +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -147,6 +153,7 @@ void MultiDevSSAGraphBuilder::Init() const { #endif balance_vars_.resize(places_.size(), 0); + if (strategy_.enable_data_balance_ && places_.size() == 1) { LOG(WARNING) << "It is no need to enable data balance when there is only " "one place. enable_data_balance is set to False."; @@ -154,145 +161,16 @@ void MultiDevSSAGraphBuilder::Init() const { } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { - auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back(); - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); - - for (ir::Node *input : node->inputs) { - VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); - op_handle->AddInput(var); - } - - for (ir::Node *output : node->outputs) { - ir::Node *new_node = nullptr; - if (output->Var()) { - new_node = result->CreateVarNode(output->Var()); - } else { - new_node = - result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); - } - CreateOpOutput(result, op_handle, new_node, p, place_id); - } -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const std::vector &nodes) const { - std::vector send_vars; - // since parameters are all in block 0, - // 
it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find send op, - // instead of the the hard code string - if (op->Type() == "send") { - auto op_vars = op->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return send_vars; -} - -std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const std::vector &nodes) const { - std::vector recv_vars; - for (auto &node : nodes) { - OpDesc *op = node->Op(); - // TODO(Yancey1989): use a graceful method to find recv op, - // instead of the hard code string - if (op->Type() == "recv") { - auto op_vars = op->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - } - return recv_vars; -} - -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -// Topology sort the graph nodes from inputs to outputs. -// Since SSAGraphBuilder depends on forward/backward nodes to assign devices -// to parameter/gradients before optimizer ops, topo sort is insufficient. 
( -// some optimizer ops might not depend on any nodes), we manually move all -// optimizer nodes after last backward nodes. -// However, the assumption by SSAGraphBuilder should be relaxed in the future. -std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { - std::vector ret = ir::TopologySortOperations(graph); - size_t last_backward = 0; - for (size_t i = 0; i < ret.size(); ++i) { - if (boost::get( - ret[i]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kBackward)) { - last_backward = i; - } - } - - std::vector optimize_ops; - std::vector sorted_ret; - for (size_t i = 0; i < ret.size(); ++i) { - if (i < last_backward) { - if (static_cast(boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kOptimize))) { - optimize_ops.push_back(ret[i]); - } else { - sorted_ret.push_back(ret[i]); - } - } else if (i == last_backward) { - sorted_ret.push_back(ret[i]); - // Verify that no operations before optimize ops depends on optimize ops. - std::unordered_set optimize_set(optimize_ops.begin(), - optimize_ops.end()); - for (ir::Node *n : sorted_ret) { - for (ir::Node *in : n->inputs) { - for (ir::Node *pre_n : in->inputs) { - PADDLE_ENFORCE(optimize_set.find(pre_n) == optimize_set.end(), - "optimize operations cannot be depended by forward " - "or backward node %s -> %s", - pre_n->Name(), n->Name()); - } - } - } - sorted_ret.insert(sorted_ret.end(), optimize_ops.begin(), - optimize_ops.end()); - } else { - sorted_ret.push_back(ret[i]); - } - } - return sorted_ret; -} - std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { Init(); // Give the topology sort order and rebuild the graph structure. 
- std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); + std::vector sorted_ops = ir::TopologySortOperations(*graph); + + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + sorted_ops = SortForReduceMode(sorted_ops); + } + auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; @@ -303,31 +181,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( all_vars_.emplace(node->Name(), node->Var()); } } - std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - // find send/recv vars so that we can place the distributed training - // related op in the place 0 - auto send_vars = FindDistTrainSendVars(sorted_ops); - auto recv_vars = FindDistTrainRecvVars(sorted_ops); - std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); - size_t cur_device_id = 0; bool is_forwarding = true; bool is_dist_train = false; std::unordered_map sharded_var_device; for (ir::Node *node : sorted_ops) { - if (boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kRPC)) { + if (OpHaveRole(*node, OpRole::kRPC)) { int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); @@ -341,9 +210,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } is_dist_train = true; - } else if (boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kDist)) { + } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; @@ -365,7 +232,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. 
is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); + int op_dev_id = GetOpDeviceID(node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { @@ -385,47 +252,48 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward))) { - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } + try { + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); 
+ + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + size_t cur_device_id = -1; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(&result, g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); + if (!is_dist_train) { + bcast_var_name_set[cur_device_id].emplace(p_name); + } + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(&result, g_name, 0); + CreateBroadcastOp(&result, g_name, 0); + } else { + InsertAllReduceOp(&result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy "; + break; } - } catch (boost::bad_get e) { } + } catch (boost::bad_get e) { } } } @@ -469,12 +337,108 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +std::vector MultiDevSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::unordered_map sharded_var_device; + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. 
+ sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. 
+ } } - return false; + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + return sorted_ops; +} + +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; } void MultiDevSSAGraphBuilder::SetCommunicationContext( @@ -625,28 +589,52 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } int MultiDevSSAGraphBuilder::GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, + const std::unordered_map &sharded_var_device, + std::unordered_map> *delay_ops) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + + if (!OpHaveRole(*node, 
framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +int MultiDevSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - int op_role = boost::get( - node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); - if (op_role != static_cast(framework::OpRole::kOptimize)) { + + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); + int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); if (got == sharded_var_device.end()) { @@ -740,8 +728,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { @@ -752,8 +739,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp( sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = - GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); + op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { sharded_var_device->emplace(varname, op_dev_id); } @@ -794,8 +780,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = - GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -825,8 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = - GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -861,8 +845,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = - GetVarDeviceID(*result, output->Name(), *sharded_var_device); + outvar_dev_id = 
GetVarDeviceID(output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -879,6 +862,14 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } +bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; + } + return false; +} + bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 5736102ddc134..7029e9dc18cba 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -45,7 +45,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { #endif int GetVarDeviceID( - const ir::Graph &graph, const std::string &varname, + const std::string &varname, const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; @@ -57,12 +57,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass { ir::Graph *result, ir::Node *node, std::unordered_map *sharded_var_device) const; - std::vector FindDistTrainSendVars( - const std::vector &nodes) const; - - std::vector FindDistTrainRecvVars( - const std::vector &nodes) const; - void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; @@ -77,7 +71,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { int dev_id) const; int GetOpDeviceID( - const ir::Graph &graph, ir::Node *node, + ir::Node *node, const std::unordered_map &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -100,6 +94,15 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void SetCommunicationContext(OpHandleBase 
*op_handle, const platform::Place &p) const; + std::vector SortForReduceMode( + const std::vector &) const; + + int GetOpDeviceID( + ir::Node *node, + const std::unordered_map &shared_var_device, + std::unordered_map> *delay_ops) + const; + mutable std::string loss_var_name_; mutable std::vector places_; mutable std::vector local_scopes_; diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8670dcfed7e40..3eb5bdba3b727 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,66 +23,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -namespace { - -void CheckProgram(const ProgramDesc &program) { -#define _INT(role) static_cast(role) - - std::map visit; - for (OpDesc *op : program.Block(0).AllOps()) { - // For backward compatibility, some program doesn't have role added. - if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; - int role_id = - boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - visit[role_id] = true; - switch (role_id) { - case _INT(OpRole::kForward): - if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) << "Cannot add backward operator before forward operator " - << op->Type(); - } - break; - case _INT(OpRole::kBackward): - case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add backward operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kForward) | _INT(OpRole::kLoss): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | - _INT(OpRole::kLoss)) == visit.end(), - "Cannot add backward|loss operator before " - "forward|loss operator %s.", - op->Type()); - PADDLE_ENFORCE( - visit.find(_INT(OpRole::kOptimize)) == visit.end(), - "Cannot add forward|loss operator %s after optimize operator.", - op->Type()); - break; - case _INT(OpRole::kOptimize): - case 
_INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): - PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), - "Optimize operators %s must follow backward operator.", - op->Type()); - break; - case _INT(OpRole::kLRSched): - case _INT(OpRole::kDist): - case _INT(OpRole::kRPC): - case _INT(OpRole::kNotSpecified): - break; - default: - LOG(FATAL) << "Unknown operator role. Don't add new role because " - "you don't know what you are doing."; - } - } - -#undef _INT -} -} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { - CheckProgram(program_); auto var_nodes = InitFromProgram(program_); ResolveHazard(var_nodes); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a921f469f5e02..e14b74a87302a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -320,6 +320,7 @@ void ParallelExecutor::BCastParamsToDevices( if (paddle::platform::is_gpu_place(main_tensor.place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; + buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -353,9 +354,7 @@ void ParallelExecutor::BCastParamsToDevices( #endif } else { platform::CPUPlace cpu; - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (i == 0) continue; - + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 74cf76da951a4..c97a93ec36d4f 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -148,7 +148,7 @@ def __init__(self, trainers_endpoints), "num_trainers == len(end_points)" 
build_strategy.trainers_endpoints = trainers_endpoints - # step5: get persistable_vars, parameter_vars, places. persistable_vars + # step6: get persistable_vars, places. persistable_vars # need be broadcast to other local_scope. persistable_vars = set([ cpt.to_text(v.name) for v in [ @@ -164,7 +164,7 @@ def place_obj(place): places = list(map(place_obj, self._places)) - # step6: init ParallelExecutor + # step7: init ParallelExecutor self.executor = core.ParallelExecutor( places, persistable_vars, main.desc, cpt.to_text(loss_name) diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py new file mode 100644 index 0000000000000..f37d2bfb2e86b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -0,0 +1,188 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import contextlib + +import unittest +from functools import partial +import numpy as np +import paddle +import paddle.fluid.core as core + +import paddle.fluid as fluid + + +def get_places(): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + +@contextlib.contextmanager +def prog_scope_guard(main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from /~https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestWeightDecay(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=4)() + self.train_data = [next(reader) for _ in range(5)] + self.learning_rate = .5 + + def run_executor(self, place, feed_list, loss): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + main_prog = fluid.default_main_program() + loss_set = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=[loss.name]) + + 
print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def run_parallel_exe(self, + place, + feed_list, + loss, + use_cuda=True, + use_reduce=False, + use_fast_executor=False, + use_ir_memory_optimize=False): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + exec_strategy = fluid.ExecutionStrategy() + if use_fast_executor: + exec_strategy.use_experimental_executor = True + + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.memory_optimize = use_ir_memory_optimize + + parallel_exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss.name, + exec_strategy=exec_strategy, + build_strategy=build_strategy) + + loss_set = [] + for data in self.train_data: + out = parallel_exe.run(feed=feeder.feed(data), + fetch_list=[loss.name]) + print("loss %s" % (np.average(out))) + loss_set.append(np.average(out)) + + return loss_set + + def check_weight_decay(self, + place, + model, + use_parallel_exe=False, + use_reduce=False): + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + startup_prog.random_seed = 1 + with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog): + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + param_list = [(var, var * self.learning_rate) + for var in main_prog.block(0).all_parameters()] + + optimizer = fluid.optimizer.Adagrad( + learning_rate=self.learning_rate) + + optimizer.minimize(avg_cost) + + for params in param_list: + updated_p = fluid.layers.elementwise_sub( + x=params[0], y=params[1]) + fluid.layers.assign(input=updated_p, output=params[0]) + + if use_parallel_exe: + 
loss = self.run_parallel_exe( + place, [data, label], + loss=avg_cost, + use_cuda=True, + use_reduce=use_reduce) + else: + loss = self.run_executor(place, [data, label], loss=avg_cost) + + return loss + + def test_weight_decay(self): + model = partial(bow_net, is_sparse=False) + for place in get_places(): + loss = self.check_weight_decay(place, model, use_parallel_exe=False) + + loss2 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=False) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss2[i], rtol=5e-5) + + loss3 = self.check_weight_decay( + place, model, use_parallel_exe=True, use_reduce=True) + + for i in range(len(loss)): + assert np.isclose(a=loss[i], b=loss3[i], rtol=5e-5) + + +if __name__ == '__main__': + unittest.main() From efa630eadbfd60270ccd8dbe2f9951ef34541cde Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 27 Dec 2018 14:39:41 +0800 Subject: [PATCH 078/103] Refine Dockerfile (#14908) * Refine Dockerfile * Add tasks, cmake gen * Fix code error * Disable compile after paddle_build.sh * Refine * Skip on PY35 CI * Change env * Refine paddle_build.sh * Expose gen_fluid_lib * Refine mkldnn.cmake * Refine mkldnn.cmake * Refine mkldnnlib * Skip unstable tests --- Dockerfile | 76 +++++++++---------- cmake/external/mkldnn.cmake | 4 +- cmake/inference_lib.cmake | 2 +- paddle/scripts/paddle_build.sh | 18 +++-- .../test_image_classification_resnet.py | 12 +-- .../tests/unittests/test_dist_se_resnext.py | 15 ++++ 6 files changed, 73 insertions(+), 54 deletions(-) diff --git a/Dockerfile b/Dockerfile index 84e1edbee91b0..716b164ab84c1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -94,52 +94,52 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. 
-RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 install -U wheel && \ - pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 install -U wheel && \ - pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 --no-cache-dir install -U wheel && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 install opencv-python && \ - pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python + pip --no-cache-dir install -U pip setuptools wheel && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 
'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python #For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip3.6 install pylint pytest astroid isort -RUN pip3.7 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip3.6 install -r /root/requirements.txt -RUN pip3.7 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt # To fix /~https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip3.6 install certifi urllib3[secure] -RUN pip3.7 install certifi 
urllib3[secure] -RUN pip install certifi urllib3[secure] +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] # Install woboq_codebrowser to /woboq diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c29375cd05897..a9b99e9ab87c7 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -106,10 +106,10 @@ else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} - DEPENDS mkldnn) + DEPENDS mkldnn shared_mkldnn) endif(WIN32) ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) - +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) IF(WITH_C_API) INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib) ENDIF() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 48279bc809dde..3e11d332ff710 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -136,7 +136,7 @@ if (WITH_MKLDNN) copy(mkldnn_lib SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn + DEPS mkldnn_shared_lib ) endif () diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 418dc1346840f..1220f80100785 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- #================================================= # Utils #================================================= @@ -418,13 +417,6 @@ EOF else ctest --output-on-failure fi - - # make install should also be test when unittest - make install -j `nproc` - pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then - paddle version - fi fi } @@ -922,6 +914,7 @@ function main() { ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals ;; test_inference) gen_capi_package @@ -946,6 +939,15 @@ function main() { run_test assert_api_not_changed ${PYTHON_ABI:-""} ;; + cmake_gen) + cmake_gen ${PYTHON_ABI:-""} + ;; + gen_fluid_lib) + gen_fluid_lib + ;; + test_fluid_lib) + test_fluid_lib + ;; *) print_usage exit 0 diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index d744a00242422..e87c1d58c812f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -185,8 +185,10 @@ def main(use_cuda, parallel): if __name__ == '__main__': - for use_cuda in (False, True): - for parallel in (False, True): - if use_cuda and not core.is_compiled_with_cuda(): - continue - main(use_cuda=use_cuda, parallel=parallel) + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + if not on_ci: + for use_cuda in (False, True): + for parallel in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue + main(use_cuda=use_cuda, parallel=parallel) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2a4e5ca0c050..28602d3251a36 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -15,6 +15,18 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import os + + +def skip_ci(func): + on_ci = bool(int(os.environ.get("SKIP_UNSTABLE_CI", '0'))) + + def __func__(*args, **kwargs): + if on_ci: + return + return func(*args, **kwargs) + + return __func__ class TestDistSeResneXt2x2(TestDistBase): @@ -22,6 +34,7 @@ def _setup_config(self): self._sync_mode = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -32,6 +45,7 @@ def _setup_config(self): self._mem_opt = True self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) @@ -41,6 +55,7 @@ def _setup_config(self): self._sync_mode = False self._use_reader_alloc = False + @skip_ci def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) From ef7d563db9b0b058bb4ee12beb3cd94f3f1a61ce Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 27 Dec 2018 14:58:32 +0800 Subject: [PATCH 079/103] Add changes back test=develop --- .../framework/details/execution_strategy.h | 2 +- paddle/fluid/framework/scope.cc | 51 +++++++++++-------- paddle/fluid/framework/scope.h | 11 +++- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 15c496130c2b6..37b07e5736312 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -25,7 +25,7 @@ struct ExecutionStrategy { size_t num_threads_{0}; bool use_cuda_{true}; bool allow_op_delay_{false}; - size_t num_iteration_per_drop_scope_{100}; + size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; }; diff --git 
a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 750b626603178..a5742dbd3d66a 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -47,9 +47,15 @@ DEFINE_bool(fast_eager_deletion_mode, false, // the mutex will cause serious performance issue. // So the mutex is disabled when `ON_INFER`. #ifdef PADDLE_ON_INFERENCE -#define SCOPE_LOCK_GUARD +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK #else -#define SCOPE_LOCK_GUARD std::lock_guard lock(mutex_); +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); #endif namespace paddle { @@ -67,64 +73,69 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - SCOPE_LOCK_GUARD - kids_.push_back(new Scope(this)); - return *kids_.back(); + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; } Variable* Scope::Var(const std::string& name) { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK return VarInternal(name); } Variable* Scope::Var(std::string* name) { - SCOPE_LOCK_GUARD auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; } + SCOPE_VARS_WRITER_LOCK return VarInternal(new_name); } Variable* Scope::FindVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarInternal(name); } Variable* Scope::FindLocalVar(const std::string& name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindVarLocally(name); } const Scope* Scope::FindScope(const Variable* var) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_READER_LOCK return FindScopeInternal(var); } void 
Scope::DropKids() { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_READER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - SCOPE_LOCK_GUARD std::vector known_vars; - known_vars.reserve(this->vars_.size()); - for (auto& p : vars_) { - known_vars.emplace_back(p.first); + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } } return known_vars; } void Scope::DeleteScope(Scope* scope) const { - SCOPE_LOCK_GUARD + SCOPE_KIDS_WRITER_LOCK auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", this, scope); @@ -138,8 +149,8 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - SCOPE_LOCK_GUARD std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { it = vars_.erase(it); @@ -151,12 +162,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - SCOPE_LOCK_GUARD + SCOPE_VARS_WRITER_LOCK auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 794b8e4c9417f..f0915d2eee072 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -14,12 +14,18 @@ limitations under the License. 
*/ #pragma once +extern "C" { +#include +} + #include -#include // NOLINT +#include #include #include +#include #include +#include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" @@ -131,7 +137,8 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); private: - mutable std::mutex mutex_; + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; }; // Generate some debug string about the inherience structure of scope, quite From f7294f8b251a3907a872c9b7a5b3d02ecdfdbe76 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Thu, 27 Dec 2018 13:41:23 +0800 Subject: [PATCH 080/103] register float16 test=develop --- paddle/fluid/operators/fill_constant_op.cc | 3 ++- paddle/fluid/operators/fill_constant_op.cu.cc | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 73f38de08e3fc..c86430524e182 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -87,4 +87,5 @@ REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker, REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, - ops::FillConstantKernel); + ops::FillConstantKernel, + ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc index fba5583505afe..77027b5a87d4a 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -17,4 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, - ops::FillConstantKernel); + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel); From 26695e0bb2e57012ccd7f222658474d2435aa1ec Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 27 Dec 2018 13:36:18 +0800 Subject: [PATCH 081/103] add thread join test=develop --- .../fluid/tests/unittests/test_py_reader_using_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index abc30874f6174..559386545e730 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -220,6 +220,8 @@ def main(self, feed_queue.close() self.validate() + if not use_decorate_paddle_reader: + thread.join() def validate(self): self.assertEqual(len(self.inputs), len(self.outputs)) From 9a3a246cb5efa4693b31b44546451c7061fbf2c8 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 27 Dec 2018 09:23:22 +0000 Subject: [PATCH 082/103] fix py35 compile error test=develop --- paddle/fluid/framework/CMakeLists.txt | 1 + paddle/fluid/framework/array.h | 55 +++++++-- paddle/fluid/framework/ddim.h | 10 +- paddle/fluid/framework/unroll_array_ops.h | 22 +++- .../fluid/framework/unroll_array_ops_test.cc | 108 ++++++++++++++++++ 5 files changed, 175 insertions(+), 21 deletions(-) create mode 100644 paddle/fluid/framework/unroll_array_ops_test.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index c9ba478a099f4..79c00fd039b34 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -40,6 +40,7 @@ proto_library(async_executor_proto SRCS data_feed.proto) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS 
ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index aa0abc22a6bc9..b53082986882c 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -26,11 +26,12 @@ class Array { public: static constexpr size_t kSize = N; - HOSTDEVICE inline Array() = default; + HOSTDEVICE inline Array() {} template HOSTDEVICE inline explicit Array(const T &val, Args... args) { - UnrollVarArgsAssign::Run(data_, val, args...); + static_assert(N == sizeof...(Args) + 1, "Invalid argument"); + UnrollVarArgsAssign::Run(data_, val, args...); } HOSTDEVICE inline void Fill(const T &val) { @@ -41,10 +42,29 @@ class Array { HOSTDEVICE inline T *GetMutable() { return data_; } - HOSTDEVICE inline T &operator[](size_t index) { return data_[index]; } + HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); } - HOSTDEVICE inline const T &operator[](size_t index) const { - return data_[index]; + // Writing "return data_[i]" would cause compilation warning/error: + // "array subscript is above array bound" in Python 35 CI. + // It seems that it is a false warning of GCC if we do not check the bounds + // of array index. But for better performance, we do not check in operator[] + // like what is in STL. 
If users want to check the bounds, use at() instead + HOSTDEVICE inline const T &operator[](size_t i) const { + return *advance(data_, i); + } + + HOSTDEVICE inline T &at(size_t i) { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } + + HOSTDEVICE inline const T &at(size_t i) const { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; } HOSTDEVICE constexpr size_t size() const { return N; } @@ -58,6 +78,11 @@ class Array { } private: + template + HOSTDEVICE static inline U *advance(U *ptr, size_t i) { + return ptr + i; + } + T data_[N]; }; @@ -66,7 +91,7 @@ class Array { public: static constexpr size_t kSize = 0; - HOSTDEVICE inline Array() = default; + HOSTDEVICE inline Array() {} HOSTDEVICE inline void Fill(const T &val) {} @@ -75,18 +100,28 @@ class Array { // Add constexpr to GetMutable() cause warning in MAC HOSTDEVICE inline T *GetMutable() { return nullptr; } - HOSTDEVICE inline T &operator[](size_t index) { -#ifndef __CUDA_ARCH__ + HOSTDEVICE inline T &operator[](size_t) { +#ifdef __CUDA_ARCH__ + static T obj(); + return obj; +#else PADDLE_THROW("Array has no element"); #endif } - HOSTDEVICE inline const T &operator[](size_t index) const { -#ifndef __CUDA_ARCH__ + HOSTDEVICE inline const T &operator[](size_t) const { +#ifdef __CUDA_ARCH__ + static const T obj(); + return obj; +#else PADDLE_THROW("Array has no element"); #endif } + HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; } + + HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; } + HOSTDEVICE constexpr size_t size() const { return 0; } HOSTDEVICE constexpr bool operator==(const Array &other) const { diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 1fd3badbb27ba..28cb8171f623f 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -60,9 +60,7 @@ class DDim { DDim() : rank_(1) { dim_[0] = 0; } - DDim(const 
DDim& ddim) : dim_(), rank_(ddim.rank_) { - dynamic_dim_assign(ddim.dim_.Get(), dim_.GetMutable(), rank_); - } + DDim(const DDim& ddim) { CopyFrom(ddim); } DDim(const int* d, int n) : rank_(n) { dynamic_dim_assign(d, dim_.GetMutable(), n); @@ -80,10 +78,12 @@ class DDim { /*implicit*/ DDim(std::initializer_list init_list) : DDim(init_list.begin(), init_list.size()) {} + inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } + template - inline DDim& operator=(const Dim& in) { + inline DDim& operator=(const Dim& dim) { rank_ = D; - UnsafeCast() = in; + UnsafeCast() = dim; return *this; } diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h index fb0a89530f61a..731da74eff4d2 100644 --- a/paddle/fluid/framework/unroll_array_ops.h +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include "paddle/fluid/platform/hostdevice.h" @@ -52,21 +53,30 @@ struct UnrollAssign { }; template -struct UnrollVarArgsAssign { +struct UnrollVarArgsAssignImpl { template HOSTDEVICE inline static void Run(T *d, T val, Args... args) { static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); d[kStart] = val; - UnrollVarArgsAssign::Run(d, - args...); + UnrollVarArgsAssignImpl::Run( + d, args...); } }; template -struct UnrollVarArgsAssign { +struct UnrollVarArgsAssignImpl { HOSTDEVICE inline static void Run(T *d) {} }; +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, Args... 
args) { + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + template struct UnrollCompare { template @@ -150,8 +160,8 @@ using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; template using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; -template -using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; template using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc new file mode 100644 index 0000000000000..51433c83c8017 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/unroll_array_ops.h" +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +template +bool CheckEquality(const T* p, size_t n, T val) { + return std::all_of(p, p + n, [val](const T& v) { return v == val; }); +} + +template +bool FillConstantTestMain() { + static_assert(D1 >= D2, ""); + std::array arr; + arr.fill(0); + + UnrollFillConstant::Run(arr.data(), 1); + return CheckEquality(arr.data(), D2, 1) && + CheckEquality(arr.data() + D2, arr.size() - D2, 0); +} + +TEST(unroll_ops, fill_constant) { + EXPECT_TRUE((FillConstantTestMain<9, 0>())); + EXPECT_TRUE((FillConstantTestMain<9, 1>())); + EXPECT_TRUE((FillConstantTestMain<9, 4>())); + EXPECT_TRUE((FillConstantTestMain<9, 9>())); +} + +TEST(unroll_ops, assign) { + const int a[] = {1, 2, 3, 4, 5}; + int b[] = {0, 0, 0, 0, 0}; + UnrollAssign<3>::Run(a, b); + EXPECT_EQ(b[0], 1); + EXPECT_EQ(b[1], 2); + EXPECT_EQ(b[2], 3); + EXPECT_EQ(b[3], 0); + EXPECT_EQ(b[4], 0); +} + +TEST(unroll_ops, var_args_assign) { + int a[] = {0, 0, 0}; + UnrollVarArgsAssign::Run(a, 1, 2); + EXPECT_EQ(a[0], 1); + EXPECT_EQ(a[1], 2); + EXPECT_EQ(a[2], 0); +} + +TEST(unroll_ops, compare) { + int a[] = {1, 2, 3}; + int b[] = {1, 2, 4}; + EXPECT_TRUE(UnrollCompare<2>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<3>::Run(a, b)); + + b[0] = -1; + EXPECT_TRUE(UnrollCompare<0>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<1>::Run(a, b)); +} + +TEST(unroll_ops, add) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollAdd<2>::Run(a, b, c); + EXPECT_EQ(a[0] + b[0], c[0]); + EXPECT_EQ(a[1] + b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, mul) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + int c[] = {0, 0, 0}; + UnrollMul<2>::Run(a, b, c); + EXPECT_EQ(a[0] * b[0], c[0]); + EXPECT_EQ(a[1] * b[1], c[1]); + EXPECT_EQ(c[2], 0); +} + +TEST(unroll_ops, product) { + int a[] = {2, 3, 4}; + int b[] = {5, 10, 102}; + + EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] 
* a[1] * a[2]); + + EXPECT_EQ(UnrollProduct<3>::Run(a, b), + a[0] * b[0] + a[1] * b[1] + a[2] * b[2]); +} + +} // namespace framework +} // namespace paddle From e26cced7ccad46c3165b9c8dc2ee8831c0f5aa8d Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Thu, 27 Dec 2018 18:51:01 +0800 Subject: [PATCH 083/103] refine batch merge pass (#14777) * refine batch merge pass * refine batch merge pass test=develop --- .../framework/ir/multi_batch_merge_pass.cc | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index bd5b76426eb55..9e77f98e9efb2 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -75,6 +75,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( std::vector optimize_ops; std::vector lr_ops; // ops other than forward/backward/optimize std::unordered_set grad_names; + std::unordered_map gradname2paramname; std::vector nodes = TopologySortOperations(*graph); auto origin_nodes = graph->ReleaseNodes(); @@ -99,6 +100,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( auto op_role_vars = boost::get>(op_role_var); for (size_t i = 0; i < op_role_vars.size(); i += 2) { grad_names.insert(op_role_vars[i + 1]); + gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i]; } } else if (op_role & static_cast(framework::OpRole::kLRSched)) { lr_ops.push_back(node); @@ -109,7 +111,7 @@ std::unique_ptr BatchMergePass::ApplyImpl( // 2. copy forward backward ir::Node* prev_repeat_last_op_node = nullptr; - // record origin_grad -> repeated grad list map. + // record origin_grad -> repeated_grad_list map. 
std::map> grad_repeated_map; std::map> created; std::unordered_set bn_vars_need_rename; @@ -124,10 +126,16 @@ std::unique_ptr BatchMergePass::ApplyImpl( if (grad_names.find(outname) != grad_names.end()) { std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); repeated_op.RenameOutput(outname, new_gname); + // remove op_role_var for backward ops that outputs grad for a + // parameter. + repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + std::vector()); } } // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do - // not need this update + // not need this update, because only moving mean and variance should be + // differ, trainable parameter scale and bias is the same as other + // parameters. if (node->Name() == "batch_norm") { // NOTE: assume bn op created by layers use save var as output mean and // variance @@ -224,16 +232,25 @@ std::unique_ptr BatchMergePass::ApplyImpl( var->inputs.push_back(repeated_node); } } - } + } // end copy forward backward - // 5. create GRAD merge op node + // 5. create GRAD merge op node: sum(repeat.0...repeat.n) -> + // scale(1/num_repeats) for (auto kv : grad_repeated_map) { OpDesc sum_op; sum_op.SetType("sum"); std::vector repeated_grad_names; + std::vector param_grad_op_role_var; for (auto r : kv.second) { repeated_grad_names.push_back(r->Var()->Name()); } + // NOTE: use op_role_var to control allreduce op appending in + // multi_devices_graph_pass, we want to append op_role_var + // only once for the merged gradient, so break after first call. 
+ param_grad_op_role_var.push_back( + gradname2paramname.at(kv.first->Var()->Name())); // param + param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad + sum_op.SetInput("X", repeated_grad_names); sum_op.SetOutput("Out", {kv.first->Var()->Name()}); sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), @@ -256,6 +273,10 @@ std::unique_ptr BatchMergePass::ApplyImpl( scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kBackward)); + + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + param_grad_op_role_var); + auto scale_op_node = result.CreateOpNode(&scale_op); scale_op_node->inputs.push_back(sum_out_var_node); sum_out_var_node->outputs.push_back(scale_op_node); From c714c36482eda6d5eb1e0857a16146e04ae117d5 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 25 Dec 2018 20:53:55 -0800 Subject: [PATCH 084/103] simplify logic test=develop --- paddle/fluid/framework/CMakeLists.txt | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 21e640cdf2e4f..e7e06b179595a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -129,11 +129,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) - cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler ngraph) - endif(NOT WIN32) + cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) + cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator 
op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler) endif(WITH_NGRAPH) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) @@ -175,11 +173,7 @@ if(WITH_DISTRIBUTE) else() if(WITH_NGRAPH) - if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper) - else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) - endif(NOT WIN32) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(WITH_NGRAPH) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(WITH_NGRAPH) From 555fbc10d82f0e81810136ed8fcdb514b42dcfc2 Mon Sep 17 00:00:00 2001 From: baojun-nervana Date: Tue, 25 Dec 2018 20:55:40 -0800 Subject: [PATCH 085/103] upgrade ngraph to v0.10.1 test=develop --- cmake/external/ngraph.cmake | 5 ++--- paddle/fluid/framework/ngraph_operator.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index e66459fa3a150..9da657b7d78f2 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,13 +37,12 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_VERSION "0.9") -SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_GIT_TAG "v0.10.1") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) 
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "/~https://github.com/NervanaSystems/ngraph.git") diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 23f681ce886fd..57345f12ccc5d 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -399,7 +399,7 @@ void NgraphEngine::BuildNgFunction() { BuildNgNodes(); ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; - ngraph::op::ParameterVector func_inputs; + ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { func_outputs.push_back(var_node_map_->at(vo)); From a8bc05b5fff54a6083e6ee4aec08dbc1c36dbb5e Mon Sep 17 00:00:00 2001 From: Wu Yi Date: Fri, 28 Dec 2018 10:03:22 +0800 Subject: [PATCH 086/103] Refactor distributed RPC (#15075) * wip * wip * refactor no.1 dir structure test=develop * fix linking test=develop * fix includes test=develop * fix build test=develop * fix build test=develop --- paddle/fluid/framework/executor.cc | 2 +- .../operators/distributed/CMakeLists.txt | 66 +++++++++---------- .../distributed/{ => brpc}/brpc_client.cc | 4 +- .../distributed/{ => brpc}/brpc_client.h | 4 +- .../distributed/{ => brpc}/brpc_rdma_pool.cc | 2 +- .../distributed/{ => brpc}/brpc_rdma_pool.h | 0 .../{ => brpc}/brpc_sendrecvop_utils.cc | 8 +-- .../{ => brpc}/brpc_sendrecvop_utils.h | 2 +- .../distributed/{ => brpc}/brpc_serde_test.cc | 4 +- .../distributed/{ => brpc}/brpc_server.cc | 6 +- .../distributed/{ => brpc}/brpc_server.h | 2 +- .../{ => brpc}/brpc_variable_response.cc | 2 +- .../{ => brpc}/brpc_variable_response.h | 2 +- .../operators/distributed/collective_client.h | 2 +- .../operators/distributed/collective_server.h | 2 +- .../distributed/collective_server_test.cc | 2 +- 
.../macros.h => distributed/distributed.h} | 8 +-- .../operators/distributed/distributed_pb.h | 30 +++++++++ .../{ => grpc}/grpc_bytebuffer_stream.cc | 2 +- .../{ => grpc}/grpc_bytebuffer_stream.h | 0 .../distributed/{ => grpc}/grpc_client.cc | 4 +- .../distributed/{ => grpc}/grpc_client.h | 3 +- .../distributed/{ => grpc}/grpc_serde.cc | 6 +- .../distributed/{ => grpc}/grpc_serde.h | 3 +- .../distributed/{ => grpc}/grpc_serde_test.cc | 6 +- .../distributed/{ => grpc}/grpc_server.cc | 4 +- .../distributed/{ => grpc}/grpc_server.h | 5 +- .../distributed/{ => grpc}/grpc_service.h | 2 +- .../{ => grpc}/grpc_variable_response.cc | 2 +- .../{ => grpc}/grpc_variable_response.h | 6 +- .../distributed/parameter_prefetch.cc | 2 +- .../fluid/operators/distributed/rpc_server.cc | 4 +- .../operators/distributed/rpc_server_test.cc | 2 +- .../operators/distributed/send_recv.proto.in | 20 ------ .../operators/distributed/sendrecvop_utils.cc | 1 - .../operators/distributed/sendrecvop_utils.h | 2 +- .../operators/distributed/variable_response.h | 2 +- .../distributed_ops/checkpoint_notify_op.cc | 2 +- .../distributed_ops/fetch_barrier_op.cc | 2 +- .../distributed_ops/gen_nccl_id_op.cc | 2 +- .../distributed_ops/listen_and_serv_op.cc | 2 +- .../operators/distributed_ops/prefetch_op.cc | 2 +- .../operators/distributed_ops/recv_op.cc | 2 +- .../distributed_ops/send_barrier_op.cc | 2 +- .../operators/distributed_ops/send_op.cc | 2 +- .../distributed_ops/test_send_nccl_id.cc | 2 +- 46 files changed, 121 insertions(+), 121 deletions(-) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_client.cc (99%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_client.h (97%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_rdma_pool.cc (97%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_rdma_pool.h (100%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_sendrecvop_utils.cc (96%) rename paddle/fluid/operators/distributed/{ => 
brpc}/brpc_sendrecvop_utils.h (96%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_serde_test.cc (97%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_server.cc (98%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_server.h (95%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_variable_response.cc (96%) rename paddle/fluid/operators/distributed/{ => brpc}/brpc_variable_response.h (97%) rename paddle/fluid/operators/{detail/macros.h => distributed/distributed.h} (80%) create mode 100644 paddle/fluid/operators/distributed/distributed_pb.h rename paddle/fluid/operators/distributed/{ => grpc}/grpc_bytebuffer_stream.cc (96%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_bytebuffer_stream.h (100%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_client.cc (99%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_client.h (98%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_serde.cc (96%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_serde.h (93%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_serde_test.cc (97%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_server.cc (99%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_server.h (93%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_service.h (98%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_variable_response.cc (99%) rename paddle/fluid/operators/distributed/{ => grpc}/grpc_variable_response.h (89%) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 594fbb48a6d12..c93bbe7ceecce 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index eab4297c737bb..8a25d57e613ee 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -7,56 +7,52 @@ if(WITH_GRPC) else() set(cc_generic_services "true") endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +configure_file(send_recv.proto.in ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto @ONLY) +# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - if(WITH_GRPC) - grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) + grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${GRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(RPC_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr 
cares zlib protobuf) - cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) - - cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor - selected_rows_functor scope math_function SERIAL) - endif() + cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc + DEPS ${RPC_DEPS} scope profiler math_function SERIAL) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc - PROTO send_recv.proto + set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc/server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${BRPC_SRCS} + PROTO ${CMAKE_CURRENT_BINARY_DIR}/send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - - set(brpc_test_depends 
sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor - proto_desc lookup_sparse_table_op snappystream snappy zlib) - - cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + set(RPC_DEPS sendrecvop_rpc brpc ssl crypto protobuf leveldb snappystream snappy zlib) + cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc + DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) +endif() - cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) - if(WITH_GPU) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) - endif() +cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/brpc_client.cc rename to paddle/fluid/operators/distributed/brpc/brpc_client.cc index 62e32977b8cd7..87bdb83503783 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/distributed/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_client.h rename to paddle/fluid/operators/distributed/brpc/brpc_client.h index 80cc81bff3791..2066ade8a5621 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -31,10 +31,10 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.cc rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc index e1be5673dfbc5..d5c614001e0b2 100644 --- a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_BRPC_RDMA -#include 
"paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" #include "brpc/channel.h" #include "brpc/rdma/rdma_helper.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h similarity index 100% rename from paddle/fluid/operators/distributed/brpc_rdma_pool.h rename to paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc index e4604db3a3816..49e048f07a239 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc @@ -20,10 +20,10 @@ limitations under the License. 
*/ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h similarity index 96% rename from paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h rename to paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h index ffaf442224228..a5bdc331eb29c 100644 --- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/brpc_serde_test.cc rename to paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc index 2a2dc72150a32..b902d3db48778 100644 --- a/paddle/fluid/operators/distributed/brpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc similarity index 98% rename from paddle/fluid/operators/distributed/brpc_server.cc rename to paddle/fluid/operators/distributed/brpc/brpc_server.cc index 78d41aeac50a3..cbe0bd09c7b27 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and 
// limitations under the License. -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h similarity index 95% rename from paddle/fluid/operators/distributed/brpc_server.h rename to paddle/fluid/operators/distributed/brpc/brpc_server.h index 85a7ad0dfe843..78bbe5adc0813 100644 --- a/paddle/fluid/operators/distributed/brpc_server.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.h @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "brpc/server.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc similarity index 96% rename from paddle/fluid/operators/distributed/brpc_variable_response.cc rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc index 75306d72334ab..eb78917ad2d8b 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc @@ -13,7 +13,7 @@ // limitations under the License. 
// -#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h similarity index 97% rename from paddle/fluid/operators/distributed/brpc_variable_response.h rename to paddle/fluid/operators/distributed/brpc/brpc_variable_response.h index b0b91a42a01c7..6282f08a72536 100644 --- a/paddle/fluid/operators/distributed/brpc_variable_response.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h index 53b03c531a2b8..6a3a450a1fd2e 100644 --- a/paddle/fluid/operators/distributed/collective_client.h +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h index a23dc18b4de86..03c688a78e1cb 100644 --- a/paddle/fluid/operators/distributed/collective_server.h +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -23,7 
+23,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 0a9c69e393257..c5d18f7c60e4a 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/collective_client.h" #include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/distributed/distributed.h similarity index 80% rename from paddle/fluid/operators/detail/macros.h rename to paddle/fluid/operators/distributed/distributed.h index 6f4a15caa5542..3a9f922598757 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/distributed/distributed.h @@ -18,15 +18,15 @@ #ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer #define RPCCLIENT_T 
paddle::operators::distributed::GRPCClient #else // PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" #define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer #define RPCCLIENT_T paddle::operators::distributed::BRPCClient diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h new file mode 100644 index 0000000000000..f1c662be9af67 --- /dev/null +++ b/paddle/fluid/operators/distributed/distributed_pb.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_DISTRIBUTE + +#ifdef PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc index d192f54ee0c92..c2cb0d7f04eb6 100644 --- a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc @@ -17,7 +17,7 @@ limitations under the License. */ // file and did some modifications so that we can send gRPC // requests without too much copying of the tensor data. 
-#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h similarity index 100% rename from paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h rename to paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_client.cc rename to paddle/fluid/operators/distributed/grpc/grpc_client.cc index 8c54159a41e33..7875c16c3cf41 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_client.h rename to paddle/fluid/operators/distributed/grpc/grpc_client.h index 01bf46cc313b4..fa77d21257647 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -39,10 +39,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc similarity index 96% rename from paddle/fluid/operators/distributed/grpc_serde.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde.cc index a9dea9cfd2eea..6df4fd36f95b1 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -21,9 +21,9 @@ limitations under the License. 
*/ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_serde.h rename to paddle/fluid/operators/distributed/grpc/grpc_serde.h index 16f5293b0eb41..c9a57beb3a6a7 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h @@ -27,8 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc similarity index 97% rename from paddle/fluid/operators/distributed/grpc_serde_test.cc rename to paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc index 1936c2c623a77..749c1bf39a486 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc @@ -21,9 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_server.cc rename to paddle/fluid/operators/distributed/grpc/grpc_server.cc index cda102e78d2de..08f777e279e34 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -15,8 +15,8 @@ limitations under the 
License. */ #include #include -#include "paddle/fluid/operators/distributed/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc_server.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" using ::grpc::ServerAsyncResponseWriter; diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h similarity index 93% rename from paddle/fluid/operators/distributed/grpc_server.h rename to paddle/fluid/operators/distributed/grpc/grpc_server.h index d2524f5e65db6..2fd3a7a74073b 100644 --- a/paddle/fluid/operators/distributed/grpc_server.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h @@ -29,11 +29,10 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/grpc_service.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" #include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h similarity index 98% rename from paddle/fluid/operators/distributed/grpc_service.h rename to paddle/fluid/operators/distributed/grpc/grpc_service.h index 537429b5fe989..0b5c5151e637f 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -23,7 +23,7 @@ #include #include #include -#include 
"paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" // NOTE: This method was originally created by tensorflow diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc similarity index 99% rename from paddle/fluid/operators/distributed/grpc_variable_response.cc rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc index 76ad02b0300a5..87e83ca53bf13 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc @@ -19,7 +19,7 @@ #include #endif -#include "paddle/fluid/operators/distributed/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h similarity index 89% rename from paddle/fluid/operators/distributed/grpc_variable_response.h rename to paddle/fluid/operators/distributed/grpc/grpc_variable_response.h index 89df07c92cd33..3ca1d89f75031 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h @@ -22,13 +22,11 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include 
"paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" #include "paddle/fluid/operators/distributed/variable_response.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index cf14538b1c284..a96dec10866c0 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 122619d41b25d..cc5b9c29a12ec 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/distributed/rpc_server.h" + #include #include #include #include - -#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index c3dd459fc4e8c..089ea623f18a2 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/rpc_server.h" diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 2637619f304d2..b39eef04d8d1d 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -1,4 +1,3 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -18,13 +17,8 @@ package sendrecv; option cc_generic_services = @cc_generic_services@; service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} @@ -33,19 +27,12 @@ service SendRecvService { rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } -// VariableMessage is serialized paddle variable message. -// It can be: -// LoDTensor -// SelectedRows enum VarType { LOD_TENSOR = 0; SELECTED_ROWS = 1; NCCL_ID = 2; } -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. 
message VariableMessage { enum Type { // Pod Types @@ -62,21 +49,14 @@ message VariableMessage { string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: Type data_type = 3; repeated int64 dims = 4; - // lod details: int64 lod_level = 5; repeated LodData lod = 6; - // selected_rows height, aka. original dim0 int64 slr_height = 7; - // tensor data bytes serialized = 8; - // selected_rows data bytes rows = 9; - // Look up table block execution output variable name. string out_varname = 10; // If 1, the ps server will start profiling, the ps // server stops profiling and generates a profile to /tmp/profile_ps_* diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 25e2f77fb74f2..e5c96507e9726 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 6a87178be5daa..5457101a5c9f3 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" #include "paddle/fluid/platform/port.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index a4324f67bb99b..294cae5f44a47 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -25,7 +25,7 @@ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" DECLARE_string(rpc_server_profile_path); diff --git a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc index a3b5ff8d17602..a09bff351fc0c 100644 --- a/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/string/printf.h" diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index 8754856e140ed..7275ab201f471 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index ef574ccdf48dc..80d712a0e0275 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 20870ea07ebf6..629f364d71269 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "gflags/gflags.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" diff --git a/paddle/fluid/operators/distributed_ops/prefetch_op.cc b/paddle/fluid/operators/distributed_ops/prefetch_op.cc index 86425aba8c4a0..52b96d5f8ef78 100644 --- a/paddle/fluid/operators/distributed_ops/prefetch_op.cc +++ b/paddle/fluid/operators/distributed_ops/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 0399ff41007fb..48065437e38b2 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index 8ca2877d8adad..ae1b10c3b6c7b 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 0bf4bebbc9002..e2c2147ab5e9a 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc index a73cb08eca272..1598e1d0a47ef 100644 --- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc +++ b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" From ccc83bb4e5f2051ff03322a70590848e6a7594b2 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 28 Dec 2018 11:31:21 +0800 Subject: [PATCH 087/103] adaptive_pool support pool_size as int. 
test=develop --- python/paddle/fluid/layers/nn.py | 14 ++------------ python/paddle/fluid/tests/unittests/test_layers.py | 8 ++++++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cc1fdbd285611..236f1643ea572 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2570,12 +2570,7 @@ def adaptive_pool2d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: - raise ValueError( - "'pool_size' should be a list or tuple with length as 2.") + pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') if pool_type == "max": l_type = 'max_pool2d_with_index' @@ -2671,12 +2666,7 @@ def adaptive_pool3d(input, raise ValueError( "invalid setting 'require_index' true when 'pool_type' is 'avg'.") - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: - raise ValueError( - "'pool_size' should be a list or tuple with length as 3.") + pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') if pool_type == "max": l_type = 'max_pool3d_with_index' diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e180822c2b4b7..90f5d797a67d9 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -243,6 +243,10 @@ def test_adaptive_pool2d(self): pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool2d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, 3, require_index=True) + self.assertIsNotNone(pool) + 
self.assertIsNotNone(mask) def test_adaptive_pool3d(self): program = Program() @@ -255,6 +259,10 @@ def test_adaptive_pool3d(self): x, [3, 3, 3], require_index=True) self.assertIsNotNone(pool) self.assertIsNotNone(mask) + self.assertIsNotNone(layers.adaptive_pool3d(x, 3, pool_type='avg')) + pool, mask = layers.adaptive_pool3d(x, 3, require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) def test_lstm_unit(self): program = Program() From 8bd0b028e23d094636b2a7d96e4da609fb6a0d38 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 28 Dec 2018 04:17:01 +0000 Subject: [PATCH 088/103] disable data balance unittest test=develop --- python/paddle/fluid/tests/unittests/test_data_balance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py index aa19a5edc7814..9a6b7cf4765eb 100644 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ b/python/paddle/fluid/tests/unittests/test_data_balance.py @@ -194,4 +194,6 @@ def test_all(self): if __name__ == '__main__': - unittest.main() + # Disable data balance unittest, because data balance would be removed + # unittest.main() + pass From 813c2ce539dcd1f69d81a42d711a0d46e1faaf40 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 28 Dec 2018 14:42:04 +0800 Subject: [PATCH 089/103] fix timer test=develop --- paddle/fluid/platform/timer.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h index 9bb66eb97ad92..56019ae7cf21c 100644 --- a/paddle/fluid/platform/timer.h +++ b/paddle/fluid/platform/timer.h @@ -16,6 +16,13 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/port.h" +#ifdef _WIN32 +static unsigned sleep(unsigned seconds) { + Sleep(seconds * 1000); + return 0; +} +#endif + namespace paddle { namespace platform { From 6f0a1d7b47854e3a640a92f842c6262a38f34636 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 28 Dec 2018 15:14:25 +0800 Subject: [PATCH 090/103] Inception fusion operator. (#14968) * Inception fusion operator. * Support horizontal layer fusion in conv_fusion_op. * Search conv algo strategy for variable-length input. search N times and cache the searched algos. For other input, choose the algo of input whose area is closest to this input. --- cmake/operators.cmake | 2 +- paddle/fluid/operators/conv_cudnn_op_cache.h | 34 +++ paddle/fluid/operators/conv_fusion_op.cc | 62 +++- paddle/fluid/operators/conv_fusion_op.cu.cc | 103 +++++-- paddle/fluid/operators/fused/CMakeLists.txt | 4 +- .../fused/fusion_conv_inception_op.cc | 110 +++++++ .../fused/fusion_conv_inception_op.cu | 272 ++++++++++++++++++ python/paddle/fluid/__init__.py | 11 +- python/paddle/fluid/framework.py | 20 +- .../tests/unittests/test_conv2d_fusion_op.py | 41 ++- .../paddle/fluid/tests/unittests/testsuite.py | 4 +- 11 files changed, 604 insertions(+), 59 deletions(-) create mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cu diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 70d159b4f3549..59c40a0e5d18b 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. 
foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 92d394eb3c5ae..f172431e483f3 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -19,6 +19,10 @@ limitations under the License. */ #include #include "paddle/fluid/platform/cudnn_helper.h" +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); +DECLARE_int64(cudnn_exhaustive_search_times); + namespace paddle { namespace operators { @@ -45,6 +49,7 @@ static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; template class AlgorithmsCache { public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } // Caches the best algorithm for a given // combination of tensor dimensions & compute data type. 
TAlgorithm GetAlgorithm( @@ -54,9 +59,14 @@ class AlgorithmsCache { int algorithmFlags, // can set for different data type std::function gen_func); + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + private: std::unordered_map hash_; std::mutex mutex_; + + int search_times_; }; template @@ -107,5 +117,29 @@ TAlgorithm AlgorithmsCache::GetAlgorithm( return hash_[seed]; } +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc index 9bdedb10e0b1b..23b8087e781da 100644 --- a/paddle/fluid/operators/conv_fusion_op.cc +++ b/paddle/fluid/operators/conv_fusion_op.cc @@ -28,6 +28,8 @@ namespace operators { // x is Input, // z is ResidualData, // bias is Bias +// When `split_channels` is set, y will be splitted into multiple outputs, +// each output has split_channels[i] number of channels. class Conv2DFusionOpMaker : public Conv2DOpMaker { protected: void Apply() override { @@ -36,8 +38,65 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " "'relux' , 'tanh', 'band_pass'") .SetDefault("relu"); + AddAttr>( + "split_channels", + "When `split_channels` are set, there will be multiple outputs, the " + "output size is equal to the number of `split_channels`.") + .SetDefault({}); + AddOutput("Outputs", + "This Outputs is used when setting `split_channels`." 
+ "Usually used to fuse conv with same input and same filter size, " + "padding, stride, dilation size.") + .AsDuplicable() + .AsDispensable(); + AddInput("AlgoCache", + "The cache of convolution algorithm, a RAW type variable.") + .AsDispensable(); + AddAttr( + "search_times", + "The number of exhaustive search times for convolution algorithm.") + .SetDefault(-1); } }; + +class Conv2DFusionOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + + std::vector oshape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + ctx->SetOutputDim("Output", framework::make_ddim(oshape)); + std::vector channels = + ctx->Attrs().Get>("split_channels"); + if (channels.size()) { + PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), + "Output(Outputs) of ConvOp should not be null."); + std::vector oshapes; + oshapes.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]}); + } + ctx->SetOutputsDim("Outputs", oshapes); + } + } +}; + // TODO(qingqing): add gradient operator for conv2d_fusion } // namespace operators @@ -45,4 +104,5 @@ class Conv2DFusionOpMaker : public Conv2DOpMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(conv2d_fusion, 
ops::ConvOp, ops::Conv2DFusionOpMaker, - ops::ConvOpInferVarType, paddle::framework::EmptyGradOpMaker); + ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index e73762f5fb238..d8b997cca613f 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -16,8 +16,9 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); +DEFINE_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "defalut is 1, only search once."); namespace paddle { namespace operators { @@ -117,41 +118,60 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else { + auto search_func = [&]() { + int returned_algo_count; + std::array + fwd_perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = fwd_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; + } + return fwd_perf_stat[0].algo; + }; AlgorithmsCache* algo_cache = nullptr; - if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + int search_times = ctx.Attr("search_times"); + search_times = std::max( + 
static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + if (search_times > 0) { + // The searched algo will be cached by `search_times` times for + // different input dimension. For other dimensions, select the algo + // of closest area. + auto var_name = ctx.Inputs("AlgoCache")[0]; algo_cache = ctx.scope() - .FindVar(kCUDNNFwdAlgoCache) + .FindVar(var_name) ->GetMutable>(); + algo = algo_cache->GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); } else { - algo_cache = - const_cast(ctx.scope()) - .Var(kCUDNNFwdAlgoCache) - ->GetMutable>(); + // Cache searched algo in Var(kCUDNNFwdAlgoCache). + // all conv ops use the same kCUDNNFwdAlgoCache variable. + if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) { + algo_cache = + ctx.scope() + .FindVar(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } else { + // TODO(qingqing) remove const_cast + algo_cache = + const_cast(ctx.scope().parent()) + ->Var(kCUDNNFwdAlgoCache) + ->GetMutable>(); + } + algo = algo_cache->GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); } - algo = algo_cache->GetAlgorithm( - x_dims, f_dims, strides, paddings, dilations, 0, [&]() { - int returned_algo_count; - std::array - fwd_perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE( - platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( - handle, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, cudnn_output_desc, - output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count, - fwd_perf_stat.data(), cudnn_workspace, - workspace_size_limit)); - }; - workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit); - VLOG(3) << "Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = fwd_perf_stat[i]; - VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time - << " " << stat.memory; - } - return fwd_perf_stat[0].algo; - }); VLOG(3) << "choose algo " << algo; } @@ -195,6 +215,27 @@ class 
CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } + std::vector channels = ctx.Attr>("split_channels"); + if (channels.size()) { + auto outs = ctx.MultiOutput("Outputs"); + if (x_dims[0] == 1) { + // share data with Output + framework::Tensor t; + t.ShareDataWith(*output); + auto y_dims = output->dims(); + t.Resize({y_dims[1], y_dims[2], y_dims[3]}); + int s = 0; + for (size_t i = 0; i < channels.size(); ++i) { + int e = s + channels[i]; + outs[i]->ShareDataWith(t.Slice(s, e)); + outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]}); + s = e; + } + } else { + // TODO(qingiqng): do copy when batch size large than 1 + PADDLE_THROW("Batch size greater than 1 is Unsupported"); + } + } } }; #endif diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index a0397acab1267..2bddba7db2f1c 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,6 +1,8 @@ include(operators) -register_operators(EXCLUDES fusion_transpose_flatten_concat_op) +register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) + op_library(fusion_conv_inception_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") endif() diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cc b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc new file mode 100644 index 0000000000000..4690bd766d0b8 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class ConvInceptionFusionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + // 1 x + auto in_dims = ctx->GetInputDim("Input"); + // 4 filters + auto w_dims = ctx->GetInputsDim("Filter"); + + PADDLE_ENFORCE(in_dims.size(), 4, "Conv intput should be 4-D tensor."); + PADDLE_ENFORCE_EQ(w_dims.size(), 4, "There should be 4 filters"); + PADDLE_ENFORCE_EQ(w_dims[0][1], in_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1][1], in_dims[1]); + + int n = in_dims[0]; + // compute output channel + // 1st channel + int c = w_dims[0][0]; + // add 2nd channel + c += (w_dims[1][0] - w_dims[2][1] * 2); + // add 3rd channel + c += (w_dims[2][0] - w_dims[3][1]); + // add 4-th channel + c += w_dims[3][0]; + + int h = in_dims[2]; + int w = in_dims[3]; + + ctx->SetOutputDim("Output", {n, c, h, w}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Input")->type(), ctx.device_context()); + } +}; + +class ConvInceptionFusionOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + void Make() override { + AddInput("Input", "(Tensor) NCHW layout."); + AddInput("Filter", "(vector) 4 aggregated filters").AsDuplicable(); + AddInput("Bias", 
"(vector) it's lenght is equal to Filter") + .AsDuplicable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddOutput("TempOutput", "").AsDuplicable(); + AddAttr( + "pooling_type", + "(string), pooling type, can be \"max\" for max-pooling " + "and \"avg\" for average-pooling.") + .InEnum({"max", "avg"}); + AddAttr( + "exclusive", + "(bool, default True) When true, will exclude the zero-padding in the " + "averaging calculating, otherwise, include the zero-padding. Note, it " + "is only used when pooling_type is avg. The defalut is True.") + .SetDefault(true); + AddAttr( + "activation", + "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " + "'relux' , 'tanh', 'band_pass'") + .SetDefault("relu"); + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(4096); + AddComment(R"DOC( +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv2d_inception_fusion, ops::ConvInceptionFusionOp, + ops::ConvInceptionFusionOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu new file mode 100644 index 0000000000000..3349b0b31ebf6 --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +DECLARE_uint64(conv_workspace_size_limit); + +namespace paddle { +namespace operators { + +#if CUDNN_VERSION >= 7001 +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; +using DataLayout = platform::DataLayout; + +using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; +using PoolingMode = platform::PoolingMode; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +template +using CudnnDataType = platform::CudnnDataType; + +template +class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("Input"); + auto filters = ctx.MultiInput("Filter"); + auto bias = ctx.MultiInput("Bias"); + + auto* output = ctx.Output("Output"); + auto temp_outs = ctx.MultiOutput("TempOutput"); + + const std::string pool_type = ctx.Attr("pooling_type"); + const std::string activation = ctx.Attr("activation"); + const bool exclusive = ctx.Attr("exclusive"); + + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + + const T* input_data = 
input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + T* temp_data = temp_outs[0]->mutable_data(input->dims(), ctx.GetPlace()); + + DataLayout layout = DataLayout::kNCHW; + std::vector in_dim = framework::vectorize2int(input->dims()); + + // ------------------- cudnn descriptors --------------------- + PoolingMode pooling_mode; + if (pool_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : (PoolingMode::kAverageInclusive); + } + std::vector k0x0 = {0, 0}; + std::vector k1x1 = {1, 1}; + std::vector k1x1_2 = {1, 1}; + std::vector k3x3 = {3, 3}; + ScopedPoolingDescriptor pool_desc; + ScopedActivationDescriptor act_desc; + ScopedTensorDescriptor out_pool_desc; + ScopedTensorDescriptor input_desc; + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + + cudnnDataType_t cudnn_dtype = CudnnDataType::type; + cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; + cudnnFilterDescriptor_t* filter_desc = new cudnnFilterDescriptor_t[4]; + cudnnTensorDescriptor_t* bias_desc = new cudnnTensorDescriptor_t[4]; + cudnnTensorDescriptor_t* in_desc = new cudnnTensorDescriptor_t[4]; + cudnnConvolutionDescriptor_t* conv_desc = + new cudnnConvolutionDescriptor_t[4]; + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateFilterDescriptor(&filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&out_desc[i])); + CUDNN_ENFORCE( + 
platform::dynload::cudnnCreateConvolutionDescriptor(&conv_desc[i])); + } + + std::vector> filter_dims; + std::vector> bias_dims; + std::vector> in_dims; + std::vector> out_dims; + std::vector> in_strides; + std::vector> out_strides; + std::vector> bias_strides; + + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; + int n = in_dim[0]; + int h = in_dim[2]; + int w = in_dim[3]; + int oc = output->dims()[1]; + + cudnnDataType_t compute_type = (cudnn_dtype == CUDNN_DATA_DOUBLE) + ? CUDNN_DATA_DOUBLE + : CUDNN_DATA_FLOAT; + + for (int i = 0; i < 4; ++i) { + filter_dims.push_back(framework::vectorize2int(filters[i]->dims())); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); + bias_dims.push_back({1, filter_dims[i][0], 1, 1}); + bias_strides.push_back({filter_dims[i][0], 1, 1, 1}); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + bias_desc[i], cudnn_dtype, 4, bias_dims[i].data(), + bias_strides[i].data())); + in_dims.push_back({n, filter_dims[i][1], h, w}); + out_dims.push_back({n, filter_dims[i][0], h, w}); + in_strides.push_back({filter_dims[i][1] * h * w, h * w, w, 1}); + out_strides.push_back({oc * h * w, h * w, w, 1}); + + if (i < 2) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k0x0.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionNdDescriptor( + conv_desc[i], 2, k1x1.data(), k1x1.data(), k1x1.data(), + CUDNN_CROSS_CORRELATION, compute_type)); + } + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + conv_desc[i], CUDNN_DEFAULT_MATH)); + } + in_dims[2][1] *= 2; + in_strides[2][0] = oc * h * w; + out_strides[2][0] = filter_dims[2][0] * h * w; // this out is continuous. 
+ in_strides[3][0] = filter_dims[2][0] * h * w; + CUDNN_ENFORCE( + platform::dynload::cudnnSetConvolutionGroupCount(conv_desc[2], 2)); + + cudnnConvolutionFwdAlgo_t algo[4]; + auto handle = dev_ctx.cudnn_handle(); + size_t workspace_size_in_bytes = 0; // final workspace to allocate. + + size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::max(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; + } + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + in_desc[i], cudnn_dtype, 4, in_dims[i].data(), in_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + out_desc[i], cudnn_dtype, 4, out_dims[i].data(), + out_strides[i].data())); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &algo[i])); + size_t tmp_size = 0; + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, in_desc[i], filter_desc[i], conv_desc[i], out_desc[i], + algo[i], &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + } + cudnnActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + + int oc0 = filter_dims[0][0]; + int oc1 = filter_dims[1][0] - filter_dims[2][1] * 2; + int oc3 = filter_dims[3][0]; + int oc2 = oc - oc0 - oc1 - oc3; + + // branch1: pool + 1x1 conv + ScalingParamType alpha = 1.0f, beta = 0.0f; + CUDNN_ENFORCE(platform::dynload::cudnnPoolingForward( + handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta, + pool_out_desc, temp_data)); + + std::vector in_datas; + in_datas.push_back(static_cast(temp_data)); + in_datas.push_back(static_cast(input_data)); + in_datas.push_back( 
+ static_cast(output_data + (oc0 + oc1) * h * w)); + T* temp2_data = temp_outs[1]->mutable_data( + framework::make_ddim(out_dims[2]), ctx.GetPlace()); + in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); + + std::vector out_datas; + out_datas.push_back(static_cast(output_data)); + out_datas.push_back(static_cast(output_data + oc0 * h * w)); + out_datas.push_back(static_cast(temp2_data)); + out_datas.push_back( + static_cast(output_data + (oc0 + oc1 + oc2) * h * w)); + + for (int i = 0; i < 4; ++i) { + auto func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha, in_desc[i], in_datas[i], filter_desc[i], + static_cast(filters[i]->data()), conv_desc[i], + algo[i], cudnn_workspace, workspace_size_in_bytes, &beta, + out_desc[i], out_datas[i], bias_desc[i], + static_cast(bias[i]->data()), cudnn_act_desc, + out_desc[i], out_datas[i])); + }; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + workspace_handle.RunFunc(func, workspace_size_in_bytes); + } + + cudnnTensorDescriptor_t x_desc; + cudnnTensorDescriptor_t y_desc; + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&y_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[2].data())); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc, cudnn_dtype, 4, out_dims[3].data(), out_strides[3].data())); + CUDNN_ENFORCE(platform::dynload::cudnnTransformTensor( + handle, CudnnDataType::kOne(), x_desc, + static_cast(out_datas[2]), CudnnDataType::kZero(), + y_desc, static_cast(output_data + (oc0 + oc1) * h * w))); + + for (int i = 0; i < 4; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(out_desc[i])); + CUDNN_ENFORCE( + 
platform::dynload::cudnnDestroyFilterDescriptor(filter_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bias_desc[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyConvolutionDescriptor(conv_desc[i])); + } + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(x_desc)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(y_desc)); + } +}; +#endif + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7001 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, + ops::CUDNNConvInceptionFusionOpKernel, + ops::CUDNNConvInceptionFusionOpKernel); +#endif diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index e0078e53141ac..7433c2cbb6357 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -154,9 +154,14 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', - 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus' + 'fraction_of_gpu_memory_to_use', + 'cudnn_deterministic', + 'enable_cublas_tensor_op_math', + 'conv_workspace_size_limit', + 'cudnn_exhaustive_search', + 'memory_optimize_debug', + 'selected_gpus', + 'cudnn_exhaustive_search_times', ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 45e6a856f209d..921d59158f906 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -647,20 +647,16 @@ def find_name(var_list, name): self.desc.set_input(in_proto.name, []) if outputs is not None: - given = set() - need = set() - for n in outputs: - given.add(n) for m in proto.outputs: - need.add(m.name) - if not given == need: - raise ValueError(("Incorrect setting for output(s) of " - "operator \"%s\". 
Need: [%s] Given: [%s]") % - (type, - ", ".join(six.binary_type(e) for e in need), - ", ".join(six.binary_type(e) for e in given))) - + if (m.name not in outputs) and m.dispensable: + continue + if not ((m.name in outputs) or m.dispensable): + raise ValueError( + ("Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s].") % (type, m.name)) for out_proto in proto.outputs: + if out_proto.name not in outputs: + continue out_args = outputs[out_proto.name] if not isinstance(out_args, list): out_args = [out_args] diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 6cd71e39e41da..a27212f38f4e9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -32,6 +32,8 @@ def setUp(self): self.activation = 'relu' self.add_bias = True self.add_residual_data = True + self.channels = None + self.outputs = None self.init_group() self.init_dilation() @@ -49,8 +51,8 @@ def setUp(self): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output = conv2d_forward_naive(input, filter, self.groups, + conv2d_param).astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), @@ -58,19 +60,20 @@ def setUp(self): } if self.add_residual_data: - residual_data = np.random.random(output.shape).astype(self.dtype) + residual_data = np.random.random(self.output.shape).astype( + self.dtype) self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( residual_data) - output += residual_data + self.output += residual_data if self.add_bias: bias = np.random.random(self.filter_size[0]).astype(self.dtype) self.inputs['Bias'] = OpTest.np_dtype_to_fluid_dtype(bias) - output = output + bias.reshape((1, bias.size, 1, 1)) + 
self.output = self.output + bias.reshape((1, bias.size, 1, 1)) assert self.activation in ['relu', 'identity'] if self.activation == 'relu': - output = np.maximum(output, 0) + self.output = np.maximum(self.output, 0) self.attrs = { 'strides': self.stride, @@ -79,9 +82,12 @@ def setUp(self): 'dilations': self.dilations, 'data_format': self.data_format, 'exhaustive_search': self.exhaustive_search, - 'activation': self.activation + 'activation': self.activation, + 'split_channels': self.channels } - self.outputs = {'Output': output} + self.outputs = {'Output': self.output} + + self.set_outputs() def testcuda(self): return core.is_compiled_with_cuda() @@ -117,6 +123,9 @@ def init_activation(self): def set_search_method(self): self.exhaustive_search = False + def set_outputs(self): + pass + class TestWithoutResidual(TestConv2dFusionOp): def init_bias_residual(self): @@ -160,5 +169,21 @@ def set_search_method(self): self.exhaustive_search = True +class TestMultipleOutputs(TestConv2dFusionOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs['Outputs'] = [('out1', out1), ('out2', out2)] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index dc3b2cb8bc158..c4eb26893cd1f 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -137,9 +137,9 @@ def create_var(block, name, np_list, var_proto): var_dict = {} for var_proto in proto_list: var_name = str(var_proto.name) + if (var_name not in np_list) and var_proto.dispensable: + continue if is_input: - if (var_name not in 
np_list) and var_proto.dispensable: - continue assert (var_name in np_list) or (var_proto.dispensable), \ "Missing {} as input".format(var_name) if var_proto.duplicable: From ce70229ba6b67a9ed3d4a5a315e88a9c1e26389d Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 28 Dec 2018 15:45:05 +0800 Subject: [PATCH 091/103] Add max_body_size flags to brpc (#15084) --- .../distributed/collective_server_test.cc | 5 +- paddle/fluid/pybind/pybind.cc | 10 ++-- paddle/testing/paddle_gtest_main.cc | 53 +++++++++++++++---- python/paddle/fluid/__init__.py | 4 ++ 4 files changed, 57 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index c5d18f7c60e4a..46c761000c31e 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -52,12 +52,12 @@ std::unique_ptr GenerateVars(platform::Place place) { framework::Scope* scope = new framework::Scope(); framework::Variable* var = scope->Var("var1"); auto* slr = var->GetMutable(); - slr->set_height(1000); + slr->set_height(20000); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({3, 5})); + tensor->Resize(framework::make_ddim({20000, 1024})); tensor->mutable_data(place); paddle::operators::math::set_constant(ctx, tensor, 32.7); @@ -83,6 +83,7 @@ void Gather(const std::vector& vars, } TEST(PREFETCH, GPU) { + setenv("FLAGS_max_body_size", "2147483647", 1); platform::CUDAPlace place; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2ffdc90d8477f..d664107d57091 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -84,11 +84,15 @@ bool IsCompiledWithCUDA() { } bool IsCompiledWithBrpc() { -#if defined(PADDLE_WITH_BRPC) || 
defined(PADDLE_WITH_BRPC_RDMA) - return true; -#else +#ifndef PADDLE_WITH_DISTRIBUTE return false; #endif + +#ifdef PADDLE_WITH_GRPC + return false; +#endif + + return true; } bool IsCompiledWithDIST() { diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index ef43d13e18698..47c5248b57d19 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,20 +28,53 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } + + std::vector envs; + std::vector undefok; +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) + envs.push_back("max_body_size"); +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - new_argv.push_back( - strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); + envs.push_back("fraction_of_gpu_memory_to_use"); + envs.push_back("allocator_strategy"); #elif __clang__ - new_argv.push_back( - strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #else - new_argv.push_back( - strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" - "mb,allocator_strategy")); - new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); + envs.push_back("use_pinned_memory"); + envs.push_back("use_mkldnn"); + envs.push_back("initial_cpu_memory_in_mb"); + envs.push_back("allocator_strategy"); + + undefok.push_back("use_mkldnn"); + undefok.push_back("initial_cpu_memory_in_mb"); #endif + + if (envs.size() > 0) { + std::string env_string = "--tryfromenv="; + for (auto t : envs) { + env_string += t + ","; + } + env_string = env_string.substr(0, env_string.length() - 1); + 
new_argv.push_back(strdup(env_string.c_str())); + VLOG(1) << "gtest env_string:" << env_string; + } + + if (undefok.size() > 0) { + std::string undefok_string = "--undefok="; + for (auto t : undefok) { + undefok_string += t + ","; + } + undefok_string = undefok_string.substr(0, undefok_string.length() - 1); + new_argv.push_back(strdup(undefok_string.c_str())); + VLOG(1) << "gtest undefok_string:" << undefok_string; + } + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7433c2cbb6357..7a72670935da2 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -151,6 +151,10 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + if core.is_compiled_with_brpc(): + read_env_flags.append('max_body_size') + #set brpc max body size + os.environ['FLAGS_max_body_size'] = "2147483647" if core.is_compiled_with_cuda(): read_env_flags += [ From 8e271896ae14a4f86f255c74b60136ea5e0c705c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 04:07:10 +0000 Subject: [PATCH 092/103] add test data for seqpool1 --- .../fluid/inference/tests/api/CMakeLists.txt | 9 +- .../tests/api/analyzer_seq_pool1_tester.cc | 172 +++++++++++++----- 2 files changed, 127 insertions(+), 54 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9aa9db031cd46..e8da6255b3243 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -90,6 +90,11 @@ set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") 
inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) +# seq_pool1 +set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1") +download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model.tar.gz" "seq_pool1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) @@ -108,10 +113,6 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") -# seq_pool1 -inference_analysis_api_test_with_fake_data(test_analyzer_seq_pool1 -"${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1" analyzer_seq_pool1_tester.cc "seq_pool1.tar.gz") - # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 2ae840fd11f62..30ebfbebf385f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -20,6 +21,106 @@ namespace paddle { namespace inference { namespace analysis { +struct OneSlotInBatch { + std::string name; + std::vector> data; + std::vector shape; + std::vector lod; +}; + +struct DataRecord { + std::vector> batched_data; + std::map>> datasets; + size_t batch_iter{0}, num_samples; // total number of samples + + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + } + + void Load(const std::string &path) { + std::ifstream file(path); + constexpr int num_slots = 154; + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + std::vector slot_data; + split_to_float(data[1], ' ', &slot_data); + std::string name = data[0]; + PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0, + "line %d, %s should be divisible", num_lines, name); + datasets[name].emplace_back(std::move(slot_data)); + } + num_samples = num_lines / num_slots; + PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), + "num samples should be divisible"); + PADDLE_ENFORCE_GT(num_samples, 0); + } + + void Prepare(int bs) { + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + PADDLE_ENFORCE_EQ(it->second.size(), num_samples, + "size of each slot should be equal"); + } + size_t num_batches = num_samples / bs; + EXPECT_GT(num_batches, 0); + batched_data.resize(num_batches); + for (auto &one_batch : batched_data) { + one_batch.resize(datasets.size()); + size_t i = 0; + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + auto &slot = one_batch[i]; + slot.name = it->first; + slot.data.resize(bs); + slot.lod.resize(bs + 1); + slot.lod[0] = 0; + auto &lod = slot.lod; + auto &datas = it->second; + for (int k = 0; k < bs; ++k) { + size_t id = k + batch_iter * bs; + std::copy(datas[id].begin(), datas[id].end(), + 
std::back_inserter(slot.data[k])); + size_t len = datas[id].size() / 11; + PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), + "%s %d size should be divisible", slot.name, id); + lod[k + 1] = lod[k] + len; + } + slot.shape.assign({static_cast(lod[bs]), 11}); + i++; + } + } + } + + const std::vector &NextBatch() { + if (batch_iter >= batched_data.size() - 1) { + batch_iter = -1; + } + return batched_data[++batch_iter]; + } +}; + +static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) { + tensor->name = slot.name + "_embed"; + tensor->shape = slot.shape; + tensor->dtype = PaddleDType::FLOAT32; + tensor->lod.clear(); + tensor->lod.emplace_back(slot.lod); + TensorAssignData(tensor, slot.data); +} + +void PrepareInputs(std::vector *input_slots, DataRecord *data) { + const auto &one_batch = data->NextBatch(); + input_slots->resize(one_batch.size()); + for (size_t i = 0; i < one_batch.size(); ++i) { + auto &slot = one_batch[i]; + TensorAssignSlot(&((*input_slots)[i]), slot); + } +} + void SetConfig(AnalysisConfig *cfg) { cfg->param_file = FLAGS_infer_model + "/params"; cfg->prog_file = FLAGS_infer_model + "/model"; @@ -27,62 +128,22 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->pass_builder()->TurnOnDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { - std::vector feed_names = { - "slot10000_embed", "slot10001_embed", "slot10004_embed", - "slot10005_embed", "slot10008_embed", "slot10009_embed", - "slot10012_embed", "slot10013_embed", "slot10108_embed", - "slot13324_embed", "slot13325_embed", "slot13326_embed", - "slot13327_embed", "slot13328_embed", "slot13329_embed", - "slot13330_embed", "slot13331_embed", "slot15501_embed", - "slot15502_embed", "slot15503_embed", "slot15504_embed", - "slot15505_embed", "slot15506_embed", "slot15507_embed", - "slot15508_embed", "slot15516_embed", "slot15519_embed", - 
"slot15523_embed", "slot15531_embed", "slot15533_embed", - "slot15548_embed", "slot15564_embed", "slot15565_embed", - "slot15566_embed", "slot15570_embed", "slot15571_embed", - "slot15572_embed", "slot15573_embed", "slot15574_embed", - "slot15575_embed", "slot15576_embed", "slot15577_embed", - "slot15579_embed", "slot15581_embed", "slot15582_embed", - "slot15583_embed", "slot15584_embed", "slot5016_embed", - "slot5021_embed", "slot6002_embed", "slot6003_embed", - "slot6004_embed", "slot6005_embed", "slot6006_embed", - "slot6007_embed", "slot6008_embed", "slot6009_embed", - "slot6011_embed", "slot6014_embed", "slot6015_embed", - "slot6023_embed", "slot6024_embed", "slot6025_embed", - "slot6027_embed", "slot6029_embed", "slot6031_embed", - "slot6034_embed", "slot6035_embed", "slot6036_embed", - "slot6037_embed", "slot6039_embed", "slot6048_embed", - "slot6050_embed", "slot6058_embed", "slot6059_embed", - "slot6060_embed", "slot6066_embed", "slot6067_embed", - "slot6068_embed", "slot6069_embed", "slot6070_embed", - "slot6071_embed", "slot6072_embed", "slot6073_embed", - "slot6182_embed", "slot6183_embed", "slot6184_embed", - "slot6185_embed", "slot6186_embed", "slot6188_embed", - "slot6189_embed", "slot6190_embed", "slot6201_embed", - "slot6202_embed", "slot6203_embed", "slot6247_embed", - "slot6248_embed", "slot6250_embed", "slot6251_embed", - "slot6807_embed", "slot6808_embed", "slot6809_embed", - "slot6810_embed", "slot6811_embed", "slot6812_embed", - "slot6813_embed", "slot6814_embed", "slot6815_embed", - "slot6816_embed", "slot6817_embed", "slot6818_embed", - "slot6819_embed", "slot6820_embed", "slot6822_embed", - "slot6823_embed", "slot6826_embed", "slot7002_embed", - "slot7003_embed", "slot7004_embed", "slot7005_embed", - "slot7006_embed", "slot7008_embed", "slot7009_embed", - "slot7010_embed", "slot7011_embed", "slot7013_embed", - "slot7014_embed", "slot7015_embed", "slot7016_embed", - "slot7017_embed", "slot7019_embed", "slot7100_embed", - "slot7506_embed", 
"slot7507_embed", "slot7514_embed", - "slot7515_embed", "slot7516_embed"}; - SetFakeImageInput(inputs, FLAGS_infer_model, true, "model", "params", - &feed_names); + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1; + LOG(INFO) << "number of samples: " + << data.batched_data.size() * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data); + (*inputs).emplace_back(input_slots); + } } -// Easy for profiling independently. void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); @@ -100,6 +161,17 @@ void profile(bool use_mkldnn = false) { TEST(Analyzer_seq_pool1, profile) { profile(); } +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_pool1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + // Check the fuse status TEST(Analyzer_seq_pool1, fuse_statis) { AnalysisConfig cfg; From cd94df86793e1380d44a177eecb2cde90cc734e9 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 07:39:59 +0000 Subject: [PATCH 093/103] fix load and refine --- paddle/fluid/inference/api/analysis_predictor.cc | 2 +- paddle/fluid/inference/tests/api/analyzer_ner_tester.cc | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3f8feaaa1e9f9..6e3c0aa1e13a6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -251,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, input.set_lod(lod); int idx = -1; if (config_.specify_input_name) { - idx = feed_names_[inputs[i].name]; + idx = feed_names_.at(inputs[i].name); } else { idx = boost::get(feeds_[i]->GetAttr("col")); } 
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index f8635968cebc4..04f8b3ffe894c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -60,8 +60,7 @@ struct DataRecord { } }; -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { +void PrepareInputs(std::vector *input_slots, DataRecord *data) { PaddleTensor lod_word_tensor, lod_mention_tensor; lod_word_tensor.name = "word"; lod_mention_tensor.name = "mention"; @@ -100,7 +99,7 @@ void SetInput(std::vector> *inputs) { int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; for (int bid = 0; bid < epoch; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); + PrepareInputs(&input_slots, &data); (*inputs).emplace_back(input_slots); } } From 484085693e1c6ea88958d453d5c7473e89daee60 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 08:32:53 +0000 Subject: [PATCH 094/103] update url and num_ops test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 2 +- paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index e8da6255b3243..5038629aa471c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -92,7 +92,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1") -download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model.tar.gz" "seq_pool1_data.txt.tar.gz") +download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") 
inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) # ocr diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 30ebfbebf385f..1c251e0c22f1e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -181,7 +181,7 @@ TEST(Analyzer_seq_pool1, fuse_statis) { auto fuse_statis = GetFuseStatis( static_cast(predictor.get()), &num_ops); LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 314); + EXPECT_EQ(num_ops, 349); } } // namespace analysis From 33b7821a75c3514c9bc322a88e1845edd313fe63 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 28 Dec 2018 19:21:10 +0800 Subject: [PATCH 095/103] fix save and load ops on windows test=develop --- paddle/fluid/operators/load_combine_op.cc | 4 ++-- paddle/fluid/operators/load_op.cc | 2 +- paddle/fluid/operators/save_combine_op.cc | 2 +- paddle/fluid/operators/save_op.cc | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index e28d199eebc09..c03249644ad87 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -38,13 +38,13 @@ class LoadCombineOp : public framework::OperatorBase { static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); if (!model_from_memory) { - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load_combine op", filename); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } else { PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename); + std::stringstream fin(filename, std::ios::binary); LoadParamsFromBuffer(scope, place, &fin, 
load_as_fp16, out_var_names); } } diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 06773d1d0ed67..4bce4eba22e4a 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -34,7 +34,7 @@ class LoadOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. auto filename = Attr("file_path"); - std::ifstream fin(filename); + std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s for load op", filename); diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index a0b9fa305d85e..d0edcc170f0af 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -49,7 +49,7 @@ class SaveCombineOp : public framework::OperatorBase { } MkDirRecursively(DirName(filename).c_str()); - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index e1c9fd8ff1f08..fcc598f4f1613 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -80,7 +80,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. - std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); @@ -122,7 +122,7 @@ class SaveOp : public framework::OperatorBase { // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. 
- std::ofstream fout(filename); + std::ofstream fout(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fout), "Cannot open %s to write", filename); framework::SerializeToStream(fout, selectedRows, dev_ctx); From dca68cdf97c8408313aa461b968e1830016d70f2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 28 Dec 2018 16:05:23 +0000 Subject: [PATCH 096/103] throw error when name not find test=develop --- paddle/fluid/inference/api/analysis_predictor.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6e3c0aa1e13a6..5aceea7d01c93 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -251,7 +251,12 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, input.set_lod(lod); int idx = -1; if (config_.specify_input_name) { - idx = feed_names_.at(inputs[i].name); + auto name = inputs[i].name; + if (feed_names_.find(name) == feed_names_.end()) { + LOG(ERROR) << "feed names from program do not have name: " << name + << " from specified input"; + } + idx = feed_names_[name]; } else { idx = boost::get(feeds_[i]->GetAttr("col")); } From 8a83d6994e10580716f3eb76fdfebf6227a5f1f4 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Sat, 29 Dec 2018 02:02:35 +0000 Subject: [PATCH 097/103] delete data_balance unittest test=develop --- .../tests/unittests/test_data_balance.py | 197 ------------------ 1 file changed, 197 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/test_data_balance.py diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py deleted file mode 100644 index aa19a5edc7814..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_data_balance.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import paddle -import numpy as np - - -class TestDataBalance(unittest.TestCase): - def prepare_data(self): - def fake_data_generator(): - for n in range(self.total_ins_num): - yield np.ones((3, 4)) * n, n - - # Prepare data - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch( - fake_data_generator, batch_size=self.batch_size) - feeder = fluid.DataFeeder( - feed_list=[ - fluid.layers.data( - name='image', shape=[3, 4], dtype='float32'), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file( - self.data_file_name, reader, feeder) - - def prepare_lod_data(self): - def fake_data_generator(): - for n in range(1, self.total_ins_num + 1): - d1 = (np.ones((n, 3)) * n).astype('float32') - d2 = (np.array(n).reshape((1, 1))).astype('int32') - yield d1, d2 - - # Prepare lod data - with fluid.program_guard(fluid.Program(), fluid.Program()): - with fluid.recordio_writer.create_recordio_writer( - filename=self.lod_data_file_name) as writer: - eof = False - generator = fake_data_generator() - while (not eof): - data_batch = [ - np.array([]).reshape((0, 3)), np.array([]).reshape( - (0, 1)) - ] - lod = [0] - for _ in range(self.batch_size): - try: - ins = next(generator) - except StopIteration: - eof = 
True - break - for i, d in enumerate(ins): - data_batch[i] = np.concatenate( - (data_batch[i], d), axis=0) - lod.append(lod[-1] + ins[0].shape[0]) - if data_batch[0].shape[0] > 0: - for i, d in enumerate(data_batch): - t = fluid.LoDTensor() - t.set(data_batch[i], fluid.CPUPlace()) - if i == 0: - t.set_lod([lod]) - writer.append_tensor(t) - writer.complete_append_tensor() - - def setUp(self): - self.use_cuda = fluid.core.is_compiled_with_cuda() - self.data_file_name = './data_balance_test.recordio' - self.lod_data_file_name = './data_balance_with_lod_test.recordio' - self.total_ins_num = 50 - self.batch_size = 12 - self.prepare_data() - self.prepare_lod_data() - - def main(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.data_file_name], - shapes=[[-1, 3, 4], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - if self.use_cuda: - data_reader = fluid.layers.double_buffer(data_reader) - image, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if (parallel_exe.device_count > self.batch_size): - print("WARNING: Unittest TestDataBalance skipped. 
\ - For the result is not correct when device count \ - is larger than batch size.") - return - fetch_list = [image.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - image_val, label_val = parallel_exe.run(fetch_list, - return_numpy=True) - except fluid.core.EOFException: - break - ins_num = image_val.shape[0] - broadcasted_label = np.ones( - (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1)) - self.assertEqual(image_val.all(), broadcasted_label.all()) - for l in label_val: - self.assertFalse(data_appeared[l[0]]) - data_appeared[l[0]] = True - for i in data_appeared: - self.assertTrue(i) - - def main_lod(self): - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data_reader = fluid.layers.io.open_files( - filenames=[self.lod_data_file_name], - shapes=[[-1, 3], [-1, 1]], - lod_levels=[1, 0], - dtypes=['float32', 'int32']) - ins, label = fluid.layers.read_file(data_reader) - - place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - build_strategy = fluid.BuildStrategy() - build_strategy.enable_data_balance = True - parallel_exe = fluid.ParallelExecutor( - use_cuda=self.use_cuda, - main_program=main_prog, - build_strategy=build_strategy) - - if parallel_exe.device_count > self.batch_size: - print("WARNING: Unittest TestDataBalance skipped. 
\ - For the result is not correct when device count \ - is larger than batch size.") - exit(0) - fetch_list = [ins.name, label.name] - - data_appeared = [False] * self.total_ins_num - while (True): - try: - ins_tensor, label_tensor = parallel_exe.run( - fetch_list, return_numpy=False) - except fluid.core.EOFException: - break - - ins_val = np.array(ins_tensor) - label_val = np.array(label_tensor) - ins_lod = ins_tensor.lod()[0] - self.assertEqual(ins_val.shape[1], 3) - self.assertEqual(label_val.shape[1], 1) - self.assertEqual(len(ins_lod) - 1, label_val.shape[0]) - for i in range(0, len(ins_lod) - 1): - ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:] - label_elem = label_val[i][0] - self.assertEqual(ins_elem.all(), label_elem.all()) - self.assertFalse(data_appeared[int(label_elem - 1)]) - data_appeared[int(label_elem - 1)] = True - - for i in data_appeared: - self.assertTrue(i) - - def test_all(self): - self.main() - self.main_lod() - - -if __name__ == '__main__': - unittest.main() From 5d8f28139703cb80686def3a6993e4df5dec9008 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 29 Dec 2018 11:10:36 +0800 Subject: [PATCH 098/103] restore the memory mode test=develop --- paddle/fluid/operators/load_combine_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index c03249644ad87..691c5cc1a1d53 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -44,7 +44,7 @@ class LoadCombineOp : public framework::OperatorBase { LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } else { PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename, std::ios::binary); + std::stringstream fin(filename); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } } From b9c645639b73f701cafd753f2dbafd97312ceaa0 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: 
Sat, 29 Dec 2018 04:01:49 +0000 Subject: [PATCH 099/103] workaround with third party cache test=develop --- paddle/fluid/inference/tests/api/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5038629aa471c..a1a79c6885568 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -91,7 +91,7 @@ download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_c inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) # seq_pool1 -set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool1") +set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) From bf518ec8724e6209f029f9480f4163a9936c9229 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Sat, 29 Dec 2018 13:32:03 +0800 Subject: [PATCH 100/103] update CI rules for checking change of python reference (#15104) * test=develop * test=develop * test=develop * test=develop * test=develop --- paddle/scripts/paddle_build.sh | 14 +++++- tools/check_doc_approval.py | 85 ++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 tools/check_doc_approval.py diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 1220f80100785..d7ab36223c72c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -527,6 +527,18 @@ function assert_api_spec_approvals() { fi fi + pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl + CHECK_DOCK_MD5=`python ${PADDLE_ROOT}/tools/check_doc_approval.py` + if [ "True" != ${CHECK_DOCK_MD5} ]; then 
+ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have shanyi15 approval for the api doc change! " + exit 1 + fi + echo ${CHECK_DOCK_MD5} >/root/.cache/doc_md5.txt + fi } @@ -906,11 +918,11 @@ function main() { cmake_gen ${PYTHON_ABI:-""} build assert_api_not_changed ${PYTHON_ABI:-""} + assert_api_spec_approvals run_test gen_capi_package gen_fluid_lib test_fluid_lib - assert_api_spec_approvals ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} diff --git a/tools/check_doc_approval.py b/tools/check_doc_approval.py new file mode 100644 index 0000000000000..44fdf58b49a17 --- /dev/null +++ b/tools/check_doc_approval.py @@ -0,0 +1,85 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import ast +import hashlib +import importlib +import paddle.fluid + +files = [ + "paddle.fluid", "paddle.fluid.average", "paddle.fluid.backward", + "paddle.fluid.clip", "paddle.fluid.data_feeder", "paddle.fluid.executor", + "paddle.fluid.initializer", "paddle.fluid.io", "paddle.fluid.layers", + "paddle.fluid.metrics", "paddle.fluid.nets", "paddle.fluid.optimizer", + "paddle.fluid.profiler", "paddle.fluid.recordio_writer", + "paddle.fluid.regularizer", "paddle.fluid.transpiler" +] + + +def md5(doc): + hash = hashlib.md5() + hash.update(str(doc)) + return hash.hexdigest() + + +def get_module(): + for fi in files: + fi_lib = importlib.import_module(fi) + doc_function = getattr(fi_lib, "__all__") + for api in doc_function: + api_name = fi + "." + api + try: + doc_module = getattr(eval(api_name), "__doc__") + except: + pass + doc_md5_code = md5(doc_module) + doc_dict[api_name] = doc_md5_code + + +def doc_md5_dict(doc_md5_path): + with open(doc_md5_path, "rb") as f: + doc_md5 = f.read() + doc_md5_dict = ast.literal_eval(doc_md5) + return doc_md5_dict + + +def check_doc_md5(): + for k, v in doc_dict.items(): + try: + if doc_ci_dict[k] != v: + return doc_dict + except: + return doc_dict + return True + + +if __name__ == "__main__": + doc_dict = {} + doc_ci_dict = {} + doc_md5_file = "/root/.cache/doc_md5.txt" + if not os.path.exists(doc_md5_file): + os.mknod(doc_md5_file) + else: + doc_ci_dict = doc_md5_dict(doc_md5_file) + get_module() + if not os.path.getsize(doc_md5_file): + with open(doc_md5_file, 'w') as f: + f.write(str(doc_dict)) + check_dic = True + print(check_dic) + else: + check_dic = check_doc_md5() + print(check_dic) From b3688100adafb50117cb85a7c1190e64156b7dcf Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 29 Dec 2018 13:54:32 +0800 Subject: [PATCH 101/103] fix unittest test=develop --- paddle/fluid/operators/load_combine_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 691c5cc1a1d53..c4a2282e16483 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -44,7 +44,7 @@ class LoadCombineOp : public framework::OperatorBase { LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } else { PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); - std::stringstream fin(filename); + std::stringstream fin(filename, std::ios::in | std::ios::binary); LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); } } From 516fe301ee036bca4018a282072c1226dcd38b68 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Sat, 29 Dec 2018 07:28:00 +0000 Subject: [PATCH 102/103] add comment in case of empty name test=develop --- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5aceea7d01c93..3aaec10ee2d44 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -253,8 +253,8 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, if (config_.specify_input_name) { auto name = inputs[i].name; if (feed_names_.find(name) == feed_names_.end()) { - LOG(ERROR) << "feed names from program do not have name: " << name - << " from specified input"; + LOG(ERROR) << "feed names from program do not have name: [" << name + << "] from specified input"; } idx = feed_names_[name]; } else { From 5f9c88868b59eefdab8393d69d3b6fa3c1dddabb Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 2 Jan 2019 10:03:20 +0800 Subject: [PATCH 103/103] Upgrade ar version (#15109) --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index 716b164ab84c1..acfd091265e26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -149,6 +149,14 @@ 
RUN git clone /~https://github.com/woboq/woboq_codebrowser /woboq && \ -DCMAKE_BUILD_TYPE=Release . \ make) +# ar mishandles 4GB files +# https://sourceware.org/bugzilla/show_bug.cgi?id=14625 +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \ + tar -xzf binutils_2.27.orig.tar.gz && \ + cd binutils-2.27 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz + # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd RUN echo 'root:root' | chpasswd